You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

redirector.pl.in 16KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609
  1. #!/usr/bin/env perl
  2. use warnings;
  3. use strict;
  4. # Required ports:
  5. # www/p5-POE-Component-Client-HTTP
  6. # www/p5-POE-Component-Server-HTTP
  7. # dns/p5-POE-Component-Client-DNS
  8. # databases/p5-Cache-Memcached-Fast
  9. # devel/p5-Proc-Daemon
  10. # sysutils/p5-Proc-PidUtil
  11. # POE::Component::Client::HTTP uses HTTP::Request and response
  12. # objects.
  13. use POSIX qw(strftime);
  14. use HTTP::Request::Common qw(GET POST);
  15. use HTTP::Response;
  16. use URI::Escape qw(uri_unescape);
  17. use Sys::Syslog qw/:standard :macros setlogsock/;
  18. use HTML::HeadParser;
  19. use Digest;
  20. use Proc::Daemon;
  21. use Proc::PidUtil;
  22. use POE qw(@POE_LOOP@ Component::Server::TCP Filter::HTTPD Component::Client::HTTP);
  23. use Cache::Memcached::Fast;
  24. my $with_swf = 1;
  25. my $swf_parser;
  26. my $saved_swf_url = "";
  27. eval "require SWF::Element" or $with_swf = 0; # p5-SWF-File
  28. my $DEBUG = grep { $_ eq '-debug' } @ARGV;
  29. our %cfg = (
  30. port => 8080,
  31. max_size => 102400,
  32. http_timeout => 5,
  33. max_rec => 5,
  34. pidfile => '/tmp/redirector.pid',
  35. do_log => 0,
  36. debug => 0,
  37. memcached_servers => [ { address => 'localhost:11211', weight => 2.5 }, ],
  38. facility => LOG_LOCAL3, # syslog facility
  39. log_level => LOG_INFO,
  40. digest_bits => 256,
  41. cache_expire => 3600,
  42. user => '@RSPAMD_USER@',
  43. group => '@RSPAMD_GROUP@',
  44. cfg_file => '@CMAKE_INSTALL_PREFIX@/etc/rspamd-redirector.conf',
  45. );
  46. our $do_reopen_log = 0;
  47. our $memd;
  48. ############################################ Subs ########################################
  49. # Read file into string
  50. sub read_file {
  51. my ($file) = @_;
  52. open( IN, $file ) or _log( LOG_ALERT, "Can't open $file: $!" );
  53. local $/;
  54. my $content = <IN>;
  55. close IN;
  56. return $content;
  57. }
  58. # Write log line:
  59. sub _log {
  60. my ( $l, $w, @s ) = @_;
  61. if ($DEBUG) {
  62. printf STDERR $w . "\n", @s;
  63. }
  64. else {
  65. syslog( $l, $w . "\n", @s ) if ( $l <= $cfg{'log_level'} );
  66. }
  67. if ( $l == LOG_ALERT ) {
  68. die $w;
  69. }
  70. }
  71. # Init swf parser
  72. sub swf_init_parser {
  73. if ($with_swf) {
  74. $swf_parser = SWF::Parser->new( 'tag-callback' => \&swf_tag_callback );
  75. }
  76. }
  77. # Checking for SWF url
  78. sub swf_search_get_url {
  79. my $actions = shift;
  80. my $saved_pool_str = "";
  81. for my $action (@$actions) {
  82. if ( $action->tag_name eq 'ActionConstantPool' ) {
  83. my $pool = $action->ConstantPool;
  84. for my $string (@$pool) {
  85. if ( $string =~ /^https?:\/\// ) {
  86. $saved_pool_str = $string->value;
  87. }
  88. }
  89. }
  90. elsif ( $action->tag_name eq 'ActionGetURL2' ) {
  91. if ( $saved_pool_str ne "" ) {
  92. $saved_swf_url = $saved_pool_str;
  93. }
  94. }
  95. elsif ( $action->tag_name =~ 'ActionGetURL' ) {
  96. $saved_swf_url = $action->UrlString->value;
  97. }
  98. }
  99. }
  100. # SWF check tag utility
  101. sub swf_check_tag {
  102. my ( $t, $stream ) = @_;
  103. my ($tagname) = $t->tag_name;
  104. for ($tagname) {
  105. ( /^Do(Init)?Action$/ or /^DefineButton$/ ) and do {
  106. swf_search_get_url( $t->Actions );
  107. last;
  108. };
  109. /^PlaceObject2$/ and do {
  110. for my $ca ( @{ $t->ClipActions } ) {
  111. swf_search_get_url( $ca->Actions );
  112. }
  113. last;
  114. };
  115. /^DefineButton2$/ and do {
  116. for my $ba ( @{ $t->Actions } ) {
  117. swf_search_get_url( $ba->Actions );
  118. }
  119. last;
  120. };
  121. /^DefineSprite$/ and do {
  122. for my $tag ( @{ $t->ControlTags } ) {
  123. swf_search_get_url( $tag, $stream );
  124. }
  125. last;
  126. };
  127. }
  128. }
  129. # Callback for swf parser
  130. sub swf_tag_callback {
  131. my ( $self, $tag, $length, $stream ) = @_;
  132. my $t = SWF::Element::Tag->new( Tag => $tag, Length => $length );
  133. my ($tagname) = $t->tag_name;
  134. return
  135. unless $tagname eq 'DoAction'
  136. or $tagname eq 'DoInitAction'
  137. or $tagname eq 'PlaceObject2'
  138. or $tagname eq 'DefineButton'
  139. or $tagname eq 'DefineButton2'
  140. or $tagname eq 'DefineSprite';
  141. if ( $tagname eq 'DefineSprite' ) {
  142. # Tags in the sprite are not unpacked here.
  143. $t->shallow_unpack($stream);
  144. $t->TagStream->parse( callback => \&swf_tag_callback );
  145. return;
  146. }
  147. elsif ( $tagname eq 'PlaceObject2' ) {
  148. # Most of PlaceObject2 tags don't have ClipActions.
  149. $t->lookahead_Flags($stream);
  150. return unless $t->PlaceFlagHasClipActions;
  151. }
  152. # unpack the tag and search actions.
  153. $t->unpack($stream);
  154. swf_check_tag($t);
  155. }
  156. # Check url from memcached cache first
  157. sub memcached_check_url {
  158. my ($url) = @_;
  159. my $context = Digest->new("SHA-256");
  160. $context->add($url);
  161. return $memd->get( $context->digest() );
  162. }
  163. # Write url to memcached key
  164. sub memcached_cache_url {
  165. my ( $url, $url_real ) = @_;
  166. if ( $url ne $url_real ) {
  167. my $context = Digest->new("SHA-256");
  168. $context->add($url);
  169. if (!$memd->set( $context->digest(), $url_real, $cfg{cache_expire} )) {
  170. _log(LOG_INFO, "cannot save redirect from $url to $url_real in memcached");
  171. }
  172. }
  173. }
  174. sub create_response {
  175. my ( $code, $uri ) = @_;
  176. my $new_response;
  177. if ($uri) {
  178. $new_response = HTTP::Response->new( $code, 'OK' );
  179. $new_response->header( "Uri", $uri );
  180. $new_response->content($uri);
  181. $new_response->content_length( length($uri) );
  182. }
  183. else {
  184. $new_response = HTTP::Response->new($code);
  185. $new_response->content_length(0);
  186. }
  187. $new_response->header( "Connection", "Close" );
  188. $new_response->header( "Proxy-Connection", "Close" );
  189. return $new_response;
  190. }
  191. # POE http client callback
  192. sub process_client {
  193. my ( $kernel, $heap ) = @_[ KERNEL, HEAP ];
  194. my $http_request = $_[ARG0]->[0];
  195. my $rec = $_[ARG0]->[1][0];
  196. my $http_response = $_[ARG1]->[0];
  197. my $base_url = $_[ARG0]->[1][1];
  198. $saved_swf_url = "";
  199. if ( $rec == 0 ) {
  200. $base_url = $http_request->uri;
  201. }
  202. else {
  203. # Check cache for each url
  204. my $redirect = memcached_check_url( $http_request->uri );
  205. if ($redirect) {
  206. _log( LOG_INFO,
  207. "Memcached redirect from %s to %s for request from: %s",
  208. $http_response->base, $redirect, $heap->{remote_ip} );
  209. my $new_response = create_response( 200, $redirect );
  210. # Avoid sending the response if the client has gone away.
  211. $heap->{client}->put($new_response) if defined $heap->{client};
  212. # Shut down the client's connection when the response is sent.
  213. return;
  214. }
  215. }
  216. if ($do_reopen_log) {
  217. $do_reopen_log = 0;
  218. reopen_log();
  219. }
  220. if ( $rec > $cfg{max_rec} ) {
  221. _log( LOG_INFO,
  222. "Max recursion exceeded: %d from %s to %s for request from: %s",
  223. $rec, $base_url, $http_request->uri, $heap->{remote_ip} );
  224. # Write to cache
  225. memcached_cache_url( $base_url, $http_request->uri );
  226. my $new_response = create_response( 200, $http_request->uri );
  227. # Avoid sending the response if the client has gone away.
  228. $heap->{client}->put($new_response) if defined $heap->{client};
  229. # Shut down the client's connection when the response is sent.
  230. $kernel->yield("shutdown");
  231. return;
  232. }
  233. # Detect HTTP redirects
  234. if ( $http_response->is_redirect ) {
  235. my $redirect = $http_response->header('Location');
  236. if ($redirect) {
  237. if ( $redirect =~ /^https?:\/\// ) {
  238. _log( LOG_INFO,
  239. "HTML redirect from %s to %s for request from: %s",
  240. $http_response->base, $redirect, $heap->{remote_ip} );
  241. my $request = HTTP::Request->new( 'GET', $redirect );
  242. $request->header( "Connection", "close" );
  243. $request->header( "Proxy-Connection", "close" );
  244. $kernel->post( "cl", "request", "got_response", $request,
  245. [ $rec + 1, $base_url ] );
  246. return;
  247. }
  248. else {
  249. _log( LOG_INFO,
  250. "ignoring internal redirect from %s to %s for request from: %s",
  251. $http_request->uri,
  252. $redirect,
  253. $heap->{remote_ip}
  254. );
  255. my $new_response = create_response( 200, $http_request->uri );
  256. # Avoid sending the response if the client has gone away.
  257. $heap->{client}->put($new_response) if defined $heap->{client};
  258. # Shut down the client's connection when the response is sent.
  259. $kernel->yield("shutdown");
  260. return;
  261. }
  262. }
  263. }
  264. elsif ( $http_response->code != 200 ) {
  265. _log( LOG_INFO, "HTTP response was %d, for request to %s",
  266. $http_response->code, $http_request->uri );
  267. my $new_response;
  268. if ($rec == 0) {
  269. $new_response = create_response( $http_response->code );
  270. }
  271. else {
  272. memcached_cache_url( $base_url, $http_request->uri );
  273. $new_response = create_response( 200, $http_request->uri );
  274. }
  275. # Avoid sending the response if the client has gone away.
  276. $heap->{client}->put($new_response) if defined $heap->{client};
  277. # Shut down the client's connection when the response is sent.
  278. $kernel->yield("shutdown");
  279. return;
  280. }
  281. my $response_type = $http_response->content_type();
  282. if ( $response_type =~ /^text/i ) {
  283. my $content = $http_response->decoded_content();
  284. my $p = HTML::HeadParser->new($http_response);
  285. $p->parse($content);
  286. my $expire = $http_response->header('Refresh');
  287. if ( $http_response->is_redirect || $expire ) {
  288. my $redirect;
  289. if ($expire) {
  290. $expire =~ /URL=(\S+)/;
  291. $redirect = $1;
  292. }
  293. else {
  294. $redirect = $http_response->header('Location');
  295. }
  296. if ($redirect) {
  297. if ( $redirect =~ /^https?:\/\// ) {
  298. _log( LOG_INFO,
  299. "HTML redirect from %s to %s for request from: %s",
  300. $http_response->base,
  301. $redirect,
  302. $heap->{remote_ip}
  303. );
  304. my $request = HTTP::Request->new( 'GET', $redirect );
  305. $request->header( "Connection", "close" );
  306. $request->header( "Proxy-Connection", "close" );
  307. $kernel->post( "cl", "request", "got_response", $request,
  308. [ $rec + 1, $base_url ] );
  309. return;
  310. }
  311. else {
  312. _log( LOG_INFO,
  313. "ignoring internal redirect from %s to %s for request from: %s",
  314. $http_response->base,
  315. $redirect,
  316. $heap->{remote_ip}
  317. );
  318. }
  319. }
  320. }
  321. if ( $content =~ /location\s*=\s*["']*(https?:\/\/[^"'\s]+)["']*/im ) {
  322. my $redir = uri_unescape($1);
  323. _log( LOG_INFO, "js redirect from %s to %s for request from: %s",
  324. $http_response->base, $1, $heap->{remote_ip} );
  325. my $request = HTTP::Request->new( 'GET', $redir );
  326. $request->header( "Connection", "close" );
  327. $request->header( "Proxy-Connection", "close" );
  328. $kernel->post( "cl", "request", "got_response", $request,
  329. [ $rec + 1, $base_url ] );
  330. return;
  331. }
  332. }
  333. elsif (
  334. $with_swf
  335. && (
  336. $response_type eq 'application/x-shockwave-flash'
  337. || ( $http_request->uri =~ /\.swf(\?.*)?$/i
  338. && $http_response->code == 200 )
  339. )
  340. )
  341. {
  342. my $content = $http_response->decoded_content();
  343. $swf_parser->parse($content);
  344. if ( $saved_swf_url ne "" ) {
  345. _log( LOG_INFO, "flash redirect from %s to %s for request from: %s",
  346. $http_response->base, $saved_swf_url, $heap->{remote_ip} );
  347. my $request = HTTP::Request->new( 'GET', $saved_swf_url );
  348. # Reset swf redirect global variable
  349. $saved_swf_url = "";
  350. $request->header( "Connection", "close" );
  351. $request->header( "Proxy-Connection", "close" );
  352. $kernel->post( "cl", "request", "got_response", $request,
  353. [ $rec + 1, $base_url ] );
  354. return;
  355. }
  356. }
  357. else {
  358. _log( LOG_INFO,
  359. "response wasn't text request from: %s, response is: %s",
  360. $heap->{remote_ip}, $response_type );
  361. }
  362. _log( LOG_INFO, "redirect from %s to %s for request from: %s",
  363. $base_url, $http_request->uri, $heap->{remote_ip} );
  364. # Write to cache
  365. memcached_cache_url( $base_url, $http_request->uri );
  366. my $new_response =
  367. create_response( $http_response->code, $http_request->uri );
  368. # Avoid sending the response if the client has gone away.
  369. $heap->{client}->put($new_response) if defined $heap->{client};
  370. # Shut down the client's connection when the response is sent.
  371. $kernel->yield("shutdown");
  372. }
  373. sub process_input {
  374. my ( $kernel, $heap, $request ) = @_[ KERNEL, HEAP, ARG0 ];
  375. if ( $request->isa("HTTP::Response") ) {
  376. $heap->{client}->put($request);
  377. $kernel->yield("shutdown");
  378. return;
  379. }
  380. my $domain;
  381. if ( $request->uri =~ /^http:\/\/([^\/]+)\// ) {
  382. my @parts = split( /\./, $1 );
  383. my $c1 = pop @parts;
  384. my $c2 = pop @parts;
  385. $domain = "$c2.$c1";
  386. }
  387. if (
  388. (
  389. defined( $cfg{check_regexp} ) && $request->uri !~ $cfg{check_regexp}
  390. )
  391. || ( defined( $cfg{check_domains} )
  392. && scalar( grep { $_ eq $domain } @{ $cfg{check_domains} } ) == 0 )
  393. )
  394. {
  395. my $new_response = create_response( 200, $request->uri );
  396. # Avoid sending the response if the client has gone away.
  397. $heap->{client}->put($new_response) if defined $heap->{client};
  398. $kernel->yield("shutdown");
  399. # Shut down the client's connection when the response is sent.
  400. return;
  401. }
  402. # Check cache first
  403. my $redirect = memcached_check_url( $request->uri );
  404. if ($redirect) {
  405. _log( LOG_INFO, "Memcached redirect from %s to %s for request from: %s",
  406. $request->uri, $redirect, $heap->{remote_ip} );
  407. my $new_response = create_response( 200, $redirect );
  408. # Avoid sending the response if the client has gone away.
  409. $heap->{client}->put($new_response) if defined $heap->{client};
  410. $kernel->yield("shutdown");
  411. # Shut down the client's connection when the response is sent.
  412. return;
  413. }
  414. # Start http request
  415. my $new_request = HTTP::Request->new( 'GET', $request->uri );
  416. $new_request->header( "Connection", "close" );
  417. $new_request->header( "Proxy-Connection", "close" );
  418. $kernel->post( "cl", "request", "got_response", $new_request, [ 0, "" ] );
  419. }
  420. sub sig_DIE {
  421. my ( $sig, $ex ) = @_[ ARG0, ARG1 ];
  422. _log( LOG_ERR, "$$: error in $ex->{event}: $ex->{error_str}" );
  423. $poe_kernel->sig_handled();
  424. # Send the signal to session that sent the original event.
  425. if ( $ex->{source_session} ne $_[SESSION] ) {
  426. $poe_kernel->signal( $ex->{source_session}, 'DIE', $sig, $ex );
  427. }
  428. }
  429. sub sig_CLD {
  430. my ( $heap, $child_pid ) = @_[ HEAP, ARG1 ];
  431. return 0;
  432. }
  433. ############################### Main code fragment ##################################
  434. # Do daemonization
  435. if ( !$DEBUG ) {
  436. Proc::Daemon::Init;
  437. POE::Kernel->has_forked;
  438. setlogsock('unix');
  439. openlog( 'redirector', 'ndelay,pid', $cfg{'facility'} );
  440. }
  441. # Try to eval config file
  442. if ( -f $cfg{cfg_file} ) {
  443. my $config = read_file( $cfg{cfg_file} );
  444. eval $config;
  445. }
  446. _log( LOG_ALERT, "Process is already started, check $cfg{pidfile}" )
  447. if Proc::PidUtil::is_running( $cfg{pidfile} );
  448. # Drop privilleges
  449. if ( $> == 0 ) {
  450. my $uid = getpwnam( $cfg{user} )
  451. or _log( LOG_ALERT, "user $cfg{user} unknown" );
  452. my $gid = getgrnam( $cfg{group} )
  453. or _log( LOG_ALERT, "group $cfg{group} unknown" );
  454. $< = $> = $uid;
  455. $) = $( = $gid;
  456. }
  457. if ( !$DEBUG ) {
  458. _log( LOG_ALERT, "Cannot write to pidfile $cfg{pidfile}" )
  459. if !open( PID, "> $cfg{pidfile}" );
  460. close(PID);
  461. }
  462. # Reopen log on SIGUSR1
  463. $poe_kernel->sig( DIE => \&sig_DIE );
  464. $poe_kernel->sig( CLD => \&sig_CLD );
  465. $SIG{USR1} = sub { $do_reopen_log = 1; $poe_kernel->sig_handled(); };
  466. $SIG{INT} = sub { $poe_kernel->stop(); };
  467. $SIG{QUIT} = sub { $poe_kernel->stop(); };
  468. $SIG{PIPE} = 'IGNORE';
  469. if ( !$DEBUG ) {
  470. Proc::PidUtil::make_pidfile( $cfg{pidfile}, $$ )
  471. or _log( LOG_ALERT, "Cannot write pidfile $cfg{pidfile}" );
  472. }
  473. # Init memcached connection
  474. _log( LOG_INFO, "Starting memcached connection" );
  475. $memd = new Cache::Memcached::Fast(
  476. {
  477. servers => $cfg{memcached_servers},
  478. connect_timeout => 0.2,
  479. io_timeout => 0.5,
  480. max_failures => 3,
  481. failure_timeout => 2,
  482. ketama_points => 150,
  483. hash_namespace => 1,
  484. serialize_methods => [ \&Storable::freeze, \&Storable::thaw ],
  485. utf8 => ( $^V ge v5.8.1 ? 1 : 0 ),
  486. }
  487. );
  488. # POE part
  489. POE::Component::Client::HTTP->spawn(
  490. Alias => 'cl',
  491. MaxSize => $cfg{max_size}, # Remove for unlimited page sizes
  492. Timeout => $cfg{http_timeout},
  493. ConnectionManager => POE::Component::Client::Keepalive->new(
  494. max_per_host => 256,
  495. max_open => 1024,
  496. keep_alive => 1,
  497. timeout => $cfg{http_timeout},
  498. ),
  499. );
  500. _log( LOG_INFO, "Starting HTTP server" );
  501. POE::Component::Server::TCP->new(
  502. Alias => "",
  503. Port => $cfg{port},
  504. ClientFilter => 'POE::Filter::HTTPD',
  505. ClientInput => \&process_input,
  506. InlineStates => { got_response => \&process_client, },
  507. );
  508. swf_init_parser();
  509. _log( LOG_NOTICE, "Starting URL resolver" );
  510. # Start POE. This will run the server until it exits.
  511. POE::Kernel->run();
  512. exit 0;
  513. ############################## Final block ####################################
  514. END {
  515. _log( LOG_NOTICE, 'redirector stopped' );
  516. closelog();
  517. }