#!/usr/bin/perl use strict; use warnings; use DBI; use Data::Dumper; use PerlIO::gzip; use List::Util('first'); use POSIX qw[WNOHANG EAGAIN]; use Cache::FastMmap; my $LOGDIR='/var/log'; chdir($LOGDIR) or die "Could not chdir('$LOGDIR') - $!"; my $dirglob=$ARGV[0] || '*access_log*gz'; chomp(my @access_logs=glob($dirglob)); die "No files found for $dirglob glob" unless( @access_logs ); die "Too many files found for $dirglob glob" if ( @access_logs > 1020 +); # see get_num_pages my $cache_db = &get_wanted_domains; # now parse the logs my $cache_logs = Cache::FastMmap->new( init_file=>1, raw_values=>1, cache_not_found=>1, share_file=>'/tmp/cache_logs', # cache_size=>'5m', num_pages=>&get_num_pages(scalar( @access_logs ) ), ); my %kids; my $MAX_KIDS=30; ACCESS_LOGS: for my $access_log( @access_logs ) { # &wait_on_kids; # uncommenting this makes it consistent everyti +me my ($host)=$access_log=~/^([^\.]*)\./; $host=~s/_//; my ($date)=$access_log=~/(\d{8})/; my $fh=$host.'_'.$date; my $kid; if ( $kid=fork ) { $kids{$kid}++; warn "new \$kid=$kid, \$fh=$fh"; } elsif ( defined $kid ) { my $access_log_fh=do{ no strict 'refs'; \*{ $fh }; }; my %uniqs; open($access_log_fh,'<:gzip',$access_log) or die "Could not o +pen $access_log - $!\n"; while ( my $line=<$access_log_fh> ) { chomp( $line ); my $domain; # if we find a domain.html?domain=something.com entry i +n the log if ( $line=~m/domain\.html/ and ($domain)=$line=~m/doma +in=([\w\.\-_]*)\W/ ) { # if it's a wanted domain if ( my $is_wanted=$cache_db->get( $domain ) ) { $cache_logs->get_and_set($fh,sub{return ++$_[1];}); $uniqs{$domain}++; } } } close $access_log_fh; exit(0); } elsif ( EAGAIN == $! ) { warn "Couldnt fork for $fh - $!"; sleep 1; redo ACCESS_LOGS; } } # reap kids print "WAITING FOR KIDS\n"; &reap_kids; print "KIDS DONE\n"; # display the total hits # split the date from the host name my %results; for my $hostdate( sort $cache_logs->get_keys(0) ) { my(undef,$date)=$hostdate=~/^([^_]*_)(.*)/; my $hits=$cache_logs->get($hostdate); print "HOSTDATE: $hostdate, HITS: $hits\n"; $results{$date}+=$hits; } for my $date ( sort keys %results ) { print "DATE: $date, count: $results{$date} \n"; } sub get_wanted_domains { # actually does DB stuff, but return [ qw[abc.com efg.com hij.com] ]; } sub reap_kids { while ((my $kid = waitpid( -1, WNOHANG )) != -1) { print "\$kid $kid reaped\n" if $kid; select(undef,undef,undef,.01); } } sub wait_on_kids { while( scalar( keys %kids ) > $MAX_KIDS && (my $kid = waitpid( + -1, WNOHANG)) != -1 ) { if ( $kid ) { print "\$kid $kid waited on\n"; delete $kids{$kid}; # warn Dumper(\%kids); } select(undef,undef,undef,.01); } } sub get_num_pages { my $num_logs=shift; # from http://www.prime-numbers.org/prime-number-000-1024.htm my @primes=qw[ 2 3 5 7 11 13 17 19 23 29 31 37 41 43 47 53 59 61 67 71 73 79 83 89 97 101 103 107 109 113 127 131 137 139 149 151 157 163 167 173 179 181 191 193 197 199 211 223 227 229 233 239 241 251 257 263 269 271 277 281 283 293 307 311 313 317 331 337 347 349 353 359 367 373 379 383 389 397 401 409 419 421 431 433 439 443 449 457 461 463 467 479 487 491 499 503 509 521 523 541 547 557 563 569 571 577 587 593 599 601 607 613 617 619 631 641 643 647 653 659 661 673 677 683 691 701 709 719 727 733 739 743 751 757 761 769 773 787 797 809 811 821 823 827 829 839 853 857 859 863 877 881 883 887 907 911 919 929 937 941 947 953 967 971 977 983 991 997 1009 1013 1019 1021]; return first{$_>$num_logs}@primes; }
In reply to Too many children makes Cache::FastMmap inconsistent by bennymack
| For: | Use: | ||
| & | & | ||
| < | < | ||
| > | > | ||
| [ | [ | ||
| ] | ] |