in reply to Optimise the script
Given that you're Anonymous, I doubt this result will ever be found. However, in some of my spare time I put together a script that indexes the Apache log file, as I suggested in my post to you.
I also converted the script to use DateTime instead of Date::Manip as the former is faster.
Note: the first run took about 41 minutes per gigabyte of data on my development machine, but any subsequent run takes a negligible amount of time, because the data is ordered and indexed.
#!/usr/bin/perl
#
# Copy every access-log entry for one calendar day (ten days ago) from
# $infile into $outfile.
#
# The log is assumed to be in chronological order.  The first time a given
# day is seen, a "<YYYYMMDD>\t<byte offset>" line is appended to $indexfile,
# where the offset is the position in $infile of that day's first entry.
# Later runs seek straight to the recorded offset instead of re-parsing the
# whole log, and when the following day is indexed as well they copy lines
# without parsing any timestamps at all.

use DateTime;
use Fcntl qw(:seek);

use strict;
use warnings;

my $infile    = 'access_log';
my $indexfile = $infile . '.pst';
my $outfile   = 'file.txt';

# Zone in which day boundaries are evaluated.  The search date and every
# parsed log timestamp must use the same zone, otherwise the "day" being
# searched for can differ from the "day" assigned to log lines.
my $time_zone = 'America/Los_Angeles';

# Month abbreviation => month number (Jan => 1 ... Dec => 12).
my %mon = do {
    my $i = 1;
    map { $_ => $i++ } qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
};

my $search_date = DateTime->now(time_zone => $time_zone)
                          ->subtract(days => 10)
                          ->strftime("%Y%m%d");

print "Search date is $search_date\n";

# Load the index.
#   @index_last  - last (day, offset) entry in the index; ('', 0) if none
#   @index_start - entry for the search date, if it is indexed
#   @index_stop  - entry immediately following @index_start, if any
my @index_last = ('', 0);
my @index_start;
my @index_stop;

if (-e $indexfile) {
    open my $index_in, '<', $indexfile or die "$indexfile: $!";

    while (my $entry = <$index_in>) {
        chomp $entry;

        # Ignore blank or truncated lines (e.g. from an interrupted write).
        next unless $entry =~ /\A\d{8}\t\d+\z/;

        @index_last  = split /\t/, $entry;
        @index_stop  = @index_last if @index_start && !@index_stop;
        @index_start = @index_last if $index_last[0] eq $search_date;
    }

    close $index_in;
}

open my $out, '>', $outfile or die "$outfile: $!";
open my $in,  '<', $infile  or die "$infile: $!";

# Resume at the search date when it is indexed, otherwise at the last
# indexed day (or the start of the file when there is no index yet).
my ($lastday, $index) = @index_start ? @index_start : @index_last;
seek $in, $index, SEEK_SET or die "$infile: seek to $index: $!";

# Days are fixed-width YYYYMMDD strings, so string comparison (gt) orders
# them chronologically and is also safe against the '' "no index" sentinel,
# which would trigger a "not numeric" warning under a numeric comparison.
LINE:
while (my $line = <$in>) {
    my $day;

    if (@index_stop) {
        # Both ends of the search date are indexed: every line between the
        # two offsets belongs to it, so there is no need to parse the date.
        last LINE if $index >= $index_stop[1];
        $day = $search_date;
    }
    elsif ($line =~ m{\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+)\s+([+-]\d+)}
        && exists $mon{$2})
    {
        my $dt = DateTime->new(
            year      => $3,
            month     => $mon{$2},
            day       => $1,
            hour      => $4,
            minute    => $5,
            second    => $6,
            time_zone => $7,
        );
        $dt->set_time_zone($time_zone);
        $day = $dt->strftime("%Y%m%d");
    }
    else {
        warn "Invalid date on: $line";
        next LINE;
    }

    # First entry of a day not yet in the index: record where it starts.
    if ($day ne $lastday && $day gt $index_last[0]) {
        @index_last = ($day, $index);
        open my $index_out, '>>', $indexfile or die "$indexfile: $!";
        print {$index_out} join("\t", @index_last), "\n";
        close $index_out or die "$indexfile: $!";
    }

    # The log is ordered, so nothing after this point can match.  Checked
    # on every line so that a scan resumed on a day already past the search
    # date stops immediately rather than at the next day boundary.
    last LINE if $day gt $search_date;

    if ($day ne $lastday) {
        print "Processing $day, $index on " . scalar(localtime) . "\n";
        $lastday = $day;
    }

    # Matches search date: do whatever is needed with the entry.
    print {$out} $line if $day eq $search_date;
}
continue {
    # Runs after "next" as well, so $index always holds the offset of the
    # line about to be read, even when the previous line was unparsable.
    $index = tell $in;
}

close $in;
close $out or die "$outfile: $!";

__END__
|
|---|