Have you ever had to process logs for a webserver? If you've tried using AWstats or similar, you'll be aware that DNS-resolving your logs is a horrendously slow process - there are just too many broken reverse lookups out there.
AWstats has the ability to use a local DNS cache file to speed up its resolution. This script was written to build a local cache file, and allow AWstats to do its processing at a sensible speed.
For relative comparison, AWstats runs at approx 1700 lines/second without resolving. Approx 1000 lines/sec when using this cache file and approx 100 lines/sec if it's having to resolve DNS.
It's been set up to take a batch of files, and cache the IP addresses from all of them. This is so I can take the 30-odd logs for a month, and then process them all at once without having to duplicate work.
The semaphore is the slow bit, since that's an 'exclusive' section. It's a lot quicker now than it was when I used a hash and delete()'ed each key from the hash as I resolved it.
#!/usr/local/bin/perl
use strict;
use warnings;
use IO::Socket::INET;
use threads;
use threads::shared;
use Net::DNS;
use Thread::Semaphore;
my %resolved : shared;
my %to_resolve;
my @list : shared;
my $count = 0;
my $count_array : shared;
my @threads;
my $semaphore = new Thread::Semaphore;
use constant THREAD_COUNT => 20;
open ( OUTPUT, ">cache.txt" ) or die $!;
#no error checking, just assumes that all the files listed are correct
+.
foreach my $file ( @ARGV )
{
print "starting with file $file\n";
open ( INPUT, $file ) or die $!;
print "reading in IP addresses...\n";
my $start_read = time();
$count = 0;
while (my $line = <INPUT> )
{
$count++;
my ( $address, @stuff ) = split(" ", $line);
$to_resolve{$address} = 1;
if ( $count % 32768 == 0 ) { print "$count\n" };
}
print "$count lines read in ", time() - $start_read, "s\n";
my $start_resolve = time();
@list = keys ( %to_resolve );
print "$#list ips to resolve\n";
$count_array=0;
for ( my $loop = 0; ($loop < THREAD_COUNT and $loop < $#list / 20);
+$loop ++ )
{
print "starting thread $loop...";
$threads[$loop] = threads -> new ( \&run_resolver, $loop, $start_r
+esolve );
print "done\n";
}
for ( my $loop = 0; ( $loop < THREAD_COUNT and $loop < $#list / 20 )
+; $loop++ )
{
print "waiting for thread $loop...";
$threads[$loop] -> join;
print "done.\n";
}
print "resolve of $#list ip addresses complete in ", time() - $start
+_resolve,"s\n";
print "writing file...\n";
my $start_write = time();
foreach my $address ( keys ( %resolved ) )
{
print OUTPUT time(), " ", $address, " ", $resolved{$address},"\n";
}
print "output file written in ", time() - $start_write, "s\n";
close ( INPUT );
}#foreach
close ( OUTPUT );
sub run_resolver
{
my ( $number, $start_time ) = @_;
my $resolver = new Net::DNS::Resolver;
my $query;
my $addr;
my $finished = 0;
my $count = 0;
sleep 1;
#while ( $addr = (keys ( %to_resolve ) )[0] )
while ( not $finished )
{
$count++;
#print "$number waiting for lock\n";
$semaphore -> down;
#print "$number got lock\n";
unless ( $addr = $list[$count_array++] )
{ $finished = 1; $semaphore -> up; next;}
#print "$number checking for ip validity..\n";
unless ( $addr =~ m/\d+\.\d+\.\d+\.\d+/ )
{
print "\'$addr\' is invalid, releasing lock and skipping.\n";
$semaphore -> up;
next;
}
#print "$number got addr of $addr $count_array / $#list\n";
$semaphore -> up;
#print "$number releasing lock\n";
#print "$number finding $addr...\n";
$query = $resolver -> search ( $addr );
if ( $query )
{
$resolved{$addr} = $addr;
foreach my $RR ( $query -> answer )
{
next unless $RR -> type eq "PTR";
$resolved{$addr} = $RR -> rdatastr;
}
}
else
{
$resolved{$addr} = $addr;
}
#if ( $resolved{$addr} )
# { print "-> $number $addr = $resolved{$addr}, $count ($count_ar
+ray / $#list )\n";}
if ( $count_array % 1024 == 0 ) { print "$number: ", time() - $star
+t_time , "s :$count_array / $#list resolved.\n"; }
} # while
}