Apologies for this rather OT post. We are currently involved in rolling out a Perl-based Web filtration system into the UK market. One of the required tasks is to validate the coverage of the existing classification DB. The most efficient way to do this is via access_log analysis, i.e. we need to ensure we have full coverage of the most commonly visited sites in the UK.
We have a little Perl squid log parser that generates pretty anonymous data like:
# 1675147 http://download.windowsupdate.com/
# 531450 http://ads.icq.com/
# 472384 http://direct.ninemsn.com.au/
# 299397 http://rad.msn.com/
# 255706 http://web.icq.com/
# 255578 http://www.google.com.au/
If there are any monks who work for UK ISPs, I would love to hear from you about who we should contact to try to arrange access to this data. Finding the right person to talk to is proving a little more difficult than expected.
cheers
tachyon
#!/usr/bin/perl
# Parse squid access logs and count hits per domain, writing a
# "count<TAB>url" report (most-visited first) to $OUTFILE.
use strict;
use warnings;    # added: catch typos and undef interpolation early

my $SQUID_LOG_LOC = '/var/log/squid';        # first place to look for the logs
my $OUTFILE       = '/tmp/log_parse.txt';    # where the report is written
my $GZIP          = '/bin/gzip';             # used to re-compress rotated logs
my $GUNZIP        = '/bin/gunzip';           # used to unpack rotated logs
my (%urls, @files);    # %urls: domain => hit count; @files: log files to parse
# Locate the squid access logs. Falls back to the other common install
# location, then prompts the user for a path until logs are found
# (Enter alone exits).
while (1) {
    if ( -e $SQUID_LOG_LOC and -d $SQUID_LOG_LOC and -r $SQUID_LOG_LOC ) {
        # Match access.log, access.log.N and access.log.N.gz rotations.
        # Fixed: the dot in "access.log" was unescaped and matched any
        # character, so e.g. "access_log_backup" style names slipped in.
        @files = grep { /access\.log(?:\.\d+)?(?:\.gz)?$/i } glob("$SQUID_LOG_LOC/*");
    }
    else {
        # take a punt on the other likely location of the squid logs
        # transparent to user....
        do { $SQUID_LOG_LOC = '/usr/local/squid/var/logs'; next } if $SQUID_LOG_LOC eq '/var/log/squid';
        print "$SQUID_LOG_LOC does not exist!\n" unless -e $SQUID_LOG_LOC;
        print "$SQUID_LOG_LOC is not a directory!\n" unless -d $SQUID_LOG_LOC;
        print "$SQUID_LOG_LOC is not readable! Do you need to su?\n" unless -r $SQUID_LOG_LOC;
    }
    last if @files;
    print "Did not find any squid logs in $SQUID_LOG_LOC.\nPlease Enter Path (Enter=Exit) ";
    # Read the prompt answer from STDIN explicitly: bare <> would treat
    # any command-line argument as a file to read instead.
    chomp( $SQUID_LOG_LOC = <STDIN> );
    exit unless $SQUID_LOG_LOC;
    $SQUID_LOG_LOC =~ s!/\s*$!!;    # strip a trailing slash (and stray whitespace)
}
# Get a filehandle to output to, prompting for an alternative path if
# the default is not writable (Enter alone exits). The bareword handle
# OUT is deliberate: the rest of the script prints to it.
while (1) {
    # Fixed: three-arg open so characters in $OUTFILE can never be
    # interpreted as an open() mode (the old ">$OUTFILE" two-arg form).
    last if open OUT, '>', $OUTFILE;
    print "\nCan't write output file $OUTFILE\nPerl says: $!\n";
    print "Where can I write to (Full Path, Enter=Exit)? ";
    # Read from STDIN explicitly rather than <>, which would consume @ARGV.
    chomp( $OUTFILE = <STDIN> );
    exit unless $OUTFILE;
}
# check for the gzip and gunzip binaries (only if we need them)
if ( grep { m/\.gz$/ } @files ) {
    print "Gzip format logs detected.\n";
    # Fixed: the original condition used 'and', which ended the loop as
    # soon as EITHER binary was executable. We need BOTH gzip and gunzip,
    # so keep prompting while either one is still missing.
    while ( ! -x $GZIP or ! -x $GUNZIP ) {
        unless ( -x $GZIP ) {
            if ( -e $GZIP ) {
                print "You don't have the perms to exec $GZIP! ?su\n";
            } else {
                print "GZIP binary does not exist at $GZIP\n";
            }
            print "Path to gzip is usually /bin/gzip /usr/bin/gzip /usr/local/bin/gzip\nPath: ";
            chomp( $GZIP = <STDIN> );
        }
        unless ( -x $GUNZIP ) {
            if ( -e $GUNZIP ) {
                print "You don't have the perms to exec $GUNZIP! ?su\n";
            } else {
                print "GUNZIP binary does not exist at $GUNZIP\n";
            }
            print "Path to gunzip usually /bin/gunzip /usr/bin/gunzip /usr/local/bin/gunzip\nPath: ";
            chomp( $GUNZIP = <STDIN> );
        }
    }
}
# Finally we get to actually do the parse and make the count.
# For each log file: gunzip it if rotated, tally the domain of every
# request line into %urls, then re-gzip it to restore the original state.
for my $file (@files) {
    print "Processing $file\n";
    my $gz = $file =~ m/\.gz$/ ? 1 : 0;
    if ( $gz ) {
        print " Gunzipping $file\n";
        # Fixed: list-form system() bypasses the shell, so metacharacters
        # in a log filename cannot be interpreted as shell syntax.
        if ( system($GUNZIP, $file) ) {
            # we got an error code
            print "*****$GUNZIP $file error.\nPerhaps the path to gunzip is wrong?\nError: $!\n$@\n*****Skipping $file\n";
            next;
        }
        # gunzip drops the .gz suffix; $file aliases the @files element,
        # so the stripped name is what we open below.
        $file =~ s/\.gz$//;
    }
    # Fixed: three-arg open with a lexical handle (no mode injection,
    # handle scoped to this loop body).
    open my $log_fh, '<', $file or die "Can't read $file, Perl says $!";
    while (<$log_fh>) {
        if ( m!(?:GET|POST)\s+(https?://[^/]+)! ) {
            # plain HTTP(S) request line - count the scheme://host part
            $urls{$1}++;
        } elsif ( m!([A-Za-z\.0-9\-]+):443! ) {
            # CONNECT-style entry to port 443 - record the host as https
            $urls{"https://$1"}++;
        }
    }
    close $log_fh;
    if ( $gz ) {
        print " Gzipping $file to restore to original state\n";
        if ( system($GZIP, $file) ) {
            # we have a gzip error code
            # as we must have successfully gunzipped to get here
            # this is a BIG problem as if we continue we will probably
            # gunzip a whole lot of logs but not gzip them back again
            # this might fill up the disk and make us unpopular so let's abort
            # first let's clean up nicely....
            close OUT; unlink $OUTFILE;
            # Fixed: the original message interpolated "Error $!\$@",
            # printing a literal "$@" instead of the eval error.
            die "*****Failed to GZIP $file!\nError: $!\n$@\nAborting!\n";
        }
    }
}
# output and wrap it up: one "count<TAB>url" line per domain,
# most frequently visited first
print OUT "$urls{$_}\t$_\n" for sort { $urls{$b} <=> $urls{$a} } keys %urls;
# Fixed: check close() on the write handle - buffered write errors
# (e.g. disk full) only surface here, and were silently lost before.
close OUT or die "Can't close $OUTFILE: $!\n";
print "Done. Got " . scalar(keys %urls) . " unique domains\nWrote $OUTFILE\n";