#!/usr/bin/perl
#
# Count how often each site appears in the squid access logs and write a
# report of "count<TAB>site" lines, most-requested first, to an output file.
#
# Example report lines kept from the original header:
#
#   1675147 http://download.windowsupdate.com/
#   531450  http://ads.icq.com/
#   472384  http://direct.ninemsn.com.au/
#   299397  http://rad.msn.com/
#   255706  http://web.icq.com/
#   255578  http://www.google.com.au/
#
####

use strict;
use warnings;

$| = 1;    # unbuffered STDOUT so prompts without a newline show up at once

my $SQUID_LOG_LOC = '/var/log/squid';
my $OUTFILE       = '/tmp/log_parse.txt';
my $GZIP          = '/bin/gzip';
my $GUNZIP        = '/bin/gunzip';

my ( %urls, @files );

# Print $message, read one line from STDIN and return it without the
# trailing newline.  Returns '' at end of input so callers can treat EOF the
# same way as a bare Enter.
sub prompt_for {
    my ($message) = @_;
    print $message;
    my $answer = <STDIN>;
    return '' unless defined $answer;
    chomp $answer;
    return $answer;
}

# Turn the wait status left in $? by system() into readable text.
# $os_error is the value of $! captured right after the call; it is only
# meaningful when the command could not be started at all ($? == -1).
sub child_status {
    my ( $status, $os_error ) = @_;
    return "could not run command: $os_error" if $status == -1;
    return 'killed by signal ' . ( $status & 127 ) if $status & 127;
    return 'exit code ' . ( $status >> 8 );
}

# Close and delete the (incomplete) report, then die with $message.
sub abort_run {
    my ( $out_fh, $outfile, $message ) = @_;
    close $out_fh;
    unlink $outfile;
    die $message;
}

# find the logfiles
while (1) {
    if ( -e $SQUID_LOG_LOC and -d $SQUID_LOG_LOC and -r $SQUID_LOG_LOC ) {

        # readdir instead of glob: glob splits its pattern on whitespace, so
        # a directory name containing a space would never match anything.
        if ( opendir my $dh, $SQUID_LOG_LOC ) {
            my @names = grep {
                !/\A\./ and /access\.log(?:\.\d+)?(?:\.gz)?\z/i
            } readdir $dh;
            closedir $dh;
            @files = map { "$SQUID_LOG_LOC/$_" } sort @names;
        }
        else {
            print "Can't list $SQUID_LOG_LOC\nPerl says: $!\n";
        }
    }
    elsif ( $SQUID_LOG_LOC eq '/var/log/squid' ) {

        # take a punt on the other likely location of the squid logs
        # transparent to user....
        $SQUID_LOG_LOC = '/usr/local/squid/var/logs';
        next;
    }
    else {
        print "$SQUID_LOG_LOC does not exist!\n"     unless -e $SQUID_LOG_LOC;
        print "$SQUID_LOG_LOC is not a directory!\n" unless -d $SQUID_LOG_LOC;
        print "$SQUID_LOG_LOC is not readable! Do you need to su?\n"
            unless -r $SQUID_LOG_LOC;
    }
    last if @files;

    $SQUID_LOG_LOC = prompt_for(
        "Did not find any squid logs in $SQUID_LOG_LOC.\nPlease Enter Path (Enter=Exit) "
    );
    exit if $SQUID_LOG_LOC eq '';
    $SQUID_LOG_LOC =~ s!/\s*$!!;
}

# get a filehandle to output to
my $out_fh;
while (1) {
    last if open $out_fh, '>', $OUTFILE;
    print "\nCan't write output file $OUTFILE\nPerl says: $!\n";
    $OUTFILE = prompt_for("Where can I write to (Full Path, Enter=Exit)? ");
    exit if $OUTFILE eq '';
}

# check for the gzip and gunzip binaries (only if we need them)
if ( grep { m/\.gz$/ } @files ) {
    print "Gzip format logs detected.\n";

    # Keep asking until BOTH tools are executable: gunzip is needed to read
    # the logs and gzip to put them back afterwards.
    while ( !-x $GZIP or !-x $GUNZIP ) {
        unless ( -x $GZIP ) {
            if ( -e $GZIP ) {
                print "You don't have the perms to exec $GZIP! ?su\n";
            }
            else {
                print "GZIP binary does not exist at $GZIP\n";
            }
            $GZIP = prompt_for(
                "Path to gzip is usually /bin/gzip /usr/bin/gzip /usr/local/bin/gzip\nPath: "
            );

            # An empty answer (or EOF) can never become executable; stop
            # instead of prompting forever.
            abort_run( $out_fh, $OUTFILE, "No path to gzip given.\n\nAborting!\n" )
                if $GZIP eq '';
        }
        unless ( -x $GUNZIP ) {
            if ( -e $GUNZIP ) {
                print "You don't have the perms to exec $GUNZIP! ?su\n";
            }
            else {
                print "GUNZIP binary does not exist at $GUNZIP\n";
            }
            $GUNZIP = prompt_for(
                "Path to gunzip usually /bin/gunzip /usr/bin/gunzip /usr/local/bin/gunzip\nPath: "
            );
            abort_run( $out_fh, $OUTFILE, "No path to gunzip given.\n\nAborting!\n" )
                if $GUNZIP eq '';
        }
    }
}

# Finally we get to actually do the parse and make the count
for my $file (@files) {
    print "Processing $file\n";

    my $gz    = $file =~ m/\.gz$/ ? 1 : 0;
    my $plain = $file;    # name of the uncompressed log we actually read

    if ($gz) {
        print " Gunzipping $file\n";

        # List-form system(): no shell is involved, so odd characters in the
        # path cannot be misinterpreted.
        if ( system( $GUNZIP, $file ) ) {    # we got an error code
            my $why = child_status( $?, $! );
            print "*****$GUNZIP $file error.\nPerhaps the path to gunzip is wrong?\nError: $why\n*****Skipping $file\n";
            next;
        }
        $plain =~ s/\.gz$//;
    }

    my $opened     = open my $log_fh, '<', $plain;
    my $open_error = $!;

    if ($opened) {
        while ( my $line = <$log_fh> ) {
            if ( $line =~ m!(?:GET|POST)\s+(https?://[^/\s]+)! ) {
                $urls{$1}++;
            }
            elsif ( $line =~ m!([A-Za-z\.0-9\-]+):443! ) {

                # presumably CONNECT requests, which carry host:port only
                $urls{"https://$1"}++;
            }
        }
        close $log_fh;
    }

    if ($gz) {
        print " Gzipping $plain to restore to original state\n";
        if ( system( $GZIP, $plain ) ) {

            # we have a gzip error code
            # as we must have successfully gunzipped to get here
            # this is a BIG problem as if we continue we will probably
            # gunzip a whole lot of logs but not gzip them back again
            # this might fill up the disk and make us unpopular so let's abort
            # first let's clean up nicely....
            my $why = child_status( $?, $! );
            abort_run( $out_fh, $OUTFILE,
                "*****Failed to GZIP $plain!\nError: $why\n\nAborting!\n" );
        }
    }

    # Reported only after the log has been re-compressed, so a read failure
    # does not leave an uncompressed copy behind.
    die "Can't read $plain, Perl says $open_error" unless $opened;
}

# output and wrap it up
print {$out_fh} "$urls{$_}\t$_\n"
    for sort { $urls{$b} <=> $urls{$a} or $a cmp $b } keys %urls;

# buffered write errors only surface at close time
close $out_fh or die "Can't finish writing $OUTFILE, Perl says $!";

print "Done. Got " . scalar( keys %urls ) . " unique domains\nWrote $OUTFILE\n";