#    1675147	http://download.windowsupdate.com/
#    531450	http://ads.icq.com/
#    472384	http://direct.ninemsn.com.au/
#    299397	http://rad.msn.com/
#    255706	http://web.icq.com/
#    255578	http://www.google.com.au/

## sample output above -- script source below ##

#!/usr/bin/perl

use strict;

# Directory searched first for squid access logs.  The discovery loop below
# falls back to /usr/local/squid/var/logs, and then asks the user.
my $SQUID_LOG_LOC = '/var/log/squid';
# File the per-site request counts are written to; the user is asked for
# another path if this one cannot be opened for writing.
my $OUTFILE       = '/tmp/log_parse.txt';
# Binary used to re-compress a .gz log after it has been read.
my $GZIP          = '/bin/gzip';
# Binary used to decompress a .gz log before it is read.
my $GUNZIP        = '/bin/gunzip';

# %urls:  request count keyed by "scheme://host" string.
# @files: paths of the access logs selected for processing.
my (%urls, @files);

# find the logfiles
#
# Loops until @files holds at least one squid access log found in
# $SQUID_LOG_LOC (access.log, access.log.N, each optionally with .gz).
# If the default location is unusable the other likely location is tried
# once, silently; after that the user is asked for a path.  An empty answer
# or end of input exits the program.

my $tried_fallback = 0;

while (1) {
    if ( -e $SQUID_LOG_LOC and -d $SQUID_LOG_LOC and -r $SQUID_LOG_LOC ) {
        # readdir rather than glob(): glob splits its pattern on whitespace,
        # so a directory name containing a space would never match anything.
        if ( opendir my $dh, $SQUID_LOG_LOC ) {
            my @names = grep {
                /access\.log(?:\.\d+)?(?:\.gz)?\z/i and -f "$SQUID_LOG_LOC/$_"
            } readdir $dh;
            closedir $dh;
            # sorted, to keep the stable ordering glob() used to provide
            @files = map { "$SQUID_LOG_LOC/$_" } sort @names;
        }
        else {
            print "Can't list $SQUID_LOG_LOC\nPerl says: $!\n";
        }
    }
    else {
        # take a punt on the other likely location of the squid logs
        # transparent to user....  Only done once, so that a user who types
        # the default path in again gets the diagnostics instead.
        if ( !$tried_fallback and $SQUID_LOG_LOC eq '/var/log/squid' ) {
            $tried_fallback = 1;
            $SQUID_LOG_LOC  = '/usr/local/squid/var/logs';
            next;
        }
        print "$SQUID_LOG_LOC does not exist!\n" unless -e $SQUID_LOG_LOC;
        print "$SQUID_LOG_LOC is not a directory!\n" unless -d $SQUID_LOG_LOC;
        print "$SQUID_LOG_LOC is not readable! Do you need to su?\n"  unless -r $SQUID_LOG_LOC;
    }
    last if @files;

    print "Did not find any squid logs in $SQUID_LOG_LOC.\nPlease Enter Path (Enter=Exit) ";
    # STDIN, not <>: the diamond operator would read from any file named on
    # the command line instead of from the terminal.
    my $answer = <STDIN>;
    exit unless defined $answer;            # end of input
    $answer =~ s/\A\s+|\s+\z//g;            # also removes the newline
    exit unless length $answer;             # a path of "0" is still a path
    # drop trailing slashes, but leave a bare "/" alone
    $answer =~ s{/+\z}{} unless $answer =~ m{\A/+\z};
    $SQUID_LOG_LOC = $answer;
}

# get a filehandle to output to
#
# Loops until OUT is open for writing on $OUTFILE, asking the user for a
# different path after each failure.  An empty answer or end of input exits.
# The handle stays the bareword OUT because the rest of the script prints
# to and closes it under that name.

while ( 1 ) {
    # three-arg open: the path is taken literally, so an answer beginning
    # with '>', '|' or similar cannot change the open mode
    last if open OUT, '>', $OUTFILE;
    print "\nCan't write output file $OUTFILE\nPerl says: $!\n";
    print "Where can I write to (Full Path, Enter=Exit)? ";
    # STDIN, not <>: the diamond operator would read from any file named on
    # the command line instead of from the terminal.
    my $answer = <STDIN>;
    exit unless defined $answer;            # end of input
    # two-arg open used to ignore surrounding whitespace; keep doing so
    $answer =~ s/\A\s+|\s+\z//g;
    exit unless length $answer;
    $OUTFILE = $answer;
}

# check for the gzip and gunzip binaries (only if we need them)
#
# When any selected log ends in .gz, keep prompting until BOTH $GZIP and
# $GUNZIP are executable.  An empty answer or end of input removes the
# (still empty) output file and exits.

if ( grep { m/\.gz$/ } @files ) {
    print "Gzip format logs detected.\n";

    # Read one path from the terminal; clean up and exit on Enter/EOF so a
    # closed STDIN cannot spin this loop forever.
    my $ask_path = sub {
        my $answer = <STDIN>;
        $answer = '' unless defined $answer;
        $answer =~ s/\A\s+|\s+\z//g;
        unless ( length $answer ) {
            close OUT; unlink $OUTFILE;
            exit;
        }
        return $answer;
    };

    # 'or', not 'and': the loop must continue while EITHER binary is
    # unusable, otherwise one good path lets a bad one through.
    while ( ! -x $GZIP or ! -x $GUNZIP ) {
        unless ( -x $GZIP ) {
            if ( -e $GZIP ) {
                 print "You don't have the perms to exec $GZIP! ?su\n";
            } else {
                 print "GZIP binary does not exist at $GZIP\n";
            }
            print "Path to gzip is usually /bin/gzip /usr/bin/gzip /usr/local/bin/gzip\nPath (Enter=Exit): ";
            $GZIP = $ask_path->();
        }
        unless ( -x $GUNZIP ) {
            if ( -e $GUNZIP ) {
                 print "You don't have the perms to exec $GUNZIP! ?su\n";
            } else {
                 print "GUNZIP binary does not exist at $GUNZIP\n";
            }
            print "Path to gunzip usually /bin/gunzip /usr/bin/gunzip /usr/local/bin/gunzip\nPath (Enter=Exit): ";
            $GUNZIP = $ask_path->();
        }
    }
}


# Finally we get to actually do the parse and make the count
#
# For every file in @files: decompress it in place if it ends in .gz, count
# one hit per request in %urls (keyed by "scheme://host"), then compress it
# again.  A log that cannot be decompressed is skipped; a log that cannot be
# re-compressed aborts the run, after removing the output file.

# Describe why the most recent system() failed.  Must be called immediately
# afterwards, while $? and $! still refer to that call.
my $describe_failure = sub {
    return "could not execute: $!"         if $? == -1;
    return 'killed by signal ' . ( $? & 127 ) if $? & 127;
    return 'exit status ' . ( $? >> 8 );
};

for my $file (@files) {
    print "Processing $file\n";
    my $gz    = $file =~ m/\.gz$/ ? 1 : 0;
    my $plain = $file;                      # name of the uncompressed log

    if ( $gz ) {
        print "  Gunzipping $file\n";
        # list-form system: no shell, so spaces or metacharacters in the
        # file name cannot be misread as extra arguments or commands
        if ( system( $GUNZIP, $file ) != 0 ) {
            my $why = $describe_failure->();
            print "*****$GUNZIP $file error.\nPerhaps the path to gunzip is wrong?\nError: $why\n*****Skipping $file\n";
            next;
        }
        $plain =~ s/\.gz$//;
    }

    # Remember an open failure instead of dying on the spot, so that a log
    # we have just decompressed is still compressed again below.
    my $opened     = open my $log, '<', $plain;
    my $open_error = $!;

    if ( $opened ) {
        while ( my $line = <$log> ) {
            # [^/\s]+ : stop at whitespace as well as '/', so a URL with no
            # path cannot swallow the following log fields
            if ( $line =~ m!(?:GET|POST)\s+(https?://[^/\s]+)! ) {
                $urls{$1}++;
            } elsif ( $line =~ m!([A-Za-z\.0-9\-]+):443! ) {
                $urls{"https://$1"}++;
            }
        }
        close $log;
    }

    if ( $gz ) {
        print "  Gzipping $plain to restore to original state\n";
        if ( system( $GZIP, $plain ) != 0 ) {
            # we have a gzip error code
            # as we must have successfully gunzipped to get here
            # this is a BIG problem as if we continue we will probably
            # gunzip a whole lot of logs but not gzip them back again
            # this might fill up the disk and make us unpopular so let's abort
            # first let's clean up nicely....
            my $why = $describe_failure->();
            close OUT; unlink $OUTFILE;
            die "*****Failed to GZIP $plain!\nError: $why\n\nAborting!\n";
        }
    }

    die "Can't read $plain, Perl says $open_error" unless $opened;
}

# output and wrap it up
#
# Writes one "count<TAB>site" line per key of %urls to OUT, highest count
# first; equal counts are ordered by name so repeated runs over the same
# logs produce identical files.  Write errors are fatal: buffered output
# can fail as late as close (for example on a full disk), and reporting
# "Done" over a truncated file would be misleading.
for my $site ( sort { $urls{$b} <=> $urls{$a} or $a cmp $b } keys %urls ) {
    print OUT "$urls{$site}\t$site\n"
        or die "Can't write to $OUTFILE, Perl says $!\n";
}
close OUT or die "Can't finish writing $OUTFILE, Perl says $!\n";
print "Done. Got " . scalar(keys %urls) . " unique domains\nWrote $OUTFILE\n";