#!/usr/local/bin/perl -w
#
# Benchmark three ways of sorting URLs by their reversed host/domain
# components (so "http://rand.web.poo.koo/x" sorts under "poo_koo_web_rand"):
#
#   GRT     - Guttman-Rosler Transform, substr()/index() flavour
#   GRT UK  - Guttman-Rosler Transform, regex flavour
#   ST      - Schwartzian Transform
#
# URLs are read from a Netscape browser history file; some statistics
# about the URL lengths are printed before the benchmark runs.

use strict;
use warnings;

#use sort '_quicksort';
#use sort '_mergesort';
#use sort 'stable';

use Benchmark qw(cmpthese);

# (global) url & page title hash
my %hist;

# fill %hist
fill();

# print some url statistics
length_stat( [ keys %hist ] );

#printf "\nCurrent sort: %s\n" , sort::current();

# Run each sorter repeatedly for at least 5 CPU seconds and compare rates.
cmpthese( -5,
    {   'GRT'    => \&guttman_rosler,
        'GRT UK' => \&guttman_rosler_uk,
        'ST'     => \&schwartz,
    }
);

# Guttman-Rosler Transform: pack the sort key and the original value into
# one string ("key $; url"), sort with the default (fast) string sort, then
# strip the key off again.  Host parsing is done with substr()/index().
sub guttman_rosler {
    #print STDERR +($hist{$_} , "\n", $_ , "\n\n")
    print STDERR +( $_, "\n\n" )
        foreach
        map { substr( $_, index( $_, $; ) + 1 ) }
        sort
        map {
            # extract host/domain components in reverse order
            my $scheme_end = index( $_, '://' ) + 3;
            my @host = reverse split '\.',
                ( split '/', substr( $_, $scheme_end ), 2 )[0];
            my $items = scalar @host;

            # put domain at front, then host parts if any, then everything else
            lc(
                # poo.koo -> poo_koo
                ( $items > 1 ? $host[1] . '_' : '' ) . $host[0]

                # rand.web.poo.koo -> poo_koo_web_rand
                . ( $items < 3
                    ? ''
                    : '_' . join( '_', @host[ 2 .. $items - 1 ] )
                  )

                # http://rand.web.poo.koo -> poo_koo_web_rand_http://
                . '_' . substr( $_, 0, $scheme_end )

                # http://rand.web.poo.koo/blah/blah
                #   -> poo_koo_web_rand_http:///blah/blah
                . substr( $_, length( join '_', @host ) + $scheme_end )
            )
            . $; . $_
        } keys %hist;

    return;
}

# Guttman-Rosler Transform -- uses regex instead of substr()/index()
sub guttman_rosler_uk {
    #print STDERR +($hist{$_} , "\n", $_ , "\n\n")
    print STDERR +( $_, "\n\n" )
        foreach
        map { substr( $_, index( $_, $; ) + 1 ) }
        sort
        map {
            # $1 = scheme (may be absent), $2 = host part up to first '/'
            m!^ (?: (\w+):// )? ([^/]+) !x;
            my @host  = reverse split '\.', $2;
            my $items = scalar @host;

            # put domain at front; everything else afterwards
            lc(
                ( $items > 1 ? $host[1] . '_' : '' ) . $host[0]
                . ( $items < 3
                    ? ''
                    : '_' . join '_', @host[ 2 .. $items - 1 ]
                  )
                . '_' . ( $1 || '' )
                . substr( $_, index( $_, $2 ) + length $2 )
            )
            . $; . $_;
        } keys %hist;

    return;
}

# Schwartzian Transform: wrap each URL in a [ url, key ] pair, sort on the
# key (ties broken by the URL itself), then unwrap.
sub schwartz {
    #print STDERR +($hist{$_} , "\n", $_ , "\n\n")
    print STDERR +( $_, "\n\n" )
        foreach
        map { $_->[0] }
        sort { $a->[1] cmp $b->[1] || $a->[0] cmp $b->[0] }
        map {
            # extract host/domain components in reverse order
            my $scheme_end = index( $_, '://' ) + 3;
            my @host = reverse split '\.',
                ( split '/', substr( $_, $scheme_end ), 2 )[0];
            my $items = scalar @host;

            [   $_,

                # put domain at front; everything else afterwards
                lc(
                    ( $items > 1 ? $host[1] . '_' : '' ) . $host[0]
                    . ( $items < 3
                        ? ''
                        : '_' . join( '_', @host[ 2 .. $items - 1 ] )
                      )
                    . '_' . substr( $_, 0, $scheme_end )
                    . substr( $_, length( join '_', @host ) + $scheme_end )
                )
            ]
        } keys %hist;

    return;
}

# fill %hist: read every URL from the Netscape history database, keyed by
# URL with the (whitespace-squashed) page title as the value; untitled
# pages get "--".
sub fill {
    use Netscape::History;
    use Netscape::HistoryURL;

    my $history
        = Netscape::History->new("/home/parv/.netscape/history.dat");

    while ( defined( my $url = $history->next_url() ) ) {
        ( $hist{$url} = $url->title() ) =~ s/\s+/ /g;
        $hist{$url} = $hist{$url} || "--";
    }

    $history->close();
}

# calculate some statistics of elements' lengths: count, sum, min, max,
# mean, median, variance and standard deviation of the string lengths of
# the elements in the passed array ref.
sub length_stat {

    # store sorted elements
    my @in = sort { length $a <=> length $b } @{ $_[0] };

    # nothing to report (and mean/variance would divide by zero)
    return unless @in;

    my %stat;
    @stat{qw( length_sum count min max median mean var std_dev )}
        = ( 0, scalar @in );

    # convert elements to their lengths; get sum of lengths
    $stat{'length_sum'} += ( $_ = length ) foreach @in;

    @stat{qw( min max mean )} = (
        @in[ 0, -1 ],
        $stat{'length_sum'} / $stat{'count'}
    );

    # median: middle element for an odd count, average of the two middle
    # elements for an even count.  (Parity test is on the element COUNT,
    # not on the midpoint index.)
    {
        my $mid = int( $stat{'count'} / 2 );
        $stat{'median'}
            = ( $stat{'count'} % 2 != 0 )
            ? $in[$mid]
            : ( $in[$mid] + $in[ $mid - 1 ] ) / 2;
    }

    # variance, thus std. deviation
    {
        my $sum_of_diffsq = 0;
        foreach (@in) {
            my $diff = $_ - $stat{'mean'};
            $sum_of_diffsq += ( $diff * $diff );
        }
        $stat{'std_dev'}
            = sqrt( $stat{'var'} = $sum_of_diffsq / $stat{'count'} );
    }

    # print stats
    {
        my $fmt = "%0.3f";
        print <<"_STATS_";

URL Length Statistics:

      Total URLs:  $stat{'count'}
    Lengths' Sum:  $stat{'length_sum'}
             Min:  $stat{'min'}
             Max:  $stat{'max'}
            Mean:  @{[sprintf $fmt, $stat{'mean'}]}
          Median:  @{[sprintf $fmt, $stat{'median'}]}
  Std. Deviation:  @{[sprintf $fmt, $stat{'std_dev'}]}
        Variance:  @{[sprintf $fmt, $stat{'var'}]}
_STATS_
    }

    return;
}