#!/usr/local/bin/perl -w

# Benchmark two ways of sorting the URLs found in a Netscape history file by
# a "domain first" key: the Schwartzian Transform (ST) and the
# Guttman-Rosler Transform (GRT).

use strict;
use warnings;

#use sort '_quicksort';
#use sort '_mergesort';

use Benchmark qw(cmpthese);

use Netscape::History;
use Netscape::HistoryURL;

my $history = Netscape::History->new("/home/parv/.netscape/history.dat");

# URL (as a string) => page title with every run of whitespace collapsed to
# a single space; "--" stands in for a missing or empty title.
my %hist;
while ( defined( my $url = $history->next_url() ) ) {
    my $title = $url->title();
    $title = '' unless defined $title;
    $title =~ s/\s+/ /g;

    # Test the length, not the truth value, so that a title of "0" is kept.
    $hist{"$url"} = length $title ? $title : "--";
}
$history->close();

# print some url statistics
printf "URLs: %u\n min: %u, max: %u\n mean: %0.2f, median: %0.2f\n\n",
    @{ stats() };

cmpthese(
    -10,
    {
        'ST'  => \&schwartz,
        'GRT' => \&guttman_rosler,
    }
);

# Build the lower-cased sort key for one URL: the host name with its domain
# moved to the front, followed by the remainder of the URL.
#
#   poo.koo           ->  poo.koo
#   web.poo.koo:80    ->  poo.koo:80.web
#   rand.web.poo.koo  ->  poo.koo.web.rand
#
# Takes the URL string and returns the key string.  Both transforms call
# this helper so that they sort on exactly the same key and pay exactly the
# same cost to compute it.
sub _sort_key {
    my ($url) = @_;

    # The host starts right after "://"; a URL without that separator is
    # treated as starting with the host instead of losing its first
    # characters.
    my $sep_at     = index( $url, '://' );
    my $host_start = $sep_at < 0 ? 0 : $sep_at + 3;

    # Host is everything up to the first "/"; it may be empty (file:///...).
    my $host = ( split '/', substr( $url, $host_start ), 2 )[0];
    $host = '' unless defined $host;

    # Host components in reverse order: (tld, domain, sub-domains ...).
    my @labels = reverse split /\./, $host;

    my $key = '';
    if (@labels) {
        $key = @labels > 1 ? $labels[1] . '.' . $labels[0] : $labels[0];
        $key .= '.' . join( '.', @labels[ 2 .. $#labels ] ) if @labels > 2;
    }

    # Everything after the host goes at the end of the key unchanged.
    return lc( $key . substr( $url, $host_start + length($host) ) );
}

# Guttman-Rosler Transform: prefix every URL with its sort key and a NUL
# byte, sort the plain strings with the default comparison, then strip the
# prefix.  Prints title and URL of every entry, in sorted order, to STDERR.
sub guttman_rosler {
    my @sorted =
        # extract the complete URL from each "key NUL url" string
        map { ( split /\x00/, $_, 2 )[1] }
        sort
        # set up for extracting the complete URL & sort keys
        map { _sort_key($_) . "\x00" . $_ }
        keys %hist;

    print STDERR $hist{$_}, "\n", $_, "\n\n" foreach @sorted;

    return;
}

# Schwartzian Transform: pair every URL with its sort key in an anonymous
# array, sort on the key, then pull the URL back out.  Prints title and URL
# of every entry, in sorted order, to STDERR.
sub schwartz {
    my @sorted =
        # extract the sorted complete URL list
        map { $_->[0] }
        # sort on the massaged host name followed by the rest of the URL
        sort { $a->[1] cmp $b->[1] }
        # set up for extracting the complete URL & sort keys
        map { [ $_, _sort_key($_) ] }
        keys %hist;

    print STDERR $hist{$_}, "\n", $_, "\n\n" foreach @sorted;

    return;
}

# Calculate some statistics about the lengths of the URLs in %hist.
# Returns an array reference: [ count, min, max, mean, median ].
# All five values are 0 when there are no URLs.
sub stats {
    my @len    = sort { $a <=> $b } map { length $_ } keys %hist;
    my $number = scalar @len;

    # Nothing to measure; also avoids dividing by zero below.
    return [ 0, 0, 0, 0, 0 ] unless $number;

    my $total = 0;
    $total += $_ foreach @len;

    # Odd count: the single middle element.  Even count: the mean of the
    # two elements on either side of the middle.
    my $mid_pt = int( $number / 2 );
    my $median =
        $number % 2
        ? $len[$mid_pt]
        : ( $len[ $mid_pt - 1 ] + $len[$mid_pt] ) / 2;

    return [ $number, $len[0], $len[-1], $total / $number, $median ];
}