#!/usr/local/bin/perl -w
#
# Benchmark three ways of sorting URLs by their reversed host/domain
# components (so "http://rand.web.poo.koo/x" sorts under "poo_koo_web_rand"):
#
#   GRT     - Guttman-Rosler Transform, substr()/index() flavour
#   GRT UK  - Guttman-Rosler Transform, regex flavour
#   ST      - Schwartzian Transform
#
# URLs are read from a Netscape browser history file; some statistics
# about the URL lengths are printed before the benchmark runs.

use strict;
use warnings;

#use sort '_quicksort';
#use sort '_mergesort';
#use sort 'stable';

use Benchmark qw(cmpthese);

# (global) url & page title hash
my %hist;

# fill %hist
fill();

# print some url statistics
length_stat( [ keys %hist ] );

#printf "\nCurrent sort: %s\n" , sort::current();

# Run each sorter repeatedly for at least 5 CPU seconds and compare rates.
cmpthese( -5,
    {   'GRT'    => \&guttman_rosler,
        'GRT UK' => \&guttman_rosler_uk,
        'ST'     => \&schwartz,
    }
);

# Guttman-Rosler Transform: pack the sort key and the original value into
# one string ("key $; url"), sort with the default (fast) string sort, then
# strip the key off again.  Host parsing is done with substr()/index().
sub guttman_rosler {
    #print STDERR +($hist{$_} , "\n", $_ , "\n\n")
    print STDERR +( $_, "\n\n" )
        foreach
        map { substr( $_, index( $_, $; ) + 1 ) }
        sort
        map {
            # extract host/domain components in reverse order
            my $scheme_end = index( $_, '://' ) + 3;
            my @host = reverse split '\.',
                ( split '/', substr( $_, $scheme_end ), 2 )[0];
            my $items = scalar @host;

            # put domain at front, then host parts if any, then everything else
            lc(
                # poo.koo -> poo_koo
                ( $items > 1 ? $host[1] . '_' : '' ) . $host[0]

                # rand.web.poo.koo -> poo_koo_web_rand
                . ( $items < 3
                    ? ''
                    : '_' . join( '_', @host[ 2 .. $items - 1 ] )
                  )

                # http://rand.web.poo.koo -> poo_koo_web_rand_http://
                . '_' . substr( $_, 0, $scheme_end )

                # http://rand.web.poo.koo/blah/blah
                #   -> poo_koo_web_rand_http:///blah/blah
                . substr( $_, length( join '_', @host ) + $scheme_end )
            )
            . $; . $_
        } keys %hist;

    return;
}

# Guttman-Rosler Transform -- uses regex instead of substr()/index()
sub guttman_rosler_uk {
    #print STDERR +($hist{$_} , "\n", $_ , "\n\n")
    print STDERR +( $_, "\n\n" )
        foreach
        map { substr( $_, index( $_, $; ) + 1 ) }
        sort
        map {
            # $1 = scheme (may be absent), $2 = host part up to first '/'
            m!^ (?: (\w+):// )? ([^/]+) !x;
            my @host  = reverse split '\.', $2;
            my $items = scalar @host;

            # put domain at front; everything else afterwards
            lc(
                ( $items > 1 ? $host[1] . '_' : '' ) . $host[0]
                . ( $items < 3
                    ? ''
                    : '_' . join '_', @host[ 2 .. $items - 1 ]
                  )
                . '_' . ( $1 || '' )
                . substr( $_, index( $_, $2 ) + length $2 )
            )
            . $; . $_;
        } keys %hist;

    return;
}

# Schwartzian Transform: wrap each URL in a [ url, key ] pair, sort on the
# key (ties broken by the URL itself), then unwrap.
sub schwartz {
    #print STDERR +($hist{$_} , "\n", $_ , "\n\n")
    print STDERR +( $_, "\n\n" )
        foreach
        map { $_->[0] }
        sort { $a->[1] cmp $b->[1] || $a->[0] cmp $b->[0] }
        map {
            # extract host/domain components in reverse order
            my $scheme_end = index( $_, '://' ) + 3;
            my @host = reverse split '\.',
                ( split '/', substr( $_, $scheme_end ), 2 )[0];
            my $items = scalar @host;

            [   $_,

                # put domain at front; everything else afterwards
                lc(
                    ( $items > 1 ? $host[1] . '_' : '' ) . $host[0]
                    . ( $items < 3
                        ? ''
                        : '_' . join( '_', @host[ 2 .. $items - 1 ] )
                      )
                    . '_' . substr( $_, 0, $scheme_end )
                    . substr( $_, length( join '_', @host ) + $scheme_end )
                )
            ]
        } keys %hist;

    return;
}

# fill %hist: read every URL from the Netscape history database, keyed by
# URL with the (whitespace-squashed) page title as the value; untitled
# pages get "--".
sub fill {
    use Netscape::History;
    use Netscape::HistoryURL;

    my $history
        = Netscape::History->new("/home/parv/.netscape/history.dat");

    while ( defined( my $url = $history->next_url() ) ) {
        ( $hist{$url} = $url->title() ) =~ s/\s+/ /g;
        $hist{$url} = $hist{$url} || "--";
    }

    $history->close();
}

# calculate some statistics of elements' lengths: count, sum, min, max,
# mean, median, variance and standard deviation of the string lengths of
# the elements in the passed array ref.
sub length_stat {

    # store sorted elements
    my @in = sort { length $a <=> length $b } @{ $_[0] };

    # nothing to report (and mean/variance would divide by zero)
    return unless @in;

    my %stat;
    @stat{qw( length_sum count min max median mean var std_dev )}
        = ( 0, scalar @in );

    # convert elements to their lengths; get sum of lengths
    $stat{'length_sum'} += ( $_ = length ) foreach @in;

    @stat{qw( min max mean )} = (
        @in[ 0, -1 ],
        $stat{'length_sum'} / $stat{'count'}
    );

    # median: middle element for an odd count, average of the two middle
    # elements for an even count.  (Parity test is on the element COUNT,
    # not on the midpoint index.)
    {
        my $mid = int( $stat{'count'} / 2 );
        $stat{'median'}
            = ( $stat{'count'} % 2 != 0 )
            ? $in[$mid]
            : ( $in[$mid] + $in[ $mid - 1 ] ) / 2;
    }

    # variance, thus std. deviation
    {
        my $sum_of_diffsq = 0;
        foreach (@in) {
            my $diff = $_ - $stat{'mean'};
            $sum_of_diffsq += ( $diff * $diff );
        }
        $stat{'std_dev'}
            = sqrt( $stat{'var'} = $sum_of_diffsq / $stat{'count'} );
    }

    # print stats
    {
        my $fmt = "%0.3f";
        print <<"_STATS_";

URL Length Statistics:

      Total URLs:  $stat{'count'}
    Lengths' Sum:  $stat{'length_sum'}
             Min:  $stat{'min'}
             Max:  $stat{'max'}
            Mean:  @{[sprintf $fmt, $stat{'mean'}]}
          Median:  @{[sprintf $fmt, $stat{'median'}]}
  Std. Deviation:  @{[sprintf $fmt, $stat{'std_dev'}]}
        Variance:  @{[sprintf $fmt, $stat{'var'}]}
_STATS_
    }

    return;
}