#!/usr/bin/perl
# webtimeload.pl — measure how long a web page (plus any frames and the
# content it links to) takes to download, and report size / time /
# throughput statistics, with broken-link detection.
#
# Usage example:  webtimeload.pl -u www.perlmonks.org
use strict;
use warnings;

use LWP::UserAgent;
use Time::HiRes qw(gettimeofday tv_interval);
use Getopt::Long;
use HTML::Parse;
use URI;    # used explicitly below; previously loaded only indirectly via LWP

$|++;       # unbuffer STDOUT so progress appears immediately

# command-line options
my ( @url, $count, $sleep, $timeout, $showhead, $protocol, $verbosity );

# other globals shared by the subs below
my ( $ssep, $bsep, %stat );
$bsep = ( '=' x 68 ) . "\n";    # big separator
$ssep = ( '-' x 68 ) . "\n";    # small separator

my $ua = LWP::UserAgent->new;

# Modulino guard: run the driver only when executed directly, so the file
# can be require-d (e.g. by tests) without fetching anything.
run() unless caller;

# Parse options, apply defaults, and fetch/report every requested URL
# $count times.  This is the original top-level driver code, unchanged in
# behavior, moved into a sub.
sub run {

    # NOTE: the original option spec used the invalid Getopt::Long type
    # '=n' for sleep/timeout/verbosity, which makes GetOptions croak with
    # "Error in option spec"; '=i' (integer) is what was intended.
    helpme()
        if (
        !GetOptions(
            'count=s'     => \$count,
            'url=s'       => \@url,
            'sleep=i'     => \$sleep,
            'timeout=i'   => \$timeout,
            'header'      => \$showhead,
            'protocol=s'  => \$protocol,
            'verbosity=i' => \$verbosity,
        )
        or !$url[0]
        );

    # default values
    $count   ||= 1;
    $sleep   ||= 1;
    $timeout ||= 15;
    $verbosity = defined $verbosity ? $verbosity : 1;
    $protocol ||= 'http';
    $protocol =~ s![:/]+!!g;    # accept '-protocol http://' style values too

    $ua->agent("libwww-perl/5.10.1");
    $ua->timeout($timeout);

    foreach my $repetition ( 1 .. $count ) {
        foreach my $u (@url) {

            # \Q protects against regex metacharacters in the protocol
            $u = $protocol . '://' . $u unless $u =~ m/^\Q$protocol\E/;
            $u =~ s{/\z}{};     # remove an eventual trailing slash
            $u = lc $u;

            # per-URL statistics, reset for every page measured
            %stat = (
                sbody      => 0,       # bytes: body skeleton of each frame/page
                sintlink   => 0,       # bytes: internally linked content
                sextlink   => 0,       # bytes: externally linked content
                tbody      => 0,       # seconds: frames/pages
                tintlink   => 0,       # seconds: internal content
                textlink   => 0,       # seconds: external content
                cintlink   => 0,       # count of internal links
                cextlink   => 0,       # count of external links
                brokenlink => [],      # URLs that failed to load
                mainurl    => $u,      # URL given on the command line
                pages      => [$u],    # main page plus any frames found
                cache      => {},      # per-resource [bytes, seconds]
                respcode   => undef,   # response metadata (main page only)
                respmess   => undef,
                resptitle  => undef,
                respserver => undef,
                resplength => undef,
                resphead   => undef,
            );

            # get_page() pushes frame URLs onto @{$stat{pages}} while we
            # iterate, so this foreach deliberately walks a growing list
            foreach my $page ( @{ $stat{pages} } ) { get_page($page) }

            foreach my $content_link ( keys %{ $stat{cache} } ) {
                time_link($content_link);
            }
            report();
        }
        sleep($sleep) unless $repetition == $count;
    }
}

################################################################################
# Fetch one page (the main URL or a frame), accumulate its timing/size into
# %stat, and collect the links it contains: frames are queued as further
# pages to fetch, everything else goes into the content cache for
# time_link() to measure.
sub get_page {
    my $urltoget = shift;

    my $t0   = [gettimeofday];
    my $resp = $ua->get($urltoget);
    $stat{tbody} += tv_interval( $t0, [gettimeofday] );

    # declared Content-Length, summed over EACH frame/page loaded
    $stat{resplength} += ( $resp->content_length || 0 );

    # real bytes obtained, summed over EACH frame/page loaded
    # ('|| ""' guards length() against an undef body on empty responses)
    $stat{sbody} += length( $resp->content || '' );

    # keep only the first response's metadata, i.e. the main page's
    # (not overwritten by any frame loaded afterwards)
    $stat{respcode}   = $stat{respcode}   || $resp->code;
    $stat{respmess}   = $stat{respmess}   || $resp->message;
    $stat{resptitle}  = $stat{resptitle}  || $resp->title;
    $stat{respserver} = $stat{respserver} || $resp->server;
    $stat{resphead}   = $stat{resphead}   || $resp->headers_as_string;

    # now parse the HTML
    my $parsed_html = parse_html( $resp->content );
    foreach my $link_found (
        @{ $parsed_html->extract_links(qw(body img src frame)) } )
    {
        next if $$link_found[0] =~ /#/;    # brutally skip anchors

        my $uriobj = URI->new( $$link_found[0] );
        my $absurl = $uriobj->abs($urltoget);    # resolve relative links

        if ( $$link_found[3] eq 'frame' ) {
            # a frame: queue it as another page to fetch; "$absurl"
            # stringifies the URI object
            push @{ $stat{pages} }, "$absurl";
            next;
        }

        # any other linked content: remember it, time_link() will store
        # [bytes, seconds] there later on
        $stat{cache}{$absurl} = [];
    }

    # HTML::Parse trees are self-referential and leak unless freed
    $parsed_html->delete;
}

################################################################################
# Download one linked resource, record its [bytes, seconds] in the cache,
# and classify it as internal or external to the main URL's host.  Failed
# downloads are recorded as broken links.
sub time_link {
    my $url = shift;

    my $t0       = [gettimeofday];
    my $resp     = $ua->get($url);
    my $ttaken   = tv_interval( $t0, [gettimeofday] );
    my $bytesrec = length( $resp->content || '' );

    if ( $resp->is_success() ) {
        @{ $stat{cache}{$url} } = ( $bytesrec, $ttaken );

        # official regex from http://search.cpan.org/~gaas/URI-1.58/URI.pm
        my ( $scheme, $domain, $path, $query, $fragment ) = $url =~
            m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;

        # tell internal from external content.  \Q...\E quotes regex
        # metacharacters in the host; an undef host (e.g. relative or
        # schemeless URL) counts as external — the original's bare
        # /$domain/ would silently reuse the previous pattern here.
        if ( defined $domain and $stat{mainurl} =~ /\Q$domain\E/ ) {
            $stat{sintlink} += $bytesrec;
            $stat{tintlink} += $ttaken;
            $stat{cintlink}++;
        }
        else {
            $stat{sextlink} += $bytesrec;
            $stat{textlink} += $ttaken;
            $stat{cextlink}++;
        }
    }
    else {
        push @{ $stat{brokenlink} }, $url;
    }
}

################################################################################
# Format a byte count as a human-readable string, e.g. "1.5000 Kb".
# ("Arrotonda" is Italian for "rounds off".)  Undef or non-positive sizes
# yield "0 bytes".
sub Arrotonda_Mega {
    my ( $size, $n ) = ( shift, 0 );
    return "0 bytes" unless defined $size;
    return "0 bytes" unless $size > 0;
    ++$n and $size /= 1024 until $size < 1024;
    return sprintf "%.4f %s", $size, (qw[ bytes Kb Mb Gb ])[$n];
}

################################################################################
# Throughput helper: bytes/seconds rendered via Arrotonda_Mega.  Guards
# against "Illegal division by zero" when the elapsed time is 0 (e.g. an
# instantaneous or fully cached response), which crashed the original.
sub rate_of {
    my ( $bytes, $seconds ) = @_;
    return Arrotonda_Mega( $seconds > 0 ? $bytes / $seconds : 0 );
}

################################################################################
# Print the per-URL report at the configured verbosity:
#   0  — one tab-separated line (timestamp, url, code, bytes, seconds, rate)
#   1  — short boxed summary (default)
#   >1 — full breakdown (internal/external content, broken links, …)
# Any other value falls back to 1.  With -header, dumps the raw response
# headers afterwards.
sub report {
    my $totsize = $stat{'sbody'} + $stat{'sintlink'} + $stat{'sextlink'};
    my $tottime = $stat{'tbody'} + $stat{'tintlink'} + $stat{'textlink'};

    ####################
    if ( $verbosity == 0 ) {
        print scalar localtime(time), "\t", $stat{mainurl}, "\t",
            $stat{respcode}, "\t", $totsize, "\t", $tottime, "\t",
            rate_of( $totsize, $tottime ), "/sec", "\n";
    }
    ####################
    elsif ( $verbosity == 1 ) {
        print $bsep, $stat{mainurl}, "\t", scalar localtime(time), "\n",
            $ssep,
            $stat{respcode}, "\t", $stat{respmess} || 'UNDEF', "\t",
            $stat{resptitle} || 'UNDEF', "\n", $ssep,
            "downloaded ", Arrotonda_Mega($totsize),
            " ($totsize bytes) in ", $tottime, " seconds (",
            rate_of( $totsize, $tottime ), "/s)\n", $bsep;
    }
    ####################
    elsif ( $verbosity > 1 ) {
        print $bsep, $stat{mainurl}, "\t", scalar localtime(time), "\n",
            $ssep,
            "Response code: ",            $stat{respcode}   || 'UNDEF', "\n",
            "Response message: ",         $stat{respmess}   || 'UNDEF', "\n",
            "Response server: ",          $stat{respserver} || 'UNDEF', "\n",
            "Response declared length: ", $stat{resplength} || 'UNDEF', "\n",
            "Response title: ",           $stat{resptitle}  || 'UNDEF', "\n",
            $ssep,
            "main page content (", scalar @{ $stat{pages} }, "):\t",
            Arrotonda_Mega( $stat{sbody} ), " in ", $stat{tbody},
            " seconds \@ ", rate_of( $stat{sbody}, $stat{tbody} ), "/s\n";

        if ( $verbosity > 2 ) {
            print $ssep, "\tdetail of loaded pages (url):\n", $ssep,
                "\t", join( "\n\t", @{ $stat{pages} } ), "\n", $ssep;
        }

        ### report about extra downloaded content local to $url
        if ( $stat{cintlink} > 0 ) {
            print "included content ($stat{cintlink}):\t",
                Arrotonda_Mega( $stat{sintlink} ), " in ", $stat{tintlink},
                " seconds \@ ",
                rate_of( $stat{sintlink}, $stat{tintlink} ), "/s\n";
        }
        else { print "no included content found.\n" }

        ### report about extra downloaded content external to $url
        if ( $stat{cextlink} > 0 ) {
            print "external content ($stat{cextlink}):\t",
                Arrotonda_Mega( $stat{sextlink} ), " in ", $stat{textlink},
                " seconds \@ ",
                rate_of( $stat{sextlink}, $stat{textlink} ), "/s\n";
        }
        else { print "no external content found.\n" }

        ### report about broken links
        if ( scalar @{ $stat{brokenlink} } > 0 ) {
            print "broken links found:\t", scalar @{ $stat{brokenlink} },
                "\n", $ssep;
            if ( $verbosity > 2 ) {
                # original read "detail of broken kins (url:" — typo fixed
                print "\tdetail of broken links (url):\n", $ssep,
                    ( join( "\n", @{ $stat{brokenlink} } ) ), $ssep;
            }
        }
        else { print "no broken links found.\n", $ssep }

        if ( ( $verbosity > 2 ) && keys %{ $stat{cache} } > 0 ) {
            print "\tdetail of loaded content (url bytes seconds):\n", $ssep,
                ( map { "\t$_ @{$stat{cache}{$_}}\n" }
                    sort keys %{ $stat{cache} } ),
                $ssep;
        }

        ## total
        print "downloaded ", Arrotonda_Mega($totsize),
            " ($totsize bytes) in ", $tottime, " seconds (",
            rate_of( $totsize, $tottime ), "/s)\n", $bsep;
    }
    # verbosity set to something strange: switch to the default and re-run
    else { $verbosity = 1; report() }

    # eventually print headers
    if ($showhead) {
        print "headers received:\n", $ssep, $stat{resphead}, "\n", $bsep;
    }
}

################################################################################
# Print usage and exit.
# NOTE(review): the original heredoc was truncated in this copy of the file
# ("print <"); the text below is reconstructed from the documented options
# and defaults — verify against the original script before relying on it.
sub helpme {
    print <<"END_HELP";
Usage: $0 -url URL [-url URL ...] [options]

Options:
  -url URL         page to load (may be given more than once)
  -count N         number of repetitions (default 1)
  -sleep N         seconds to sleep between repetitions (default 1)
  -timeout N       LWP timeout in seconds (default 15)
  -header          also print the response headers received
  -protocol PROTO  protocol prepended to bare URLs (default http)
  -verbosity N     0 = one line, 1 = short report, >1 = full report
END_HELP
    exit 1;
}