in reply to Parse HTML page for links and count by author

You can use the same basic routine you have for speakers to extract the other data; just filter the links accordingly with some extra logic.
#!/usr/local/bin/perl
use strict;
use warnings;
use HTML::TreeBuilder;

# Demonstration: a single look_down() pass over the <a> elements can tell
# speaker links apart from session/tutorial links by inspecting the tag of
# each link's parent element.

# Slurp the sample HTML from the DATA handle.
# NOTE(review): the __DATA__ section itself is not shown in this post; the
# page markup is expected to follow the code there.
my $html = do { local $/; <DATA> };
die "No HTML found in the __DATA__ section\n"
    unless defined $html && length $html;

my $tree = HTML::TreeBuilder->new;
$tree->parse($html);

# parse() only feeds a chunk of text to the parser; eof() has to be called
# afterwards, before the tree is inspected.
$tree->eof;

# Keep the list assignment: look_down() returns right after the first
# accepted element unless it is called in list context, and the callback
# must be run for every link.
my @nodes = $tree->look_down( _tag => 'a', \&a_tag );

# Release the tree explicitly once we are done with it.
$tree->delete;

# a_tag - look_down() criterion, called once for each <a> element.
#
# Parameters: $element - the <a> element being examined.
# Returns:    1 when the link was recognised (and reported), 0 otherwise.
#
# A link whose parent is a <span> is reported as a speaker. A link whose
# parent is a <b> element containing the word "Session" or "Tutorial" is
# reported under that word.
sub a_tag {
    my ($element) = @_;

    my $parent     = $element->parent;
    my $parent_tag = $parent->tag;
    my $text       = $element->as_text;

    if ( $parent_tag eq 'span' ) {
        print "Speaker = $text\n";

        # set current author
        return 1;
    }

    if ( $parent_tag eq 'b' && $parent->as_text =~ /(Session|Tutorial)/ ) {
        print "$1 = $text\n";

        # add record to current author
        return 1;
    }

    return 0;
}
poj

Replies are listed 'Best First'.
Re^2: Parse HTML page for links and count by author
by Anonymous Monk on Jun 08, 2014 at 03:15 UTC

    Many thanks, poj. That got me across the line. The final version of my program appears below.

    #!/usr/local/bin/perl
    use strict;
    use warnings;
    use lib "$ENV{HOME}/mylib/lib/perl5";
    use HTML::TreeBuilder;
    use LWP::Simple;

    # Program Name: top_speakers.pl
    # Author: XXXXX
    # Purpose: Parses the page
    #   http://perlcourse.ecorp.net/conf-mirror/conferences.oreillynet.com/speakers.html
    #   and finds the speakers who had the most sessions and/or tutorials.
    #   Original code only found sessions or tutorials; adjusted code to find
    #   Sessions, Tutorials, BOFs & Panels to match the expected output per
    #   the project specification.

    # Debugging switch - set to a positive integer to enable
    my $DEBUG_FLAG = 0;

    # Number of top speakers to display
    my $TOP_N = 3;

    # URL of the page we will parse
    my $URL = 'http://perlcourse.ecorp.net/conf-mirror/conferences.oreillynet.com/speakers.html';

    # Fetch the page; LWP::Simple::get() returns undef when the request fails
    my $html = get($URL);
    die "Could not fetch $URL\n" unless defined $html;

    # Build our tree using HTML::TreeBuilder and parse the document.
    # parse() only feeds text to the parser; eof() has to be called afterwards,
    # before the tree is inspected.
    my $tree = HTML::TreeBuilder->new;
    $tree->parse($html);
    $tree->eof;

    # Hash that will contain speaker names and their count
    my %speakers;

    # Current speaker - maintained by the find_speakers subroutine
    my $current_speaker;

    # Keep the list assignment: look_down() returns right after the first
    # accepted element unless it is called in list context, and find_speakers
    # must be run for every link.
    my @nodes = $tree->look_down( _tag => 'a', \&find_speakers );

    # If in debug mode, print the list of speakers and their totals
    if ($DEBUG_FLAG) {
        foreach my $name ( sort keys %speakers ) {
            print "$name = ($speakers{$name})\n";
        }
    }

    # Print the speakers in descending order of their totals (highest to
    # lowest), stopping once $TOP_N speakers have been displayed.
    # Technically if there are speakers with the same amount of speaking
    # engagements they should be weighted equally (equal third etc) but
    # this was not in the project requirements.
    my $counter = 0;
    foreach my $key ( sort hashValueDescending ( keys(%speakers) ) ) {
        last if $counter >= $TOP_N;
        print "$key\t($speakers{$key})\n";
        $counter++;
    }

    # Delete tree object to free up the memory (Best practice)
    $tree->delete;

    # find_speakers subroutine - look_down() criterion, called once for each
    # <a> element. Finds speakers, adds their name to the %speakers hash,
    # then looks for Sessions, Tutorials, BOFs or Panels that the speaker is
    # presenting and adds those to the total for each speaker.
    #
    # Parameters: $element - the <a> element being examined.
    # Returns:    1 when the link was a speaker or a counted engagement,
    #             0 otherwise.
    # Side effects: updates %speakers and $current_speaker.
    sub find_speakers {
        my ($element) = @_;

        my $parent     = $element->parent;
        my $parent_tag = $parent->tag;
        my $text       = $element->as_text;

        # A link directly inside a 'span' was consistent for delineating the
        # speakers throughout the document
        if ( $parent_tag eq 'span' ) {
            print "Speaker = $text\n" if $DEBUG_FLAG;

            # Start the speaker at zero, but keep an existing total if the
            # same speaker link appears more than once
            $speakers{$text} = 0 unless exists $speakers{$text};

            # set current speaker
            $current_speaker = $text;
            return 1;
        }

        # A link inside a bold element whose text matches one of our
        # criteria - Session, Tutorial, BOF or Panel
        if ( $parent_tag eq 'b'
            && $parent->as_text =~ /(Session|Tutorial|BOF|Panel)/ )
        {
            print "$1 = $text\n" if $DEBUG_FLAG;

            # An engagement seen before any speaker has nobody to be
            # credited to, so it is not counted
            return 0 unless defined $current_speaker;

            # add record to current speaker
            $speakers{$current_speaker}++;
            return 1;
        }

        return 0;
    }

    # hashValueDescending subroutine - sort comparator that orders the keys of
    # %speakers in descending numerical order of their totals, from highest
    # down to lowest. Equal totals are ordered by name so that the output is
    # the same on every run.
    sub hashValueDescending {
        return $speakers{$b} <=> $speakers{$a} || $a cmp $b;
    }