#!/usr/local/bin/perl
#
# Fetches an HTML "speakers" page, collects the speaker names found in anchor
# tags, and (when debugging is enabled) prints them.  Counting the Session and
# Tutorial links for each speaker is still work in progress; the experimental
# code for it is kept below, commented out.

use strict;
use warnings;
use lib "$ENV{HOME}/mylib/lib/perl5";

use HTML::TableParser;
use WWW::Mechanize;
use HTML::TreeBuilder;
use LWP::Simple;

# Debugging switch - set to a positive integer to enable, 0 to disable
my $DEBUG_FLAG = 1;

# URL of the document we will parse
my $URL = 'Path to URL speakers.html';

# Fetch the document.  LWP::Simple::get returns undef on failure, so stop
# with a clear message instead of handing undef to the parser.
my $html = get($URL);
die "Unable to fetch '$URL'\n" unless defined $html;

# Build our tree using HTML::TreeBuilder and parse the document.
# eof() must be called after parse() so the tree is finalised.
my $tree = HTML::TreeBuilder->new;
$tree->parse($html);
$tree->eof;

# Hash that will contain speaker names and their count
my %speakers;

# Look for the elements (speakers) we are searching for, based on the
# anchor "a" tag and the find_speakers filter
my @elements = $tree->look_down( _tag => "a", \&find_speakers );

# Populate our speaker hash and initialize each value to 0
for my $element (@elements) {
    $speakers{ $element->as_text } = 0;
}

# Print the list of speakers if debug mode is enabled.  The flag is tested
# for a positive value (not merely for being defined) so that 0 disables it.
if ( $DEBUG_FLAG && $DEBUG_FLAG > 0 ) {
    foreach ( sort keys %speakers ) {
        print "$_\n";
    }
}

# Loop through each speaker - the goal here is to eventually count all
# Session and Tutorial links for each speaker
foreach ( keys %speakers ) {
    #check_sessions($_);
    # my $element = $tree->look_down( _tag => "a",
    #     sub { shift->as_text eq $_ } );
    # print $element->as_text() . "\n";
    # my @rightlist = $element->right();
    # print "@rightlist\n";
    # my $count = 0;
    # while ($element->look_down( _tag => "li", \&count_sessions ) )
    # {
    #     $count++;
    # }
    # print "$_ = $count\n";
    #$element->dump();
}

# check_sessions subroutine - experimental; currently only referenced from the
# commented-out code above.
#
# Takes a speaker name.  Finds the first "li" element in the tree and checks
# whether that element (or one of its ancestors) is an "a" tag whose text
# equals the speaker name.  If so, and the "li" text contains the literal
# label "Session:" or "Tutorial:", the text is printed and 1 is returned.
# Returns 0 in every other case, including when no name is given or the
# document has no "li" element.
sub check_sessions {
    #print "@_\n";
    my $speaker = shift;
    return 0 unless defined $speaker;

    # Scalar context: only the first matching "li" is examined
    my $element = $tree->look_down( _tag => "li" );
    return 0 unless defined $element;

    my $parent = $element->look_up(
        _tag => "a",
        sub { shift->as_text eq $speaker }
    );
    return 0 unless defined $parent;

    # Match the literal labels.  (Bracketed character classes here would
    # match any single character from the label instead.)
    my $text = $element->as_text();
    if ( $text =~ /Session:|Tutorial:/ ) {
        print $text . "\n";
        return 1;
    }

    return 0;
}

# find_speakers subroutine finds the 'speakers' within the HTML being parsed:
# the element is accepted when its parent tag is not a list item ("li") and
# it sits within a "span" tag.  Used as the look_down filter for anchor tags.
sub find_speakers {
    my $element = shift;
    my ($parent_tag) = $element->lineage_tag_names;

    # An element without a parent cannot have an "li" parent
    my $parent_is_li = defined $parent_tag && $parent_tag eq 'li';

    # Our parent tag should NOT be a list item and the element should be
    # inside a 'span' tag
    return !$parent_is_li && $element->look_up( _tag => 'span' );
}

# count_sessions subroutine - this was one attempt at trying to get at the
# Session and Tutorial links.  True when the element's parent tag is "ul" and
# its text is exactly "Session" or "Tutorial".
sub count_sessions {
    my $element = shift;
    print "Got to count_sessions\n";
    my ($parent_tag) = $element->lineage_tag_names;

    return
         defined $parent_tag
      && $parent_tag eq 'ul'
      && ( $element->as_text eq "Session" || $element->as_text eq "Tutorial" );
}

# in_list subroutine - not presently used.  True when the element's parent
# tag is a list item ("li") and the element sits within a "span" tag.
sub in_list {
    my $element = shift;
    my ($parent_tag) = $element->lineage_tag_names;

    # Our parent tag should be a list item and the element should be inside
    # a 'span' tag
    return
         defined $parent_tag
      && $parent_tag eq 'li'
      && $element->look_up( _tag => 'span' );
}

# find_top_speakers - placeholder for the subroutine that will find our top 3
# speakers
sub find_top_speakers {
}