#!/usr/bin/perl -w
#
# ********************************************
# Fetch one family page, parse it into an HTML::Element tree, and group
# the nodes that follow each "secTitle" heading (Husband, Wife, Children,
# Notes) under that heading's title in %FamilyMember.
# ********************************************
use strict;
use Carp;
use Switch;
use Data::Dumper;
use Cwd;
use HTTP::Request;
use HTTP::Request::Common;
use HTTP::Status;
use LWP;
use LWP::UserAgent;
use HTML::TreeBuilder;

my $FamilyPageURL = "http://e-familytree.net/F248/F248347.htm";

# HasSectionTitleClass($Node)
#   Returns 1 when $Node is an element whose class attribute is "secTitle"
#   (case-insensitive), 0 otherwise.  Text segments are stored in the tree
#   as plain strings, not objects, so anything that is not a reference is
#   rejected before a method is called on it.  An element without a class
#   attribute is simply "not a title" rather than a source of
#   uninitialized-value warnings.
sub HasSectionTitleClass {
    my ($Node) = @_;
    return 0 unless ref($Node);
    my $Class = $Node->attr('class');
    return ( defined($Class) && $Class =~ m/^secTitle$/i ) ? 1 : 0;
}

# DescribeNode($Node)
#   Returns a printable form of a tree node for the diagnostic output:
#   the HTML of an element, the text of a text segment, or the empty
#   string when there is no node at all.
sub DescribeNode {
    my ($Node) = @_;
    return ''             unless defined $Node;
    return $Node->as_HTML if ref($Node);
    return $Node;
}

# CollectSectionNodes($TitleNode)
#   Returns a reference to the list of right-hand siblings of $TitleNode,
#   nearest first, up to (but not including) the next node carrying the
#   secTitle class.  Both elements and text segments are collected.
sub CollectSectionNodes {
    my ($TitleNode) = @_;
    my @SectionNodes = ();

    # List context: right() yields every right sibling, so the walk never
    # has to call ->right on a text segment (which is not an object).
    foreach my $Sibling ( $TitleNode->right ) {
        last if HasSectionTitleClass($Sibling);
        push @SectionNodes, $Sibling;
    }
    return \@SectionNodes;
}

my $ua = LWP::UserAgent->new;
$ua->timeout(5);

my $HTTP_Response = $ua->get($FamilyPageURL);

# Report a failed fetch instead of silently doing nothing.
unless ( $HTTP_Response->is_success ) {
    die "Could not fetch $FamilyPageURL: " . $HTTP_Response->status_line . "\n";
}
my $HTTP_FamilyPage = $HTTP_Response->content;

# Set up parser to parse this HTML Page
# See: http://search.cpan.org/dist/HTML-Parser/Parser.pm
# See: http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/TreeBuilder.pm
# See: http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/Element.pm
# See: http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/Tree/Scanning.pod
#
# HTML::TreeBuilder is a subclass of HTML::Parser.
# Set the Parser portions to control how the HTML is parsed into a tree
# NOTE(review): xml_mode also switches on case-sensitive tag and attribute
# names in HTML::Parser -- confirm the target pages use lower-case markup,
# otherwise the 'div' / 'class' lookups below will not match.
my $PageAsTree = HTML::TreeBuilder->new();
$PageAsTree->backquote(1);
$PageAsTree->empty_element_tags(1);
$PageAsTree->utf8_mode(1);
$PageAsTree->xml_mode(1);
$PageAsTree->warn(1);
$PageAsTree->ignore_elements(qw(script style));

# Parse the HTML into a tree (parse_content also signals end of input),
# then turn the builder into a plain element so it can no longer parse,
# and tidy up its text segments.
$PageAsTree->parse_content($HTTP_FamilyPage);
$PageAsTree->elementify();
$PageAsTree->normalize_content();

my $HtmlBody = $PageAsTree->look_down( '_tag', 'body' );
die "No <body> element found in $FamilyPageURL\n" unless defined $HtmlBody;

# Traverse and examine the tree for Sections of the page
# (Husband, Wife, Children, Notes): every <div class="secTitle">.
my @PageSections = $HtmlBody->look_down(
    sub {
        return ( $_[0]->tag() eq 'div' and HasSectionTitleClass( $_[0] ) );
    }
);

# Transfer the HTML to the right of each section-title node, up until the
# next Section Title is found, into %FamilyMember keyed by section title.
my %FamilyMember;
foreach my $Node (@PageSections) {

    # The title is the first content item of the heading.  It may be a
    # text segment or a nested element; an empty heading has no title
    # and is skipped.
    my ($FirstChild) = $Node->content_list;
    unless ( defined $FirstChild ) {
        warn "Skipping section heading with no content: " . $Node->as_HTML . "\n";
        next;
    }
    my $SectionTitle = ref($FirstChild) ? $FirstChild->as_text : $FirstChild;

    # Diagnostic output describing what sits to the right of the heading.
    my @NodesOnRight       = $Node->right;
    my $NumNodesOnTheRight = @NodesOnRight;
    print "\$NumNodesOnTheRight = $NumNodesOnTheRight\n @NodesOnRight\n";

    # Scalar context: only the immediate right sibling (undef if none).
    my $RightNode1 = $Node->right;
    print "The node to the right of " . $Node->as_HTML . "is ["
        . ( defined $RightNode1 ? $RightNode1 : '' ) . "]";
    print "[" . DescribeNode($RightNode1) . "]";
    print "\n";

    $FamilyMember{$SectionTitle} = CollectSectionNodes($Node);
}