johnwashburn has asked for the wisdom of the Perl Monks concerning the following question:
#!/usr/bin/perl -w
#
# ********************************************
# Fetch one family page, parse it into an HTML::Element tree, and group the
# nodes that follow each section-title <div class="secTitle"> (Husband, Wife,
# Children, Notes) under that section's title in %FamilyMember.
#
# NOTE: children of an HTML::Element are either HTML::Element objects or
# plain strings (text segments).  Every sibling must therefore be tested with
# ref() before a method is called on it; calling ->as_HTML, ->attr or ->right
# on a text segment dies with "Can't call method ... without a package or
# object reference".
use strict;
use warnings;
use Carp;
use Switch;
use Data::Dumper;
use Cwd;
use HTTP::Request;
use HTTP::Request::Common;
use HTTP::Status;
use LWP;
use LWP::UserAgent;
use HTML::TreeBuilder;

my $FamilyPageURL = "http://e-familytree.net/F248/F248347.htm";

my $ua = LWP::UserAgent->new;
croak "Unable to create an LWP::UserAgent" unless defined $ua;
$ua->timeout(5);

# Report a failed fetch instead of silently doing nothing.
my $HTTP_Response = $ua->get($FamilyPageURL);
if ( !$HTTP_Response->is_success ) {
    die "GET $FamilyPageURL failed: " . $HTTP_Response->status_line . "\n";
}
my $HTTP_FamilyPage = $HTTP_Response->content;

# Set up parser to parse this HTML Page
# See: http://search.cpan.org/dist/HTML-Parser/Parser.pm
# See: http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/TreeBuilder.pm
# See: http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/Element.pm
# See: http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/Tree/Scanning.pod
#
# HTML::TreeBuilder is a subclass of HTML::Parser.
# Set the Parser portions to control how the HTML is parsed into a tree
my $PageAsTree = HTML::TreeBuilder->new();
$PageAsTree->backquote(1);
$PageAsTree->empty_element_tags(1);
$PageAsTree->utf8_mode(1);
$PageAsTree->xml_mode(1);
$PageAsTree->warn(1);
$PageAsTree->ignore_elements(qw(script style));

# Parse the HTML into a tree, signal the end of processing and remove the
# ability to parse more
$PageAsTree->parse_content($HTTP_FamilyPage);
$PageAsTree->elementify();
$PageAsTree->normalize_content();

my $HtmlBody = $PageAsTree->look_down( '_tag', 'body' );
die "No <body> element found in $FamilyPageURL\n" unless defined $HtmlBody;

# Traverse and examine the tree for Sections of the page (Husband, Wife,
# Children, Notes).  A qr// criterion only matches elements that actually
# have the attribute, so divs without a class are skipped without warnings.
my @PageSections = $HtmlBody->look_down(
    '_tag'  => 'div',
    'class' => qr/^secTitle$/i,
);

# Transfer the nodes to the right of each section title, up until the next
# section title is found.  The tree is deliberately not deleted here because
# %FamilyMember keeps references to its nodes.
my %FamilyMember;
foreach my $Node (@PageSections) {
    my $SectionTitle = $Node->as_text;

    # List context: every right sibling, elements and text segments alike.
    my @NodesOnRight       = $Node->right;
    my $NumNodesOnTheRight = @NodesOnRight;
    print "\$NumNodesOnTheRight = $NumNodesOnTheRight\n @NodesOnRight\n";

    # The immediate right sibling may be undef or a plain text segment.
    my $RightNode1 = $NodesOnRight[0];
    print "The node to the right of " . $Node->as_HTML . "is ["
        . ( defined $RightNode1 ? $RightNode1 : '' ) . "]";
    print "[" . _node_as_html($RightNode1) . "]";
    print "\n";

    my @SectionNodes;
    SIBLING:
    foreach my $RightNode (@NodesOnRight) {
        last SIBLING if _is_section_title($RightNode);
        push @SectionNodes, $RightNode;
    }
    $FamilyMember{$SectionTitle} = \@SectionNodes;
}

# Render a child node for diagnostics: elements via as_HTML, text segments
# verbatim, and a missing node as the empty string.
sub _node_as_html {
    my ($node) = @_;
    return '' unless defined $node;
    return ref($node) ? $node->as_HTML : $node;
}

# True only for an element node that is a <div> whose class is "secTitle"
# (case-insensitive).  Text segments and class-less elements are never titles.
sub _is_section_title {
    my ($node) = @_;
    return 0 unless ref $node;
    return 0 unless $node->tag eq 'div';
    my $class = $node->attr('class');
    return ( defined $class && $class =~ m/^secTitle$/i ) ? 1 : 0;
}
# Excerpt: step through the right-hand siblings of $Node one at a time,
# stopping when there is no further sibling or when a sibling's class
# attribute matches "secTitle".
for (
    my $RightNode = $Node->right;
    (defined $RightNode) && ($RightNode->attr('class') !~ m/^secTitle$/i);
    $RightNode = $RightNode->right
) {
}
# Debugging excerpt: inspect what ->right returns for a section-title node.
# List context: all right siblings of $Node.
my @NodesOnRight = $Node->right;
my $NumNodesOnTheRight = @NodesOnRight;
print "\$NumNodesOnTheRight = $NumNodesOnTheRight\n @NodesOnRight\n";
# Scalar context: only the immediate right sibling (or undef).
my $RightNode1 = $Node->right;
print "The node to the right of " . $Node->as_HTML . "is [" . $RightNode1 . "]";
# NOTE(review): this call dies when $RightNode1 is not an object reference,
# e.g. when the immediate sibling is a plain text segment.
print "[" . $RightNode1->as_HTML . "]";
print "\n";
$NumNodesOnTheRight = 51 HTML::Element=HASH(0x30f5344) Other Spouses: HTML::Element=HASH(0x30f86d4) HTML::Element=HASH(0x30f8724) Father: HTML::Element=HASH(0x30f87a4) Mother: HTML::Element=HASH(0x30f8834) HTML::Element=HASH(0x30f8894) HTML::Element=HASH(0x30f8904) HTML::Element=HASH(0x30f89f4) Born: Died: Bef 21 Nov 1717 Father: HTML::Element=HASH(0x30f8bb4) Mother: HTML::Element=HASH(0x30f8c44) HTML::Element=HASH(0x30f8ca4) HTML::Element=HASH(0x30f8d14) HTML::Element=HASH(0x30f8e04) Born: Bet 1695 and 1723 Died: Bet 1748 and 1808 Wife: HTML::Element=HASH(0x30f8f44) HTML::Element=HASH(0x30f8fa4) HTML::Element=HASH(0x30f9084) Born: Abt 1702 Died: 14 Oct 1783 at Bridgewater, Plymouth, MA Husband: HTML::Element=HASH(0x30f91c4) HTML::Element=HASH(0x30f9224) HTML::Element=HASH(0x30f9304) Born: Abt 1704 Died: HTML::Element=HASH(0x30f93a4) HTML::Element=HASH(0x30f9484) Born: Bef 1710 Died: 1793 at Marlborough, MA Husband: HTML::Element=HASH(0x30fcfc4) HTML::Element=HASH(0x30fd024) HTML::Element=HASH(0x30fd104) Born: Abt 1710 Died: Husband: HTML::Element=HASH(0x30fd234) HTML::Element=HASH(0x30fd294) HTML::Element=HASH(0x30fd304) The node to the right of <div class="secTitle">HUSBAND</div> is [ ] Can't call method "as_HTML" without a package or object reference at TraverseTree.pl line 75.
In scalar context: returns the node that's the immediate right sibling of $h. If $h is the rightmost (or only) child of its parent (or has no parent), then this returns undef.
In list context: returns all the nodes that're the right siblings of $h, starting with the leftmost. If $h is the rightmost (or only) child of its parent (or has no parent), then this returns empty-list.
# Excerpt: two successive scalar-context ->right calls, each expected to
# yield the immediate right sibling of the node it is called on.
my $RightNode     = $Node->right;
my $NextRightNode = $RightNode->right;
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: Traversing an HTMLTree with HTML:Element ->right
by wfsp (Abbot) on Jun 30, 2009 at 09:04 UTC | |
by wfsp (Abbot) on Jun 30, 2009 at 09:36 UTC | |
by Anonymous Monk on Jun 30, 2009 at 13:24 UTC | |
|
Re: Traversing an HTMLTree with HTML:Element ->right
by wfsp (Abbot) on Jun 30, 2009 at 14:08 UTC | |
|
Re: Traversing an HTMLTree with HTML:Element ->right
by johnwashburn (Sexton) on Jul 17, 2009 at 11:19 UTC |