#!/usr/bin/perl -w
#
# ********************************************
use strict;
use Carp;
use Switch;
use Data::Dumper;
use Cwd;
use HTTP::Request;
use HTTP::Request::Common;
use HTTP::Status;
use LWP;
use LWP::UserAgent;
use HTML::TreeBuilder;
my $FamilyPageURL = "http://e-familytree.net/F248/F248347.htm";
my $ua = LWP::UserAgent->new;
if (defined $ua)
{
$ua->timeout(5);
my $HTTP_Response = $ua->get($FamilyPageURL);
my $HTTP_Status = $HTTP_Response->message ;
if ($HTTP_Response->is_success)
{
my $HTTP_FamilyPage = $HTTP_Response->content;
#Set up parser to parse this HTML Page
# See: http://search.cpan.org/dist/HTML-Parser/Parser.pm
# See: http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/TreeBuilder.pm
# See: http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/Element.pm
# See: http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/Tree/Scanning.pod
#
# HTML::TreeBuilder is a subclass of HTML::Parser.
# Set the Parser portions to control how the HTML is parsed into a tree
my $PageAsTree = HTML::TreeBuilder->new();
$PageAsTree->backquote( 1) ;
$PageAsTree->empty_element_tags( 1 ) ;
$PageAsTree->utf8_mode( 1);
$PageAsTree->xml_mode( 1);
$PageAsTree->warn( 1);
$PageAsTree->ignore_elements(qw(script style));
# Parse the HTML is parsed into a tree, signal the end of processin and remove the ability to parse more
$PageAsTree->parse_content($HTTP_FamilyPage);
$PageAsTree->elementify() ;
$PageAsTree->normalize_content();
my $HtmlHead = $PageAsTree->look_down('_tag', 'head');
my $HtmlBody = $PageAsTree->look_down('_tag', 'body');
# Traverse and examine the tree for Sections of the page (Husband, Wife, Children, Notes)
my @PageSections = $HtmlBody->look_down( sub {
return (($_[0]->tag() eq 'div' )
and ($_[0]->attr('class') =~ m/^secTitle$/i));
}
); # searches for Husband, Wife and Children Sections.
# transfer the HTML to the right of this node of HTML up until the next Section Title is found
my %FamilyMember;
foreach my $Node (@PageSections)
{
my $SectionTitle = $Node->content->[0];
my @SectionNodes = ();
my @NodesOnRight = $Node->right;
my $NumNodesOnTheRight = @NodesOnRight;
print "\$NumNodesOnTheRight = $NumNodesOnTheRight\n @NodesOnRight\n";
my $RightNode1 = $Node->right;
print "The node to the right of " . $Node->as_HTML . "is [" . $RightNode1 . "]";
print "[" . $RightNode1->as_HTML . "]";
print "\n";
for( my $RightNode = $Node->right;
(defined $RightNode) && ($RightNode->attr('class') !~ m/^secTitle$/i);
$RightNode = $RightNode->right)
{
push @SectionNodes, $RightNode;
}
$FamilyMember{$SectionTitle} = \@SectionNodes;
}
}
}
####
for( my $RightNode = $Node->right;
(defined $RightNode) &&
($RightNode->attr('class') !~ m/^secTitle$/i);
$RightNode = $RightNode->right)
{
}
####
my @NodesOnRight = $Node->right;
my $NumNodesOnTheRight = @NodesOnRight;
print "\$NumNodesOnTheRight = $NumNodesOnTheRight\n @NodesOnRight\n";
my $RightNode1 = $Node->right;
print "The node to the right of " . $Node->as_HTML . "is [" . $RightNode1 . "]";
print "[" . $RightNode1->as_HTML . "]";
print "\n";
####
$NumNodesOnTheRight = 51
HTML::Element=HASH(0x30f5344)
Other Spouses: HTML::Element=HASH(0x30f86d4) HTML::Element=HASH(0x30f8724)
Father: HTML::Element=HASH(0x30f87a4)
Mother: HTML::Element=HASH(0x30f8834)
HTML::Element=HASH(0x30f8894) HTML::Element=HASH(0x30f8904)
HTML::Element=HASH(0x30f89f4)
Born:
Died: Bef 21 Nov 1717
Father: HTML::Element=HASH(0x30f8bb4)
Mother: HTML::Element=HASH(0x30f8c44)
HTML::Element=HASH(0x30f8ca4) HTML::Element=HASH(0x30f8d14)
HTML::Element=HASH(0x30f8e04)
Born: Bet 1695 and 1723
Died: Bet 1748 and 1808
Wife: HTML::Element=HASH(0x30f8f44)
HTML::Element=HASH(0x30f8fa4)
HTML::Element=HASH(0x30f9084)
Born: Abt 1702
Died: 14 Oct 1783 at Bridgewater, Plymouth, MA
Husband: HTML::Element=HASH(0x30f91c4)
HTML::Element=HASH(0x30f9224)
HTML::Element=HASH(0x30f9304)
Born: Abt 1704
Died:
HTML::Element=HASH(0x30f93a4)
HTML::Element=HASH(0x30f9484)
Born: Bef 1710
Died: 1793 at Marlborough, MA
Husband: HTML::Element=HASH(0x30fcfc4)
HTML::Element=HASH(0x30fd024)
HTML::Element=HASH(0x30fd104)
Born: Abt 1710
Died:
Husband: HTML::Element=HASH(0x30fd234)
HTML::Element=HASH(0x30fd294) HTML::Element=HASH(0x30fd304)
The node to the right of HUSBAND
is [
]
Can't call method "as_HTML" without a package or object reference at TraverseTree.pl line 75.
####
my $RightNode = $Node->right;
my $NextRightNode = $RightNode->right;