in reply to Traversing an HTMLTree with HTML:Element ->right
#!/usr/bin/perl -w # ******************************************** # TraverseTree.pl # ******************************************** use strict; use Carp; use Switch; use Data::Dumper; use Cwd; use HTTP::Request; use HTTP::Request::Common; use HTTP::Status; use LWP; use LWP::UserAgent; use HTML::TreeBuilder; my $URL_Root = "http://e-familytree.net/"; my $FamilyPageURL = "http://e-familytree.net/F248/F248347.htm"; my %FamilyInfo; my $ua = LWP::UserAgent->new; if (defined $ua) { $ua->timeout(5); my $HTTP_Response = $ua->get($FamilyPageURL); my $HTTP_Status = $HTTP_Response->message ; if ($HTTP_Response->is_success) { my $HTTP_FamilyPage = $HTTP_Response->content; #Set up parser to parse this HTML Page # See: http://search.cpan.org/dist/HTML-Parser/Parser.pm # See: http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/TreeBuild +er.pm # See: http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/Element.p +m # See: http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/Tree/Scan +ning.pod # # HTML::TreeBuilder is a subclass of HTML::Parser. # Set the Parser portions to control how the HTML is parsed in +to a tree my $PageAsTree = HTML::TreeBuilder->new(); $PageAsTree->backquote( 1) ; $PageAsTree->empty_element_tags( 1 ) ; $PageAsTree->utf8_mode( 1); $PageAsTree->xml_mode( 1); $PageAsTree->warn( 1); $PageAsTree->ignore_elements(qw(script style)); # Parse HTML into a Tree of HTML::Elements $PageAsTree->parse_content($HTTP_FamilyPage); $PageAsTree->eof; # Simpler reformatting of HTML Tree $PageAsTree->elementify; $PageAsTree->delete_ignorable_whitespace; Normalize_All_Content($PageAsTree); $PageAsTree->objectify_text; my $HtmlHead = $PageAsTree->look_down('_tag', 'head'); my $HtmlBody = $PageAsTree->look_down('_tag', 'body'); print "Tree Found after Initial Tree building.\n"; print "* * * * * * * * * * * * * * * * * * * * * * * * * * * \ +n"; DumpHTMLElement($HtmlBody); print "* * * * * * * * * * * * * * * * * * * * * * * * * * * \ +n"; # Traverse and examine the tree for URL's which are anchored a + location on this page my @AnchorURLs = $HtmlBody->look_down( su +b { + ($_[0]->tag() eq q{a} ) + and ($_[0]->attr(q{href}) =~ m/^\#/) } ); foreach my $URLNode (@AnchorURLs) { $URLNode->delete; } undef @AnchorURLs; # Traverse and examine the tree for relative URL's and convert + them to absolute URL's my @RelativeURLs = $HtmlBody->look_down( su +b { + ($_[0]->tag() eq q{a} ) + and ($_[0]->attr(q{href}) =~ m;\.\.\/;) } ); foreach my $URLNode (@RelativeURLs) { my $OrgHRef = $URLNode->attr(q{href}) ; my $NewHRef = $OrgHRef; $NewHRef =~ s:\.\.\/:$URL_Root:i; $URLNode->attr('href', $NewHRef); } undef @RelativeURLs; # # Restructure the HTML structure of page so it is more to my l +iking. # 1) Everthing to the right of a secTitle until the next secTi +tle is beneath the first/left secTitle # foreach my $PageSectionNode ($HtmlBody->look_down(_tag => q{di +v}, class => q{secTitle})) { my $SectionTitle = trim(join '|', map ($_->attr('text'), $ +PageSectionNode->look_down(_tag => q{~text}))); $PageSectionNode->attr('~GedComSectionTitle', $SectionTitl +e); #Easier to identify this node foreach my $RightNode ($PageSectionNode->right) { # See if we have hit the next section last if (($RightNode->tag eq q{div}) && ($RightNode->attr('class') eq q{secTitle}) ); my $RightNodeHTML = NormalizeNodeText($RightNode); $RightNode->attr('text', $RightNodeHTML) if ($RightNod +e->tag eq q{~text}); $RightNode->detach; $PageSectionNode->push_content($RightNode); print "Restructure the HTMLTree: Rule #1. \n"; } } # # Restructure the HTML structure of page so it is more to my l +iking. # 2) Within a section title everything between two vCards is b +elongs the left vCard. # foreach my $VCard ($HtmlBody->look_down(_tag => q{div}, class +=> q{vcard})) { foreach my $RightNode ($VCard->right) { last if (($RightNode->tag eq q{div}) && ($RightNode->attr('class') eq q{vcard}) ); my $RightNodeHTML = NormalizeNodeText($RightNode); $RightNode->attr('text', $RightNodeHTML) if ($RightNod +e->tag eq q{~text}); $RightNode->detach; $VCard->push_content($RightNode); print "Restructure the HTMLTree: Rule #1. \n"; } } # # Restructure the HTML structure of page so it is more to my l +iking. # 3) Remove the nodes which are empty text and simple line bre +aks # foreach my $PageSectionNode ($HtmlBody->look_down(_tag => q{di +v}, class => q{secTitle})) { foreach my $BRNode ($HtmlBody->look_down(_tag => q{br})) { $BRNode->detach; $BRNode->delete; print "Restructure the HTMLTree: Rule #3a. \n"; } foreach my $TextNode ($HtmlBody->look_down(_tag => q{~text +})) { my $TextNodeHTML = NormalizeNodeText($TextNode); if (!$TextNodeHTML) { $TextNode->detach; $TextNode->delete; print "Restructure the HTMLTree: Rule #3b. \n"; } } } # # Restructure the HTML structure of page so it is more to my l +iking. # 4) For multiline nodes of text which have : in them, split i +nto multiple nodes of ~text # foreach my $PageSectionNode ($HtmlBody->look_down(_tag => q{di +v}, class => q{secTitle})) { my @AllTextNodes = $PageSectionNode->look_down( + sub { + ($_[0]->tag() eq q{~text} ) + and ($_[0]->attr(q{text}) =~ m/:/s) + } ); my $NumTextNodes = @AllTextNodes; foreach my $TextNode (@AllTextNodes) { my $TextNodeHTML = NormalizeNodeText($TextNode); my @AllLinesofText = trim(split /[\n\r]+/s, $TextNodeH +TML); my $NumLines = @AllLinesofText; if (1 < $NumLines ) { my $LineOfText = shift @AllLinesofText; $TextNode->attr(q{text}, $LineOfText); my @NewNodeSequence; foreach $LineOfText (@AllLinesofText) { #print " [$LineOfText].\n") my $NewTextNode = HTML::Element->new(q{~text}) +; $NewTextNode->attr(q{text}, $LineOfText ); push @NewNodeSequence, $NewTextNode; print "Restructure the HTMLTree: Rule #4. \n" +; } if (0 < @NewNodeSequence) { $TextNode->postinsert(@NewNodeSequence); } } if (!$TextNodeHTML) { $TextNode->detach; $TextNode->delete; print "Restructure the HTMLTree: Rule #4b (remove + empties). \n"; } } } # # Restructure the HTML structure of page so it is more to my l +iking. # 5) For nodes of text which have (Name|Mother|Father|wife|Hus +band): in them, move href to the right of them # to be under the text node. foreach my $PageSectionNode ($HtmlBody->look_down(_tag => q{di +v}, class => q{secTitle})) { my @AllTextNodes = $PageSectionNode->look_down( + sub { + ($_[0]->tag() eq q{~text} ) + and ($_[0]->attr(q{text}) =~ m/^(Name|Mother|Father|Wife|Husba +nd):$/i) + } ); my $NumTextNodes = @AllTextNodes; foreach my $TextNode (@AllTextNodes) { my $TextNodeHTML = NormalizeNodeText($TextNode); my $RightNode = $TextNode->right; if ( $RightNode && (ref($RightNode) =~ m/HTML::Element/i)) { my $RightNodeHTML = NormalizeNodeText($RightNode); if (($RightNode->tag =~ q{a}) && ($RightNode->attr(q{href}) =~ m;http://www;i)) { $RightNode->detach; $TextNode->push_content($RightNode); print "Restructure the HTMLTree: Rule #5. \n" +; } } else { print "ref(\$RightNode) = " . ref($RightNode) . " +\n"; } } } # # Restructure the HTML structure of page so it is more to my l +iking. # 6) For nodes of text which have Other Spouses: in them, move + href to the right to under the text node. # foreach my $PageSectionNode ($HtmlBody->look_down(_tag => q{di +v}, class => q{secTitle})) { my @AllTextNodes = $PageSectionNode->look_down( + sub { + ($_[0]->tag() eq q{~text} ) + and ($_[0]->attr(q{text}) =~ m/^Other Spouses:$/i) + } ); my $NumTextNodes = @AllTextNodes; foreach my $TextNode (@AllTextNodes) { my $TextNodeHTML = NormalizeNodeText($TextNode); my $RightNode = $TextNode->right; while ( $RightNode && (ref($RightNode) =~ m/HTML::Element/i) && ($RightNode->tag =~ q{a}) && ($RightNode->attr(q{href}) =~ m;http://www;i)) { my $NextNode = $RightNode->right; # Push this node from the right to below $RightNode->detach; # once detached there is no + right node $TextNode->push_content($RightNode); $RightNode = $NextNode; print "Restructure the HTMLTree: Rule #6. \n"; } } } # # Main Traversal of Tree # # Traverse and examine the tree for nodes of Sections of inter +est (Husband, Wife, Children) # searches for Husband, Wife and Children Sections. # # We prune the tree as we go so its structure simplifies as we + go. # print "Main Traversal of the HTML Tree.\n"; foreach my $PageSectionNode ($HtmlBody->look_down(_tag => q{di +v}, class => q{secTitle})) { my $SectionTitle = $PageSectionNode->attr('~GedComSectionT +itle'); if ($SectionTitle =~ /(Husband|Wife|Children)/i) { print "Processing Section: $SectionTitle. \n"; my @ListOfPeople; foreach my $VCard ($PageSectionNode->look_down(_tag => + q{div}, class => q{vcard})) { my %PersonalInformation; my @ListOfNodes; # process the spans with a class name (e.g. Fn-n, +x-gender, etc.) @ListOfNodes = $VCard->look_down( su +b { + ($_[0]->tag() eq q{span} ) + and ($_[0]->attr(q{class})) } ); foreach my $ClassedSpan (@ListOfNodes) { my $Key = $ClassedSpan->attr(q{class}); my $TextNode = $ClassedSpan->look_down(_tag => + q{~text}); my $Value = ref($TextNode) ? NormalizeNodeText +($TextNode) : No +rmalizeNodeText($ClassedSpan); $PersonalInformation{$Key} = $Value; my $URLNode = $ClassedSpan->look_down(_tag => +q{a}); my $URLText = ref($URLNode) ? $URLNode->attr(q +{href}) : + ""; if ($URLText) { $PersonalInformation{"URL"}{$Key} = $URLTe +xt; } $ClassedSpan->delete; print "Removing a classed span.\n"; } # with the removal above of spans with distinct cl +asses, there may now be spans of no content. # If so, delete these empty nodes. @ListOfNodes = $VCard->look_down( su +b { + ($_[0]->tag() eq q{span} ) + and ($_[0]->attr(q{style}) =~ m/display: ?none/i) } ); foreach my $HiddenSpanNode (@ListOfNodes) { if ($HiddenSpanNode->is_empty) { $HiddenSpanNode->delete; print "Removing a hidden span.\n"; } } # with the removal of the empty spans there may no +w be two, adjacent text nodes # which can be combined. If so, combine them @ListOfNodes = $VCard->look_down( su +b { + ($_[0]->tag eq q{~text} ) + and ($_[0]->attr(q{text}) !~ m/:/i) } ); foreach my $BareTextNode (@ListOfNodes) { my $LeftNode = $BareTextNode->left; if ((ref($LeftNode) =~ m/HTML::Element/i) && ($LeftNode->tag eq q{~text} ) && ($LeftNode->attr(q{text}) =~ m/:/i)) { my $CombinedText = $LeftNode->attr(q{text} +) . " " . $BareTextNode->attr(q{text}); $LeftNode->attr(q{text}, $CombinedText) ; $BareTextNode->delete; print "Combining tow text spans.\n"; } } # Parse text nodes of the form Key: Value # Born: 1687-01-23 @ListOfNodes = $VCard->look_down( su +b { + ($_[0]->tag() eq q{~text} ) + and ($_[0]->attr('text') =~ m/:/i) } ); foreach my $TextNode (@ListOfNodes) { my $LineOfText = trim($TextNode->attr(q{text} +)); $LineOfText =~ s;^([^:]+)(:)(.*)$;$3;si; my $Key = trim($1); my @AllChildNodes = $TextNode->content_list; if (0 == @AllChildNodes) { if ($Key and $LineOfText) { my @AllLines = trim(split /[\n\r]+/s, +$LineOfText); my $NumLines = @AllLines; $PersonalInformation{$Key} = (1 < @All +Lines) ? \@AllLines : $AllLines[0]; } } else { my @ArrayOfValues; foreach my $ChildNode (@AllChildNodes) { if (ref($ChildNode) =~ m/HTML::Element +/i ) { if ($ChildNode->tag =~ q{a}) { my $URL = $ChildNode->attr(q{h +ref}); my @SubNode = $ChildNode->cont +ent_list; my $NameNode = $SubNode[0]; my %PersonInfo; $PersonInfo{q{Name}} = ref($Na +meNode) ? NormalizeNodeText($NameNode) : "??? Uknown ???"; $PersonInfo{q{URL}} = $URL; push @ArrayOfValues, \%PersonI +nfo; } else { my $ErrorText; $ErrorText .= "Expected only a + sequence of <a> tags. Instead received an HTML tag of: \$ChildNode-> +tag = [" . $ChildNode->tag . "].\n"; confess $ErrorText; } } else { my $ErrorText; $ErrorText .= "Expected on HTML:: +Elements and instead received a reference: ref(\$ChildNode) = [" . re +f($ChildNode) . "].\n"; confess $ErrorText; } } $PersonalInformation{$Key} = (1 < @ArrayO +fValues) ? \@ArrayOfValues : $ArrayOfValues[0]; } $TextNode->delete; } if (%PersonalInformation) { push @ListOfPeople, \%PersonalInformation; } } $FamilyInfo{$SectionTitle} = (1 < @ListOfPeople) ? \@L +istOfPeople : $ListOfPeople[0]; } else { print "Unprocessed section, $SectionTitle.\n"; } } print "FamilyInformation Hash:\n" . Dumper(%FamilyInfo) . "\n +"; } } sub Normalize_All_Content { my $Node = shift; if (ref($Node) =~ m/HTML::Element/i) { $Node->normalize_content; foreach my $ChildNode ($Node->content_list) { Normalize_All_Content($ChildNode); } } } sub NormalizeNodeText() { my $Node = shift @_; my $NodeText; if ($Node) { my $NodeReferenceTo = ref($Node); if ($NodeReferenceTo =~ m/HTML::Element/i) { $NodeText = $Node->as_HTML; if ($Node->tag eq q{~text}) { $NodeText = $Node->attr('text'); } $NodeText = trim($NodeText); } else { my $ErrorText; $ErrorText .= "ref(\$Node) = ". ref($Node) . " instead of +HTML::Element.\n"; confess $ErrorText; } } else { my $ErrorText; $ErrorText .= "Argument is undefined, empty, or zero; i.e. ($N +ode) = false.\n"; confess $ErrorText; } return $NodeText; } sub trim { for (@_) { s/^[\s\r\n]+//m; s/\s+$//m; #s/\s+/ /g; } return wantarray?@_:$_[0]; } sub DumpHTMLElement { my $CurrentNode = shift; my $CurrentLevel = shift; my $Padding; for (my $i = 0; $i < $CurrentLevel; ++$i) { $Padding .= " "; } if (ref($CurrentNode) =~ m/HTML::Element/i) { my $NextLevel = $CurrentLevel+1; print "$Padding\{"; my @KeyValuePairs = $CurrentNode->all_attr; while (0 < @KeyValuePairs) { if (($KeyValuePairs[0] =~ /_parent/i) || ($KeyValuePairs[0] =~ /_content/i)) { shift @KeyValuePairs; shift @KeyValuePairs; } else { my $KV_String = "["; $KV_String .= trim($KeyValuePairs[0]); shift @KeyValuePairs; $KV_String .= "= " . trim($KeyValuePairs[0] ). "]"; shift @KeyValuePairs; print $KV_String; } } print "}\n"; foreach my $NextNode ($CurrentNode->content_list) { DumpHTMLElement($NextNode, $NextLevel); } } elsif (! ref($CurrentNode)) { print $Padding . "[" . $CurrentNode . "\n"; } }
|
|---|