in reply to Traversing an HTMLTree with HTML:Element ->right

I hate problem discussions where the initial poster has obviously solved the initial problem, but failed to share the solution found. In an effort to NOT be that guy, here is the solution I eventually settled on.

I would especially like to than, WFSP. Among other things, I could not resist replacing my scalar-only version of trim() with your more elegant version of trim().

My main problem was the faulty assumtion that HTML::Element->content_list is an array of references to HTML::Element. It is NOT. Moreover, the documentation for HTML::Element and HTML::TreeBuilder very clearly states that the array is a mix of references and scalars.

If you want all the elements of the HTML::Element->content_list array to be references to HTML::Element, then you need to use the objectify_text() method.

I wanted the genealogical data to all be under the vcard so I could look_down to vcards during the main traversal/extraction. So I decided to re-structure the tree prior to the main traversal/extract. Re-structuring the tree before my main traversal made the main traversal simpler and more robust.

But once you do some pre-traversal re-structuring, there always more to do.

For those interested, here is the solution I settled on

#!/usr/bin/perl -w # ******************************************** # TraverseTree.pl # ******************************************** use strict; use Carp; use Switch; use Data::Dumper; use Cwd; use HTTP::Request; use HTTP::Request::Common; use HTTP::Status; use LWP; use LWP::UserAgent; use HTML::TreeBuilder; my $URL_Root = "http://e-familytree.net/"; my $FamilyPageURL = "http://e-familytree.net/F248/F248347.htm"; my %FamilyInfo; my $ua = LWP::UserAgent->new; if (defined $ua) { $ua->timeout(5); my $HTTP_Response = $ua->get($FamilyPageURL); my $HTTP_Status = $HTTP_Response->message ; if ($HTTP_Response->is_success) { my $HTTP_FamilyPage = $HTTP_Response->content; #Set up parser to parse this HTML Page # See: http://search.cpan.org/dist/HTML-Parser/Parser.pm # See: http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/TreeBuild +er.pm # See: http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/Element.p +m # See: http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/Tree/Scan +ning.pod # # HTML::TreeBuilder is a subclass of HTML::Parser. # Set the Parser portions to control how the HTML is parsed in +to a tree my $PageAsTree = HTML::TreeBuilder->new(); $PageAsTree->backquote( 1) ; $PageAsTree->empty_element_tags( 1 ) ; $PageAsTree->utf8_mode( 1); $PageAsTree->xml_mode( 1); $PageAsTree->warn( 1); $PageAsTree->ignore_elements(qw(script style)); # Parse HTML into a Tree of HTML::Elements $PageAsTree->parse_content($HTTP_FamilyPage); $PageAsTree->eof; # Simpler reformatting of HTML Tree $PageAsTree->elementify; $PageAsTree->delete_ignorable_whitespace; Normalize_All_Content($PageAsTree); $PageAsTree->objectify_text; my $HtmlHead = $PageAsTree->look_down('_tag', 'head'); my $HtmlBody = $PageAsTree->look_down('_tag', 'body'); print "Tree Found after Initial Tree building.\n"; print "* * * * * * * * * * * * * * * * * * * * * * * * * * * \ +n"; DumpHTMLElement($HtmlBody); print "* * * * * * * * * * * * * * * * * * * * * * * * * * * \ +n"; # Traverse and examine the tree for URL's which are anchored a + location on this page my @AnchorURLs = $HtmlBody->look_down( su +b { + ($_[0]->tag() eq q{a} ) + and ($_[0]->attr(q{href}) =~ m/^\#/) } ); foreach my $URLNode (@AnchorURLs) { $URLNode->delete; } undef @AnchorURLs; # Traverse and examine the tree for relative URL's and convert + them to absolute URL's my @RelativeURLs = $HtmlBody->look_down( su +b { + ($_[0]->tag() eq q{a} ) + and ($_[0]->attr(q{href}) =~ m;\.\.\/;) } ); foreach my $URLNode (@RelativeURLs) { my $OrgHRef = $URLNode->attr(q{href}) ; my $NewHRef = $OrgHRef; $NewHRef =~ s:\.\.\/:$URL_Root:i; $URLNode->attr('href', $NewHRef); } undef @RelativeURLs; # # Restructure the HTML structure of page so it is more to my l +iking. # 1) Everthing to the right of a secTitle until the next secTi +tle is beneath the first/left secTitle # foreach my $PageSectionNode ($HtmlBody->look_down(_tag => q{di +v}, class => q{secTitle})) { my $SectionTitle = trim(join '|', map ($_->attr('text'), $ +PageSectionNode->look_down(_tag => q{~text}))); $PageSectionNode->attr('~GedComSectionTitle', $SectionTitl +e); #Easier to identify this node foreach my $RightNode ($PageSectionNode->right) { # See if we have hit the next section last if (($RightNode->tag eq q{div}) && ($RightNode->attr('class') eq q{secTitle}) ); my $RightNodeHTML = NormalizeNodeText($RightNode); $RightNode->attr('text', $RightNodeHTML) if ($RightNod +e->tag eq q{~text}); $RightNode->detach; $PageSectionNode->push_content($RightNode); print "Restructure the HTMLTree: Rule #1. \n"; } } # # Restructure the HTML structure of page so it is more to my l +iking. # 2) Within a section title everything between two vCards is b +elongs the left vCard. # foreach my $VCard ($HtmlBody->look_down(_tag => q{div}, class +=> q{vcard})) { foreach my $RightNode ($VCard->right) { last if (($RightNode->tag eq q{div}) && ($RightNode->attr('class') eq q{vcard}) ); my $RightNodeHTML = NormalizeNodeText($RightNode); $RightNode->attr('text', $RightNodeHTML) if ($RightNod +e->tag eq q{~text}); $RightNode->detach; $VCard->push_content($RightNode); print "Restructure the HTMLTree: Rule #1. \n"; } } # # Restructure the HTML structure of page so it is more to my l +iking. # 3) Remove the nodes which are empty text and simple line bre +aks # foreach my $PageSectionNode ($HtmlBody->look_down(_tag => q{di +v}, class => q{secTitle})) { foreach my $BRNode ($HtmlBody->look_down(_tag => q{br})) { $BRNode->detach; $BRNode->delete; print "Restructure the HTMLTree: Rule #3a. \n"; } foreach my $TextNode ($HtmlBody->look_down(_tag => q{~text +})) { my $TextNodeHTML = NormalizeNodeText($TextNode); if (!$TextNodeHTML) { $TextNode->detach; $TextNode->delete; print "Restructure the HTMLTree: Rule #3b. \n"; } } } # # Restructure the HTML structure of page so it is more to my l +iking. # 4) For multiline nodes of text which have : in them, split i +nto multiple nodes of ~text # foreach my $PageSectionNode ($HtmlBody->look_down(_tag => q{di +v}, class => q{secTitle})) { my @AllTextNodes = $PageSectionNode->look_down( + sub { + ($_[0]->tag() eq q{~text} ) + and ($_[0]->attr(q{text}) =~ m/:/s) + } ); my $NumTextNodes = @AllTextNodes; foreach my $TextNode (@AllTextNodes) { my $TextNodeHTML = NormalizeNodeText($TextNode); my @AllLinesofText = trim(split /[\n\r]+/s, $TextNodeH +TML); my $NumLines = @AllLinesofText; if (1 < $NumLines ) { my $LineOfText = shift @AllLinesofText; $TextNode->attr(q{text}, $LineOfText); my @NewNodeSequence; foreach $LineOfText (@AllLinesofText) { #print " [$LineOfText].\n") my $NewTextNode = HTML::Element->new(q{~text}) +; $NewTextNode->attr(q{text}, $LineOfText ); push @NewNodeSequence, $NewTextNode; print "Restructure the HTMLTree: Rule #4. \n" +; } if (0 < @NewNodeSequence) { $TextNode->postinsert(@NewNodeSequence); } } if (!$TextNodeHTML) { $TextNode->detach; $TextNode->delete; print "Restructure the HTMLTree: Rule #4b (remove + empties). \n"; } } } # # Restructure the HTML structure of page so it is more to my l +iking. # 5) For nodes of text which have (Name|Mother|Father|wife|Hus +band): in them, move href to the right of them # to be under the text node. foreach my $PageSectionNode ($HtmlBody->look_down(_tag => q{di +v}, class => q{secTitle})) { my @AllTextNodes = $PageSectionNode->look_down( + sub { + ($_[0]->tag() eq q{~text} ) + and ($_[0]->attr(q{text}) =~ m/^(Name|Mother|Father|Wife|Husba +nd):$/i) + } ); my $NumTextNodes = @AllTextNodes; foreach my $TextNode (@AllTextNodes) { my $TextNodeHTML = NormalizeNodeText($TextNode); my $RightNode = $TextNode->right; if ( $RightNode && (ref($RightNode) =~ m/HTML::Element/i)) { my $RightNodeHTML = NormalizeNodeText($RightNode); if (($RightNode->tag =~ q{a}) && ($RightNode->attr(q{href}) =~ m;http://www;i)) { $RightNode->detach; $TextNode->push_content($RightNode); print "Restructure the HTMLTree: Rule #5. \n" +; } } else { print "ref(\$RightNode) = " . ref($RightNode) . " +\n"; } } } # # Restructure the HTML structure of page so it is more to my l +iking. # 6) For nodes of text which have Other Spouses: in them, move + href to the right to under the text node. # foreach my $PageSectionNode ($HtmlBody->look_down(_tag => q{di +v}, class => q{secTitle})) { my @AllTextNodes = $PageSectionNode->look_down( + sub { + ($_[0]->tag() eq q{~text} ) + and ($_[0]->attr(q{text}) =~ m/^Other Spouses:$/i) + } ); my $NumTextNodes = @AllTextNodes; foreach my $TextNode (@AllTextNodes) { my $TextNodeHTML = NormalizeNodeText($TextNode); my $RightNode = $TextNode->right; while ( $RightNode && (ref($RightNode) =~ m/HTML::Element/i) && ($RightNode->tag =~ q{a}) && ($RightNode->attr(q{href}) =~ m;http://www;i)) { my $NextNode = $RightNode->right; # Push this node from the right to below $RightNode->detach; # once detached there is no + right node $TextNode->push_content($RightNode); $RightNode = $NextNode; print "Restructure the HTMLTree: Rule #6. \n"; } } } # # Main Traversal of Tree # # Traverse and examine the tree for nodes of Sections of inter +est (Husband, Wife, Children) # searches for Husband, Wife and Children Sections. # # We prune the tree as we go so its structure simplifies as we + go. # print "Main Traversal of the HTML Tree.\n"; foreach my $PageSectionNode ($HtmlBody->look_down(_tag => q{di +v}, class => q{secTitle})) { my $SectionTitle = $PageSectionNode->attr('~GedComSectionT +itle'); if ($SectionTitle =~ /(Husband|Wife|Children)/i) { print "Processing Section: $SectionTitle. \n"; my @ListOfPeople; foreach my $VCard ($PageSectionNode->look_down(_tag => + q{div}, class => q{vcard})) { my %PersonalInformation; my @ListOfNodes; # process the spans with a class name (e.g. Fn-n, +x-gender, etc.) @ListOfNodes = $VCard->look_down( su +b { + ($_[0]->tag() eq q{span} ) + and ($_[0]->attr(q{class})) } ); foreach my $ClassedSpan (@ListOfNodes) { my $Key = $ClassedSpan->attr(q{class}); my $TextNode = $ClassedSpan->look_down(_tag => + q{~text}); my $Value = ref($TextNode) ? NormalizeNodeText +($TextNode) : No +rmalizeNodeText($ClassedSpan); $PersonalInformation{$Key} = $Value; my $URLNode = $ClassedSpan->look_down(_tag => +q{a}); my $URLText = ref($URLNode) ? $URLNode->attr(q +{href}) : + ""; if ($URLText) { $PersonalInformation{"URL"}{$Key} = $URLTe +xt; } $ClassedSpan->delete; print "Removing a classed span.\n"; } # with the removal above of spans with distinct cl +asses, there may now be spans of no content. # If so, delete these empty nodes. @ListOfNodes = $VCard->look_down( su +b { + ($_[0]->tag() eq q{span} ) + and ($_[0]->attr(q{style}) =~ m/display: ?none/i) } ); foreach my $HiddenSpanNode (@ListOfNodes) { if ($HiddenSpanNode->is_empty) { $HiddenSpanNode->delete; print "Removing a hidden span.\n"; } } # with the removal of the empty spans there may no +w be two, adjacent text nodes # which can be combined. If so, combine them @ListOfNodes = $VCard->look_down( su +b { + ($_[0]->tag eq q{~text} ) + and ($_[0]->attr(q{text}) !~ m/:/i) } ); foreach my $BareTextNode (@ListOfNodes) { my $LeftNode = $BareTextNode->left; if ((ref($LeftNode) =~ m/HTML::Element/i) && ($LeftNode->tag eq q{~text} ) && ($LeftNode->attr(q{text}) =~ m/:/i)) { my $CombinedText = $LeftNode->attr(q{text} +) . " " . $BareTextNode->attr(q{text}); $LeftNode->attr(q{text}, $CombinedText) ; $BareTextNode->delete; print "Combining tow text spans.\n"; } } # Parse text nodes of the form Key: Value # Born: 1687-01-23 @ListOfNodes = $VCard->look_down( su +b { + ($_[0]->tag() eq q{~text} ) + and ($_[0]->attr('text') =~ m/:/i) } ); foreach my $TextNode (@ListOfNodes) { my $LineOfText = trim($TextNode->attr(q{text} +)); $LineOfText =~ s;^([^:]+)(:)(.*)$;$3;si; my $Key = trim($1); my @AllChildNodes = $TextNode->content_list; if (0 == @AllChildNodes) { if ($Key and $LineOfText) { my @AllLines = trim(split /[\n\r]+/s, +$LineOfText); my $NumLines = @AllLines; $PersonalInformation{$Key} = (1 < @All +Lines) ? \@AllLines : $AllLines[0]; } } else { my @ArrayOfValues; foreach my $ChildNode (@AllChildNodes) { if (ref($ChildNode) =~ m/HTML::Element +/i ) { if ($ChildNode->tag =~ q{a}) { my $URL = $ChildNode->attr(q{h +ref}); my @SubNode = $ChildNode->cont +ent_list; my $NameNode = $SubNode[0]; my %PersonInfo; $PersonInfo{q{Name}} = ref($Na +meNode) ? NormalizeNodeText($NameNode) : "??? Uknown ???"; $PersonInfo{q{URL}} = $URL; push @ArrayOfValues, \%PersonI +nfo; } else { my $ErrorText; $ErrorText .= "Expected only a + sequence of <a> tags. Instead received an HTML tag of: \$ChildNode-> +tag = [" . $ChildNode->tag . "].\n"; confess $ErrorText; } } else { my $ErrorText; $ErrorText .= "Expected on HTML:: +Elements and instead received a reference: ref(\$ChildNode) = [" . re +f($ChildNode) . "].\n"; confess $ErrorText; } } $PersonalInformation{$Key} = (1 < @ArrayO +fValues) ? \@ArrayOfValues : $ArrayOfValues[0]; } $TextNode->delete; } if (%PersonalInformation) { push @ListOfPeople, \%PersonalInformation; } } $FamilyInfo{$SectionTitle} = (1 < @ListOfPeople) ? \@L +istOfPeople : $ListOfPeople[0]; } else { print "Unprocessed section, $SectionTitle.\n"; } } print "FamilyInformation Hash:\n" . Dumper(%FamilyInfo) . "\n +"; } } sub Normalize_All_Content { my $Node = shift; if (ref($Node) =~ m/HTML::Element/i) { $Node->normalize_content; foreach my $ChildNode ($Node->content_list) { Normalize_All_Content($ChildNode); } } } sub NormalizeNodeText() { my $Node = shift @_; my $NodeText; if ($Node) { my $NodeReferenceTo = ref($Node); if ($NodeReferenceTo =~ m/HTML::Element/i) { $NodeText = $Node->as_HTML; if ($Node->tag eq q{~text}) { $NodeText = $Node->attr('text'); } $NodeText = trim($NodeText); } else { my $ErrorText; $ErrorText .= "ref(\$Node) = ". ref($Node) . " instead of +HTML::Element.\n"; confess $ErrorText; } } else { my $ErrorText; $ErrorText .= "Argument is undefined, empty, or zero; i.e. ($N +ode) = false.\n"; confess $ErrorText; } return $NodeText; } sub trim { for (@_) { s/^[\s\r\n]+//m; s/\s+$//m; #s/\s+/ /g; } return wantarray?@_:$_[0]; } sub DumpHTMLElement { my $CurrentNode = shift; my $CurrentLevel = shift; my $Padding; for (my $i = 0; $i < $CurrentLevel; ++$i) { $Padding .= " "; } if (ref($CurrentNode) =~ m/HTML::Element/i) { my $NextLevel = $CurrentLevel+1; print "$Padding\{"; my @KeyValuePairs = $CurrentNode->all_attr; while (0 < @KeyValuePairs) { if (($KeyValuePairs[0] =~ /_parent/i) || ($KeyValuePairs[0] =~ /_content/i)) { shift @KeyValuePairs; shift @KeyValuePairs; } else { my $KV_String = "["; $KV_String .= trim($KeyValuePairs[0]); shift @KeyValuePairs; $KV_String .= "= " . trim($KeyValuePairs[0] ). "]"; shift @KeyValuePairs; print $KV_String; } } print "}\n"; foreach my $NextNode ($CurrentNode->content_list) { DumpHTMLElement($NextNode, $NextLevel); } } elsif (! ref($CurrentNode)) { print $Padding . "[" . $CurrentNode . "\n"; } }