in reply to Extracting data-structure from HTML using Web::Scraper

Any help with this problem would be appreciated.

One ugly way

#!/usr/bin/perl --
# Scrape a flat run of <h4> date headings and <p> name paragraphs with
# Web::Scraper, then regroup them into one { date => [names] } hash per
# heading, in document order.
use strict;
use warnings;
use Web::Scraper;
use Data::Dump;

my $sample = q{ <html><body> <h4 class="bla">July 12</h4> <p>Tim</p> <p>Jon</p> <h4 class="bla">July 13</h4> <p>James</p> <p>Eric</p> <p>Jerry</p> <p>Susie</p> <h4 class="date">July 14</h4> <p>Kami</p> <p>Darryl</p> </body></html> };

# Pass 1: collect every <h4> and <p> in document order. Each match goes
# through a nested scraper that stores its text under 'date' (for an h4) or
# 'person' (for a p); see the first dump after __END__ for the shape.
my $names = scraper {
    process q{//h4 | //p}, 'h4p[]', scraper {
        process q{//h4}, 'date',   'text';
        process q{//p},  'person', 'text';
    };
};

my $res = $names->scrape($sample);
dd $res;

{
    # Pass 2: fold the flat list into groups. Every date opens a new (still
    # empty) hash; each following person is appended under that date.
    my @groups;
    my $date;    # text of the most recent heading

    for my $item ( @{ $res->{h4p} } ) {

        # Test for non-empty text rather than truth, so a heading or a
        # name of "0" is not silently dropped.
        if ( defined $item->{date} and length $item->{date} ) {
            $date = $item->{date};
            push @groups, {};
        }

        if ( defined $item->{person} and length $item->{person} ) {

            # A <p> that appears before the first <h4> has no date to
            # belong to; skip it instead of dereferencing a slot of an
            # empty array (which is a fatal error).
            push @{ $groups[-1]{$date} }, $item->{person} if @groups;
        }
    }

    dd \@groups;
}

__END__
{
  h4p => [
    { date => "July 12" },
    { person => "Tim" },
    { person => "Jon" },
    { date => "July 13" },
    { person => "James" },
    { person => "Eric" },
    { person => "Jerry" },
    { person => "Susie" },
    { date => "July 14" },
    { person => "Kami" },
    { person => "Darryl" },
  ],
}
[
  { "July 12" => ["Tim", "Jon"] },
  { "July 13" => ["James", "Eric", "Jerry", "Susie"] },
  { "July 14" => ["Kami", "Darryl"] },
]

Replies are listed 'Best First'.
Re^2: Extracting data-structure from HTML using Web::Scraper
by Anonymous Monk on Jul 14, 2012 at 07:27 UTC

    With help from http://cpansearch.perl.org/src/MIYAGAWA/Web-Scraper-0.36/t/04_callback.t, here is a slight improvement:

    #!/usr/bin/perl --
    # Single-pass variant: a Web::Scraper callback groups <p> names under the
    # preceding <h4> date while the document is being scraped.
    use strict;
    use warnings;
    use Web::Scraper;
    use Data::Dump;

    my $sample = q{ <html><body> <h4 class="bla">July 12</h4> <p>Tim</p> <p>Jon</p> <h4 class="bla">July 13</h4> <p>James</p> <p>Eric</p> <p>Jerry</p> <p>Susie</p> <h4 class="date">July 14</h4> <p>Kami</p> <p>Darryl</p> </body></html> };

    # scrap(@scrape_args)
    #
    # Passes its arguments straight to $scraper->scrape and returns a
    # reference to an array holding one hash per <h4>, each mapping the
    # heading text to the array of <p> texts that follow it. A heading with
    # no following <p> contributes an empty hash.
    sub scrap {
        my @groups;
        my $date;    # trimmed text of the most recent <h4>

        my $names = scraper {
            process q{//h4 | //p}, sub {
                my $element = $_;    # the matched element, as set for the callback
                my $tag     = $element->tag;

                if ( $tag eq 'h4' ) {
                    $date = $element->as_trimmed_text;
                    push @groups, {};
                }
                elsif ( $tag eq 'p' ) {

                    # A <p> before the first <h4> has no date to belong
                    # to; skip it instead of dereferencing a slot of an
                    # empty array (which is a fatal error).
                    push @{ $groups[-1]{$date} }, $element->as_trimmed_text
                        if @groups;
                }
            };
        };

        $names->scrape(@_);
        return \@groups;
    }

    dd scrap( \$sample );

    __END__
    [
      { "July 12" => ["Tim", "Jon"] },
      { "July 13" => ["James", "Eric", "Jerry", "Susie"] },
      { "July 14" => ["Kami", "Darryl"] },
    ]
Re^2: Extracting data-structure from HTML using Web::Scraper
by Anonymous Monk on Jul 14, 2012 at 07:41 UTC

    Same with xsh

    The output

    $ xsh --html --quiet --non-interactive --load pm981742.xsh <?xml version="1.0" standalone="yes"?> <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd"> <html> <body> <h4 class="bla">July 12</h4> <p>Tim</p> <p>Jon</p> <h4 class="bla">July 13</h4> <p>James</p> <p>Eric</p> <p>Jerry</p> <p>Susie</p> <h4 class="date">July 14</h4> <p>Kami</p> <p>Darryl</p> </body> </html> { "July 12" => ["Tim", "Jon"], "July 13" => ["James", "Eric", "Jerry", "Susie"], "July 14" => ["Kami", "Darryl"], }

    The xsh script (xml shell script)

    # Group the <p> names under the preceding <h4> date, collecting them in
    # a Perl hash of arrays and dumping it at the end.

    # Load the sample document and print it (the listing at the top of the
    # output shown above).
    open pm981742.xml;
    ls --indent /;

    # Walk the children of <body> in document order.
    for //body/* {
      $text = string(text());

      # A heading becomes the current key ...
      if( name() = "h4" ){
        $key = $text;
      }

      # ... and each paragraph is appended to the list stored under that key.
      # NOTE(review): a <p> before any <h4> would be filed under an unset
      # $key -- confirm whether that can occur in real input.
      if( name() = "p" ){
        perl { push @{ $hash{$key} }, $text; };
      }
    }

    # Dump the collected structure, then clear the variables used above.
    perl { use Data::Dump; dd \%hash; undef %hash; undef $key; };

      Since both Web::Scraper and xsh depend on XML::LibXML, you could use XML::LibXML directly. It's pretty much like the xsh version (same logic), though perhaps more verbose and less shelly :)

      #!/usr/bin/perl --
      # Group <p> names under the preceding <h4> date using XML::LibXML
      # directly: one { date => [names] } hash per heading, in document order.
      use strict;
      use warnings;
      use Data::Dump;
      use XML::LibXML 1.94;

      my $sample = q{ <html><body> <h4 class="bla">July 12</h4> <p>Tim</p> <p>Jon</p> <h4 class="bla">July 13</h4> <p>James</p> <p>Eric</p> <p>Jerry</p> <p>Susie</p> <h4 class="date">July 14</h4> <p>Kami</p> <p>Darryl</p> </body></html> };

      # NOTE(review): the sample happens to be well-formed; real-world HTML
      # may need XML::LibXML's HTML loader instead of load_xml -- confirm.
      my $xml = XML::LibXML->load_xml( string => $sample );

      my @groups;
      my $date;    # text of the most recent <h4>

      for my $element ( $xml->findnodes("//body/*") ) {
          my $tag = $element->tagName;

          if ( $tag eq 'h4' ) {
              $date = $element->textContent;
              push @groups, {};
          }
          elsif ( $tag eq 'p' ) {

              # A <p> before the first <h4> has no date to belong to; skip
              # it instead of dereferencing a slot of an empty array (which
              # is a fatal error).
              next if not @groups;
              push @{ $groups[-1]{$date} }, $element->textContent;
          }
      }

      dd \@groups;

      __END__
      [
        { "July 12" => ["Tim", "Jon"] },
        { "July 13" => ["James", "Eric", "Jerry", "Susie"] },
        { "July 14" => ["Kami", "Darryl"] },
      ]
Re^2: Extracting data-structure from HTML using Web::Scraper
by Anonymous Monk on Jul 14, 2012 at 08:27 UTC

    And XML::Twig since the logic is the same

    #!/usr/bin/perl --
    # Group <p> names under the preceding <h4> date using XML::Twig handlers:
    # one { date => [names] } hash per heading, in document order. Each
    # handler also dumps the path of the element it fired on.
    use strict;
    use warnings;
    use Data::Dump;
    use XML::Twig;

    my $sample = q{ <html><body> <h4 class="bla">July 12</h4> <p>Tim</p> <p>Jon</p> <h4 class="bla">July 13</h4> <p>James</p> <p>Eric</p> <p>Jerry</p> <p>Susie</p> <h4 class="date">July 14</h4> <p>Kami</p> <p>Darryl</p> </body></html> };

    my @groups;
    my $date;    # text of the most recent <h4>

    # NOTE(review): both handlers return true explicitly, matching the true
    # result the previous push-as-last-statement form produced. XML::Twig is
    # understood to treat a false handler return specially -- confirm against
    # its twig_handlers documentation.
    my $xml = XML::Twig->new(
        twig_handlers => {
            '//body/h4' => sub {
                dd $_->path;
                $date = $_->text;
                push @groups, {};
                return 1;
            },
            '//body/p' => sub {
                dd $_->path;

                # A <p> before the first <h4> has no date to belong to;
                # skip it instead of dereferencing a slot of an empty array
                # (which is a fatal error).
                push @{ $groups[-1]{$date} }, $_->text if @groups;
                return 1;
            },
        },
    );

    $xml->xparse($sample);

    dd \@groups;

    __END__
    "/html/body/h4"
    "/html/body/p"
    "/html/body/p"
    "/html/body/h4"
    "/html/body/p"
    "/html/body/p"
    "/html/body/p"
    "/html/body/p"
    "/html/body/h4"
    "/html/body/p"
    "/html/body/p"
    [
      { "July 12" => ["Tim", "Jon"] },
      { "July 13" => ["James", "Eric", "Jerry", "Susie"] },
      { "July 14" => ["Kami", "Darryl"] },
    ]