Re: Extracting data-structure from HTML using Web::Scraper

Any help with this problem would be appreciated.

One ugly way

#!/usr/bin/perl --
use strict; use warnings;
use Web::Scraper;
use Data::Dump;

# Sample HTML fixture. The <h4> date headings and the <p> name paragraphs are
# siblings under <body>, not nested, so the only link between a name and its
# date is document order: a <p> belongs to the closest <h4> that precedes it.
# The class attributes on the headings differ ("bla" vs "date") and are not
# used by the selectors below.
my $sample = q{
<html><body>
    <h4 class="bla">July 12</h4>
    <p>Tim</p>
    <p>Jon</p>
    <h4 class="bla">July 13</h4>
    <p>James</p>
    <p>Eric</p>
    <p>Jerry</p>
    <p>Susie</p>
    <h4 class="date">July 14</h4>
    <p>Kami</p>
    <p>Darryl</p>
</body></html>
};

# Per-element scraper, applied to each node matched by the outer selector.
# As the dump after __END__ shows, an <h4> node yields { date => TEXT } and
# a <p> node yields { person => TEXT }.
my $classify = scraper {
    process '//h4', date   => 'text';
    process '//p',  person => 'text';
};

# Outer scraper: select every <h4> and <p> and collect the per-element
# results, in document order, into the array stored under 'h4p'.
my $names = scraper {
    process '//h4 | //p', 'h4p[]' => $classify;
};

# Run the scraper over the fixture and dump the flat intermediate structure.
my $res = $names->scrape($sample);
dd($res);

# Regroup the flat, document-ordered records in $res->{h4p} -- each either
# { date => TEXT } or { person => TEXT } -- into one single-key hash per date:
#
#     [ { "July 12" => ["Tim", "Jon"] }, { "July 13" => [...] }, ... ]
#
# A date that is followed by no person is kept as { DATE => [] } so that the
# heading is not lost.
{
    my @root;      # one { DATE => \@people } hash per date, in document order
    my $people;    # people list of the most recent date; undef until one is seen

    for my $tag ( @{ $res->{h4p} || [] } ) {

        # Test for non-empty text, not truth, so that a text of "0" is kept.
        if ( defined $tag->{date} and length $tag->{date} ) {
            $people = [];
            push @root, { $tag->{date} => $people };
        }

        if ( defined $tag->{person} and length $tag->{person} ) {
            if ($people) {
                push @{$people}, $tag->{person};
            }
            else {
                # A person seen before any date has nothing to be filed under.
                # Skip it loudly instead of indexing into an empty @root, which
                # is a fatal "non-creatable array value" error.
                warn "Skipping '$tag->{person}': no preceding date heading\n";
            }
        }
    }

    dd \@root;
}
__END__
{
  h4p => [
           { date => "July 12" },
           { person => "Tim" },
           { person => "Jon" },
           { date => "July 13" },
           { person => "James" },
           { person => "Eric" },
           { person => "Jerry" },
           { person => "Susie" },
           { date => "July 14" },
           { person => "Kami" },
           { person => "Darryl" },
         ],
}
[
  { "July 12" => ["Tim", "Jon"] },
  { "July 13" => ["James", "Eric", "Jerry", "Susie"] },
  { "July 14" => ["Kami", "Darryl"] },
]
[download]

Comment on Re: Extracting data-structure from HTML using Web::Scraper Download Code

Replies are listed 'Best First'.
Re^2: Extracting data-structure from HTML using Web::Scraper by Anonymous Monk on Jul 14, 2012 at 07:27 UTC
With help from http://cpansearch.perl.org/src/MIYAGAWA/Web-Scraper-0.36/t/04_callback.t a slight improvement

#!/usr/bin/perl --
use strict; use warnings;
use Web::Scraper;
use Data::Dump;

my $sample = q{
<html><body>
    <h4 class="bla">July 12</h4>
    <p>Tim</p>
    <p>Jon</p>
    <h4 class="bla">July 13</h4>
    <p>James</p>
    <p>Eric</p>
    <p>Jerry</p>
    <p>Susie</p>
    <h4 class="date">July 14</h4>
    <p>Kami</p>
    <p>Darryl</p>
</body></html>
};

# scrap( @scrape_args )
#
# Passes its arguments straight to Web::Scraper's scrape() (here: a reference
# to the HTML string) and groups the <p> texts under the <h4> text that
# precedes them.
#
# Returns an array ref holding one { DATE => [ NAME, ... ] } hash per <h4>,
# in document order.
sub scrap {
    my @root;      # one { DATE => \@people } hash per <h4>
    my $people;    # people list of the most recent <h4>; undef until one is seen

    my $names = scraper {

        # A key-less callback: it is run once per matched node (available as
        # $_) purely for its side effect on @root.
        process q{//h4 | //p}, sub {
            my $text = $_->as_trimmed_text;

            if ( $_->tag eq 'h4' ) {
                $people = [];
                push @root, { $text => $people };
            }
            elsif ( $_->tag eq 'p' ) {
                if ($people) {
                    push @{$people}, $text;
                }
                else {
                    # A <p> ahead of the first <h4> has no date to belong to;
                    # skip it rather than index into an empty @root (fatal).
                    warn "Skipping '$text': no preceding <h4>\n";
                }
            }
            return;
        };
    };

    $names->scrape( @_ );
    return \@root;
}

dd scrap( \$sample );
__END__
[
  { "July 12" => ["Tim", "Jon"] },
  { "July 13" => ["James", "Eric", "Jerry", "Susie"] },
  { "July 14" => ["Kami", "Darryl"] },
]
[download]	[reply] [d/l]
Re^2: Extracting data-structure from HTML using Web::Scraper by Anonymous Monk on Jul 14, 2012 at 07:41 UTC
Same with xsh The output $ xsh --html --quiet --non-interactive --load pm981742.xsh <?xml version="1.0" standalone="yes"?> <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd"> <html> <body> <h4 class="bla">July 12</h4> <p>Tim</p> <p>Jon</p> <h4 class="bla">July 13</h4> <p>James</p> <p>Eric</p> <p>Jerry</p> <p>Susie</p> <h4 class="date">July 14</h4> <p>Kami</p> <p>Darryl</p> </body> </html> { "July 12" => ["Tim", "Jon"], "July 13" => ["James", "Eric", "Jerry", "Susie"], "July 14" => ["Kami", "Darryl"], } [download] The xsh script (xml shell script) `open pm981742.xml; ls --indent /; for //body/* { $text = string(text()); if( name() = "h4" ){ $key = $text; } if( name() = "p" ){ perl { push @{ $hash{$key} }, $text; }; } } perl { use Data::Dump; dd \%hash; undef %hash; undef $key; };` [download]	[reply] [d/l] [select]
Re^3: Extracting data-structure from HTML using Web::Scraper by Anonymous Monk on Jul 14, 2012 at 07:58 UTC
Since both Web::Scraper and xsh depend on XML::LibXML, you could use straight XML::LibXML, it's pretty much like xsh (logic), but perhaps more verbose and less shelly :)

#!/usr/bin/perl --
use strict; use warnings;
use Data::Dump;
use XML::LibXML 1.94;

my $sample = q{
<html><body>
    <h4 class="bla">July 12</h4>
    <p>Tim</p>
    <p>Jon</p>
    <h4 class="bla">July 13</h4>
    <p>James</p>
    <p>Eric</p>
    <p>Jerry</p>
    <p>Susie</p>
    <h4 class="date">July 14</h4>
    <p>Kami</p>
    <p>Darryl</p>
</body></html>
};

# The sample is parsed with the XML parser, so the input has to be
# well-formed XML.
my $xml = XML::LibXML->load_xml( string => $sample );

my @root;      # one { DATE => \@people } hash per <h4>, in document order
my $people;    # people list of the most recent <h4>; undef until one is seen

# Walk the element children of <body> in document order, opening a new group
# at each <h4> and filing each <p> under the group that is currently open.
for my $element ( $xml->findnodes("//body/*") ) {
    my $tag  = $element->tagName;
    my $text = $element->textContent;

    if ( $tag eq 'h4' ) {
        $people = [];
        push @root, { $text => $people };
    }
    elsif ( $tag eq 'p' ) {
        if ($people) {
            push @{$people}, $text;
        }
        else {
            # A <p> ahead of the first <h4> has no date to belong to; skip it
            # rather than index into an empty @root (fatal).
            warn "Skipping '$text': no preceding <h4>\n";
        }
    }
}

dd \@root;
__END__
[
  { "July 12" => ["Tim", "Jon"] },
  { "July 13" => ["James", "Eric", "Jerry", "Susie"] },
  { "July 14" => ["Kami", "Darryl"] },
]
[download]	[reply] [d/l]
Re^2: Extracting data-structure from HTML using Web::Scraper by Anonymous Monk on Jul 14, 2012 at 08:27 UTC
And XML::Twig since the logic is the same

#!/usr/bin/perl --
use strict; use warnings;
use Data::Dump;
use XML::Twig;

my $sample = q{
<html><body>
    <h4 class="bla">July 12</h4>
    <p>Tim</p>
    <p>Jon</p>
    <h4 class="bla">July 13</h4>
    <p>James</p>
    <p>Eric</p>
    <p>Jerry</p>
    <p>Susie</p>
    <h4 class="date">July 14</h4>
    <p>Kami</p>
    <p>Darryl</p>
</body></html>
};

my @root;      # one { DATE => \@people } hash per <h4>, in document order
my $people;    # people list of the most recent <h4>; undef until one is seen

# The handlers run as the parser reaches each matching element (the path
# trace after __END__ shows them firing in document order), so an <h4>
# handler always runs before the handlers of the <p> elements that follow it.
my $xml = XML::Twig->new(
    twig_handlers => {
        '//body/h4' => sub {
            dd $_->path;    # trace: shows which element fired the handler
            my $date = $_->text;
            $people = [];
            push @root, { $date => $people };
            return 1;
        },
        '//body/p' => sub {
            dd $_->path;    # trace: shows which element fired the handler
            my $name = $_->text;
            if ($people) {
                push @{$people}, $name;
            }
            else {
                # A <p> ahead of the first <h4> has no date to belong to;
                # skip it rather than index into an empty @root (fatal).
                warn "Skipping '$name': no preceding <h4>\n";
            }
            return 1;
        },
    },
);

$xml->xparse( $sample );
dd \@root;
__END__
"/html/body/h4"
"/html/body/p"
"/html/body/p"
"/html/body/h4"
"/html/body/p"
"/html/body/p"
"/html/body/p"
"/html/body/p"
"/html/body/h4"
"/html/body/p"
"/html/body/p"
[
  { "July 12" => ["Tim", "Jon"] },
  { "July 13" => ["James", "Eric", "Jerry", "Susie"] },
  { "July 14" => ["Kami", "Darryl"] },
]
[download]	[reply] [d/l]