#!/usr/bin/perl --
use strict;
use warnings;
use Data::Dumper;
use Web::Scraper;
# Sample page used as the scraping fixture: a deeply nested table (class
# "someclass") holding eleven key/value rows, followed by two
# <div><p><a><img> link blocks.
# The heredoc tag is single-quoted, so nothing inside it is interpolated.
my $html_content = <<'__HTML__';
<html>
<body>
<div></div>
<div id="wrapper">
<div></div>
<div id="outer">
<div id="inner">
<div></div>
<div id="center">
<div></div>
<div id="main">
<div></div>
<div>
<table id="wrappedcontent">
<tbody class="shnitzel" bgcolor='red'>
<tr>
<td>
<table class="someclass" style="width:508px;" id="Any_20">
<tbody>
<tr> <td><strong>key1</strong></td> <td>val1</td> </tr>
<tr> <td><strong>key2</strong></td> <td>val2</td> </tr>
<tr> <td><strong>key3</strong></td> <td>val3</td> </tr>
<tr> <td><strong>key4</strong></td> <td>val4</td> </tr>
<tr> <td><strong>key5</strong></td> <td>val5</td> </tr>
<tr> <td><strong>key6</strong></td> <td>val6</td> </tr>
<tr> <td><strong>key7</strong></td> <td>val7</td> </tr>
<tr> <td><strong>key8</strong></td> <td>val8</td> </tr>
<tr> <td><strong>key9</strong></td> <td>val9</td> </tr>
<tr> <td><strong>key10</strong></td> <td>val10</td> </tr>
<tr> <td><strong>key11</strong></td> <td>val11</td> </tr>
</tbody>
</table>
</td>
</tr>
</tbody>
</table>
</div>
</div>
</div>
</div>
</div>
</div>
<div id="pm905050">
<p>
<a href="http://example.com/">
<img src="http://example.com/example.png" alt="example alt">
</a>
</p>
</div>
<div id="pm905050-01">
<p>
<a href="http://example.com/01">
<img src="http://example.com/01/example.png" alt="example 01 alt">
</a>
</p>
</div>
</body>
</html>
__HTML__
# Scraper definitions; each entry is later run against the page.
my @pdata;

# Scraper 1: visit the odd-positioned rows (1st, 3rd, 5th, ...) of the table
# with class "someclass". Each visited row is handed to a nested scraper that
# records the text of its first cell as 'name' and of its second cell as
# 'attr'; the results are collected under the 'table' key.
my $table_scraper = scraper {
    my $odd_rows = '//*/table[@class="someclass"]//tr[position() mod 2 = 1]';
    process(
        $odd_rows,
        'table[]' => scraper {
            process( '//tr/td[1]', 'name' => 'TEXT' );
            process( '//tr/td[2]', 'attr' => 'TEXT' );
        }
    );
};
push @pdata, $table_scraper;
# Scraper 2: select every <p> inside a <div> (CSS selector) and hand each one
# to a nested scraper that captures the link's href as 'link' and the image's
# src as 'image'; the results are collected under the 'divp' key.
# Commented-out XPath alternative kept for reference:
##   process '//div/p/*[@href or @src]',
my $link_scraper = scraper {
    process(
        'div p',
        'divp[]' => scraper {
            process( 'a',   'link'  => '@href' );
            process( 'img', 'image' => '@src' );
        }
    );
};
push @pdata, $link_scraper;
# Run every scraper in @pdata against the in-memory HTML and dump the result.
foreach my $page_scraper (@pdata) {

    # scrape() is given a reference to the HTML string.
    my $extracted = $page_scraper->scrape( \$html_content );

    # NOTE(review): $! only carries meaning after a failed system call;
    # confirm that scrape() actually sets it before relying on this message.
    die "Can't define content to parser $!" unless $extracted;

    print Dumper($extracted), "\n\n";
}
__END__
$VAR1 = {
'table' => [
{
'name' => 'key1',
'attr' => 'val1'
},
{
'name' => 'key3',
'attr' => 'val3'
},
{
'name' => 'key5',
'attr' => 'val5'
},
{
'name' => 'key7',
'attr' => 'val7'
},
{
'name' => 'key9',
'attr' => 'val9'
},
{
'name' => 'key11',
'attr' => 'val11'
}
]
};
$VAR1 = {
'divp' => [
{
'link' => 'http://example.com/',
'image' => 'http://example.com/example.png'
},
{
'link' => 'http://example.com/01',
'image' => 'http://example.com/01/example.png'
}
]
};