meanwhile from the heart of the sourcecode:
# takes a macro of regular expressions:
# /foo(bar)=[\w]+(bar1)=[\w]+foo1(bar2)=[\w]+(foo2)/
# with as many foo(s) and bar(s) as you want, where foo(s) and bar(s) are
# any regular expression and [\w]+ means any sequence of word characters
# (a..z, A..Z, 0..9, _).
# these will be used as variable name (key in a hash in most cases)
# to store any found data matching the regexp in brackets before
# each =[\w]+ (in this case all bars).
# the key substrings get removed (incl. the '='), creating a valid regular
# expression. furthermore, any (foo) where foo is a defined key in
# %defined_regexps gets replaced by its value.
# brackets can be separated by '|' (pipe) as logical OR and they may be
# nested (should be recursive ;-).
# e.g. /(bar)((foo1)|(foo2))=word/ would associate either foo1 or foo2
# (whichever one matched) with the key 'word'.
--- begin code example
#!/usr/bin/perl
# Usage example for webHarvester: fetch a search result page with lynx and
# hand its lines, together with a list of extraction rules, to harvest().
use strict;
use warnings;

use Data::Dumper;
use webHarvester;

# Raw page source; in list context backticks yield one element per output line.
# NOTE(review): the host is spelled "ww.google.com" in the original example,
# which looks like a typo for "www.google.com" -- confirm before changing.
my @lines = `lynx -source "http://ww.google.com/search?q=anything"`;
die "lynx returned no output (wait status $?)\n" unless @lines;

# Extraction rules. In a 'regexp' entry, "(NAME)=key" stores whatever the
# bracketed expression matches under 'key'; (URL) and (TEXT) are presumably
# predefined entries of %defined_regexps -- TODO confirm in webHarvester.
my @vars = (
    {
        # One search hit: link target goes to 'url', link text to 'title'.
        # NOTE(review): the meaning of 'mapping' and 'map_finish' is not
        # visible here; see the webHarvester source.
        'regexp'     => '<p><a href=(URL)=url>(TEXT)=title</a>',
        'mapping'    => '%',
        'map_finish' => '@URLS'
    },
    {
        # Link to the next result page, stored under 'next_url'.
        # The original literal was split across two lines by a hard wrap,
        # which put a newline and a '+' inside the pattern; rejoined here.
        'regexp' => '<a href=(URL)=next_url><img src=/nav_next\.gif',
        'logic'  => 'FINISH',
    },
    {
        # Presumably drops the lines up to the closing </script> tag before
        # matching starts -- TODO confirm DO_cut_lines semantics.
        'TRIGGER' => 'BEFORE',
        'logic'   => 'DO_cut_lines(0,\'<\/script>\');',
    },
    {
        # Applies a substitution that strips <b> and </b> tags
        # (case-insensitive, all occurrences).
        'TRIGGER' => 'BEGIN',
        'logic'   => 'DO_subst(\'s/<\/?b>//ig\');',
    }
);

# NOTE(review): the original example called $worker->harvest() without ever
# creating $worker. This assumes webHarvester offers a conventional new()
# constructor -- confirm against the module.
my $worker = webHarvester->new();

my $results = $worker->harvest(\@lines, \@vars);
print Dumper($results);
--- end code example