| Category: | HTML Utility |
| Author/Contact Info | merlyn |
| Description: | Using the XSH language, screen-scrape O'Reilly's "Animals" page and generate a new XML file that lists the cover animals alphabetically, each followed by the books whose covers use that animal.
From a forthcoming Linux Magazine column of mine. The output looks like:
|
#!/usr/bin/perl
# Scrape O'Reilly's "Animals" HTML page with an embedded XSH script and build
# a new XML document that groups book subjects under each cover animal.
#
# Everything between <<'END_XSH' and END_XSH is XSH source handed to xsh().
# The heredoc delimiter is single-quoted, so Perl itself does not interpolate
# $cover, $subject, or $__ -- they reach the XSH interpreter verbatim.
#
# Stages of the XSH script, in order:
#   1. Setup: enable recovering mode, silence tracing, and open the remote
#      page as HTML under the document id "animals".
#   2. Harvest: the outer foreach runs over the Perl list 1..2, and $__ is
#      used as a cell index inside the XPath. For each index it selects
#      tables that contain no nested table and whose first-row cell at that
#      index contains "Book Title", then visits every row after the first.
#      In each row the last cell becomes $cover and the second-to-last cell
#      becomes $subject; the eval block pushes $subject onto the Perl
#      hash-of-arrays %cover under the key $cover.
#      NOTE(review): presumably two indices are tried because the "Book Title"
#      header can sit in either the first or the second column -- confirm
#      against the live page layout.
#      NOTE(review): $__ appears to be the current item of the enclosing
#      Perl-expression foreach -- confirm against the XSH documentation.
#   3. Build: "create t1 root" presumably starts a new document with id t1
#      and a <root> element (TODO confirm). For each key of %cover in sorted
#      order, a <cover> element is appended to /root and made the current
#      node; it receives an <animal> child holding the key, followed by one
#      <book> child per sorted subject stored under that key.
#   4. Output: "ls /" is presumably what prints the finished document; the
#      preceding "quiet" suppresses its trailing message, per the original
#      inline comment.
use XML::XSH;
xsh <<'END_XSH';
recovering 1; # for broken entity recovery (a frequent HTML problem)
quiet; # avoid tracing of open
open HTML animals = "http://www.oreilly.com/animals.html";
foreach {1..2} {
foreach //table[not(.//table)
and contains(tr[1]/td[$__], "Book Title")
]/tr[position() > 1] {
# pwd;
$cover = string(td[last()]);
$subject = string(td[last() - 1]);
eval { push @{$cover{$cover}}, $subject; }
}
}
create t1 root;
foreach {sort keys %cover} {
## print "animal $__";
insert element cover into /root;
cd /root/cover[last()];
insert element animal into .;
insert text $__ into animal;
foreach {sort @{$cover{$__}}} {
## print " book $__";
insert element book into .;
insert text $__ into book[last()];
}
}
quiet; # avoid final message from ls
ls /;
END_XSH
|
|
|
|---|