#
use strict;
use warnings;
use WWW::Mechanize;
use HTML::TreeBuilder;
use HTML::Entities;
use Data::Dumper;
use XML::Writer;
use Encode;
use GD::Barcode::QRcode;
# Driver: crawl the fiction index page, follow every link that looks like a
# book page, and write one XML <book> record per book to STDOUT.
binmode STDOUT, ":utf8";
# Running maxima of the description / about-the-author text lengths. They are
# updated inside scrape_page() and reported on STDERR at the end of the run.
our $max_desc=0;
our $max_auth=0;
# Book-page URLs that already produced a record (prevents duplicate records
# when the index links to the same page more than once).
my %title_list=();
my $total_books=0;
# Index page to crawl, and the prefix each link found on it is appended to.
my $starturl ='http://localhost/~homedirectory/BosonBooks/www.bosonbooks.com/boson/fiction/fiction.html';
my $baseurl='http://localhost/~homedirectory/BosonBooks/www.bosonbooks.com/boson/fiction';
# NOTE(review): $QRbase is not referenced anywhere in the visible code.
my $QRbase='http://www.bosonbooks.com/';
# NOTE(review): the next two lines look truncated in this copy. They are
# presumably the remains of a `<<END` heredoc holding the DTD text, whose body
# is missing -- TODO restore from the original script. $DTD is only used by
# the commented-out print below.
my $DTD= <
END
#print $DTD; #write the DTD at the top
# XML output goes to STDOUT; open the document and its <booklist> root element.
my $writer = new XML::Writer( OUTPUT =>'STDOUT', ENCODING=>'utf-8');
$writer->xmlDecl( 'UTF-8' );
$writer->doctype( 'booklist' );
$writer->startTag('booklist');print "\n";
# Fetch the index page; give up if it cannot be retrieved.
my $mech= WWW::Mechanize->new();
$mech->get($starturl);
die $mech->response->status_line unless $mech->success;
#print $mech->title, "\n";
my $html=$mech->content;
my @links=$mech->find_all_links(); #get all the links in the page
# Element 0 of each link object is presumably its URL (WWW::Mechanize::Link)
# -- confirm against the installed version.
my @urls=map{$_->[0]} @links;
foreach my $url (@urls){ #walk through them
# Every link is treated as relative to $baseurl.
my $link= $baseurl .'/' .$url;
#print $link . "\n";
# Only follow links shaped like <baseurl>/NAME/NAME.html.
# NOTE(review): $baseurl is interpolated into the pattern unescaped, so its
# "." characters match any character.
if ($link =~ /^$baseurl\/(.*?)\/\1\.html$/){
my $page=WWW::Mechanize->new();
$page->get($link);
if ($page->success) {
# $1 becomes the directory part of the link (<baseurl>/NAME/); it is passed
# to scrape_page() as the base for the cover image URL.
$link=~ /(^$baseurl\/(.*?)\/)\2\.html$/;
my $imgbase=$1;
#print STDERR $imgbase ."\n";
my $pagehtml=$page->content;
if ($pagehtml =~ /isbn/){ #if the page contains "isbn" it generates an xml record
unless (defined ($title_list{$link})) {
# #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] (strip invalid xml chars)
#$pagehtml= decode_utf8($pagehtml);
# NOTE(review): the two substitutions below look damaged in this copy.
# Judging by the trailing comment they presumably rewrote HTML line-break
# markup into blank lines; the original patterns appear to have been stripped
# -- TODO restore from the original script before running.
$pagehtml =~ s%
%\n\n%go; #change
to double line breaks
$pagehtml =~ s%()%\\n\n%go;
$pagehtml =~ s/[^\x09\x0A\x0D\x20-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//go;
scrape_page($pagehtml, $link, $imgbase, $writer);
# Remember the page so it is not written a second time.
$title_list{$link}=1;
$total_books++;
#print STDERR "WRITING";
}
}
}
}
}
# Close the root element and finish the XML document.
$writer->endTag('booklist');
$writer->end();
# Run statistics go to STDERR so they do not mix with the XML on STDOUT.
print STDERR "max description " . $max_desc . "\n";
print STDERR "max authlength " . $max_auth . "\n";
print STDERR "total written " . $total_books."\n";
sub scrape_page {
    # Makes a single pass with HTML::TreeBuilder to pull the metadata out of
    # one book page and writes it as an XML <book> record.
    # Also generates a QR code for the book's public URL and downloads the
    # cover image; both file names are recorded in the XML.
    #
    # Arguments (positional):
    #   $html    - HTML of the book page
    #   $link    - URL the page was fetched from (local mirror URL)
    #   $baseurl - URL prefix that the cover image's src is appended to
    #   $writer  - XML::Writer object the record is written to
    #
    # The record is (each child is written only when it is found in the page):
    #   <book>
    #     <url>...</url>
    #     <title>...</title>
    #     <QRCode href="file://qrdata/...QR.png"/>
    #     <author>...</author>
    #     <price>...</price>
    #     <isbn>...</isbn>
    #     <coverfile href="file://coverart/..."/>
    #     <description>...</description>
    #     <aboutauth>...</aboutauth>
    #   </book>
    #
    # Side effects: updates the globals $max_desc and $max_auth, writes files
    # under qrdata/ and coverart/, and prints newlines to STDOUT between tags.
    my ($html, $link, $baseurl, $writer) = @_;

    my $tree = HTML::TreeBuilder->new;
    $tree->parse($html);
    $tree->eof;    # tell the parser the document is complete before querying it

    $writer->startTag('book'); print "\n";

    # Map the local mirror URL back to the public URL. Only use $1 when the
    # match succeeded; otherwise fall back to the link we were given.
    my $realurl = $link;
    if ($link =~ m%http://localhost/~homedirectory/BosonBooks/(.*)%) {
        $realurl = 'http://' . $1;
    }
    else {
        warn "unexpected link format, using it unchanged: $link";
    }
    $writer->dataElement("url", $realurl); print "\n";

    # title, plus a QR code named after it
    my $title_node = $tree->look_down(
        _tag  => 'div',
        class => 'title'
    );
    if ($title_node) {
        my $title = $title_node->as_text;
        $writer->dataElement("title", $title); print "\n";

        # Build the QR file name from the title. Spaces and apostrophes are
        # dropped as before; path separators and control characters are
        # dropped too so a title cannot point outside qrdata/.
        (my $safe_title = $title) =~ s/[ '\/\\\x00-\x1F]//g;
        my $filename = 'qrdata/' . $safe_title . "QR.png";

        # One QR code that cannot be generated must not abort the whole run.
        if (eval { qrgen($realurl, $filename); 1 }) {
            $writer->emptyTag("QRCode", 'href' => "file://" . $filename); print "\n";
        }
        else {
            warn "QR code generation failed for $realurl: $@";
        }
    }
    else {
        warn "no title! in $baseurl";
    }

    # author (a leading "by " is removed when present)
    my $author_node = $tree->look_down(
        _tag  => 'div',
        class => 'author'
    );
    if ($author_node) {
        my $author = $author_node->as_text;
        # Test the match itself: after a failed match $1 still holds the
        # capture of an earlier successful match (e.g. the URL above).
        if ($author =~ m/by (.*)/ && $1) {
            $author = $1;
        }
        $writer->dataElement("author", $author); print "\n";
    }
    else {
        warn "no author! in $baseurl";
    }

    # price
    my $price_node = $tree->look_down(
        _tag  => 'span',
        class => 'price'
    );
    if ($price_node) {
        $writer->dataElement("price", $price_node->as_text); print "\n";
    }

    # isbn
    my $isbn_node = $tree->look_down(
        _tag  => 'span',
        class => 'isbn'
    );
    if ($isbn_node) {
        $writer->dataElement("isbn", $isbn_node->as_text); print "\n";
    }

    # cover art: download the image and record where it was saved
    my $cover_node = $tree->look_down(
        _tag  => 'img',
        class => 'bookcover'
    );
    if ($cover_node) {
        my $coverfile = $cover_node->attr('src');
        if (defined $coverfile && length $coverfile) {
            my $imageobj = WWW::Mechanize->new();
            # get() may die (autocheck) or just report failure; handle both,
            # and never save an error page as if it were the image.
            my $saved = eval {
                $imageobj->get($baseurl . $coverfile);
                $imageobj->success
                    or die $imageobj->response->status_line . "\n";
                $imageobj->save_content('coverart/' . $coverfile);
                1;
            };
            if ($saved) {
                $writer->emptyTag("coverfile", 'href' => "file://coverart/" . $coverfile); print "\n";
            }
            else {
                warn "could not fetch cover $baseurl$coverfile: $@";
            }
        }
        else {
            warn "cover image without src in $baseurl";
        }
    }

    # book description
    my $desc_node = $tree->look_down(
        _tag  => 'div',
        class => 'bookdescription'
    );
    if ($desc_node) {
        my $description = $desc_node->as_text;
        $max_desc = length($description) if length($description) > $max_desc;
        $writer->dataElement("description", $description); print "\n";
    }

    # about the author
    # might want to try to retain formatting (i.e. italic and bold tags)
    # need to remove authors website
    my $about_node = $tree->look_down(
        _tag => 'div',
        id   => 'aboutauthor'
    );
    if ($about_node) {
        my $aboutauth = $about_node->as_text;
        # Drop the "About the Author" heading when present. As with the
        # author, $1 is only trusted when this match succeeded.
        if ($aboutauth =~ /About the Author(.*)/ && $1) {
            $aboutauth = $1;
        }
        $max_auth = length($aboutauth) if length($aboutauth) > $max_auth;
        $writer->dataElement("aboutauth", $aboutauth); print "\n";
    }

    $writer->endTag('book'); print "\n"; print "\n";
    $tree->delete;    # free the parse tree explicitly
}
sub qrgen {
    # Generates a QR code PNG for a URL and writes it to a file.
    #
    # Arguments (positional):
    #   $url      - text to encode in the QR code
    #   $filename - path of the PNG file to create (overwritten if present)
    #
    # Returns 1 on success. On any failure (encoding, open, write, close) it
    # warns and returns false, so one bad code does not abort the caller.
    my ($url, $filename) = @_;

    # Render the image first so a failed encoding does not leave an empty
    # file behind. The constructor can fail (presumably e.g. when the text
    # does not fit the fixed Version 4 symbol -- confirm), so check its
    # result before calling methods on it.
    my $png = eval {
        my $code = GD::Barcode::QRcode->new($url,
            { Ecc => 'L', Version => 4, ModuleSize => 4 }
        ) or die "barcode object could not be created\n";
        $code->plot->png;
    };
    unless (defined $png) {
        warn "qrgen: cannot encode '$url': " . ($@ || "unknown error\n");
        return;
    }

    my $fh;
    unless (open $fh, '>', $filename) {
        warn "qrgen: cannot open '$filename' for writing: $!";
        return;
    }
    binmode $fh;    # PNG is binary data; no newline or encoding translation

    my $written = print {$fh} $png;
    warn "qrgen: write to '$filename' failed: $!" unless $written;

    # Buffered write errors surface at close, so check it as well.
    unless (close $fh) {
        warn "qrgen: closing '$filename' failed: $!";
        return;
    }
    return $written ? 1 : ();
}