#!/usr/bin/perl
# bosonbooks_scrape.pl
#
# Scraper that walks a local dump of the Boson Books website
# and extracts the information for each of the books:
#   gets the URL
#   generates a QR code of the URL
#   gets the title
#   gets the author
#   gets the price (if there)
#   gets the ISBN
#   gets the book cover location
#   gets the description
#   gets the "about the author" text
#
use strict;
use warnings;
use WWW::Mechanize;
use HTML::TreeBuilder;
use HTML::Entities;
use Data::Dumper;
use XML::Writer;
use Encode;
use GD::Barcode::QRcode;

binmode STDOUT, ":utf8";

# Longest description / about-the-author text seen so far. Updated by
# scrape_page() and reported on STDERR at the end of the run.
our $max_desc = 0;
our $max_auth = 0;

my %title_list  = ();    # page links that have already been written out
my $total_books = 0;     # number of <book> records written

# Index page to start from, and the directory the book pages hang off.
my $starturl = 'http://localhost/~homedirectory/BosonBooks/www.bosonbooks.com/boson/fiction/fiction.html';
my $baseurl  = 'http://localhost/~homedirectory/BosonBooks/www.bosonbooks.com/boson/fiction';
my $QRbase   = 'http://www.bosonbooks.com/';    # not referenced by the code below

# NOTE(review): the heredoc body (the DTD text) is missing from this copy of
# the file; only the delimiters survived. $DTD is not used by live code (its
# only use is the commented-out print below) -- restore the text from the
# original file if it is wanted.
my $DTD = <<END;
END
#print $DTD; #write the DTD at the top

my $writer = XML::Writer->new( OUTPUT => 'STDOUT', ENCODING => 'utf-8' );
$writer->xmlDecl('UTF-8');
$writer->doctype('booklist');
$writer->startTag('booklist');
print "\n";

my $mech = WWW::Mechanize->new();
$mech->get($starturl);
die $mech->response->status_line unless $mech->success;
#print $mech->title, "\n";

# Every link on the index page is a candidate book page.
my @urls = map { $_->url } $mech->find_all_links();

foreach my $url (@urls) {
    next unless defined $url;

    my $link = $baseurl . '/' . $url;
    #print $link . "\n";

    # Only follow links of the form <base>/<name>/<name>.html. $baseurl is
    # quoted so that its '.' and '~' are matched literally. The captured
    # prefix (<base>/<name>/) is where the page's cover image is fetched from.
    next unless $link =~ m{^(\Q$baseurl\E/(.*?)/)\2\.html$};
    my $imgbase = $1;
    #print STDERR $imgbase ."\n";

    # Skip pages that have already produced a record (checked before the
    # fetch so a repeated link is not downloaded twice).
    next if defined $title_list{$link};

    # autocheck is switched off so that a broken link is skipped by the
    # success test below instead of aborting the whole run.
    my $page = WWW::Mechanize->new( autocheck => 0 );
    $page->get($link);
    next unless $page->success;

    my $pagehtml = $page->content;

    # Only pages that mention "isbn" generate an XML record.
    next unless $pagehtml =~ /isbn/;

    # NOTE(review): the two substitutions below are reproduced exactly as
    # this copy of the file shows them: the first doubles every newline, the
    # second turns each pair of newlines into a literal backslash-n followed
    # by a newline. The original comment reads "change ... to double line
    # breaks" with the thing being changed missing, so the patterns most
    # likely matched HTML tags that were lost from this copy -- confirm
    # against the original before relying on them.
    $pagehtml =~ s%\n%\n\n%g;
    $pagehtml =~ s%(\n\n)%\\n\n%g;

    # Strip characters that are not valid in XML:
    # #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
    #$pagehtml= decode_utf8($pagehtml);
    $pagehtml =~ s/[^\x09\x0A\x0D\x20-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;

    scrape_page( $pagehtml, $link, $imgbase, $writer );
    $title_list{$link} = 1;
    $total_books++;
    #print STDERR "WRITING";
}

$writer->endTag('booklist');
$writer->end();

print STDERR "max description " . $max_desc . "\n";
print STDERR "max authlength " . $max_auth . "\n";
print STDERR "total written " . $total_books . "\n";

# scrape_page($html, $link, $baseurl, $writer)
#
# Makes a single pass over one book page with HTML::TreeBuilder, pulls out
# the metadata and writes it as one <book> record through $writer. Also
# generates a QR code image for the book's URL and downloads the cover image.
#
#   $html    - the page HTML
#   $link    - the (localhost) link the page was fetched from
#   $baseurl - URL prefix the cover image's src is appended to
#   $writer  - the XML::Writer object to write the record to
#
# The record is written in this order; every element except <url> is only
# written when the corresponding part of the page is found:
#
#   <book>
#     <url></url>
#     <title></title>
#     <QRCode href="file://qrdata/...QR.png"/>
#     <author></author>
#     <price></price>
#     <isbn></isbn>
#     <coverfile href="file://coverart/..."/>
#     <description></description>
#     <aboutauth></aboutauth>
#   </book>
#
# Side effects: raises $max_desc / $max_auth when a longer description /
# about-the-author text is seen, writes files under qrdata/ and coverart/.
sub scrape_page {
    my ( $html, $link, $baseurl, $writer ) = @_;

    #print $html . "\n";
    my $tree = HTML::TreeBuilder->new;
    $tree->parse($html);
    $tree->eof;    # tell the parser the document is complete

    $writer->startTag('book');
    print "\n";

    # Turn the localhost mirror link back into the public URL. The match is
    # tested so that a failed match can never pick up a stale $1; in that
    # case the link is used unchanged.
    my $realurl = $link;
    if ( $link =~ m%http://localhost/~homedirectory/BosonBooks/(.*)% ) {
        $realurl = 'http://' . $1;
    }
    else {
        warn "could not derive public url from $link";
    }
    $writer->dataElement( "url", $realurl );
    print "\n";

    # get title (and the QR code, whose file name is built from the title)
    my $title_div = $tree->look_down( _tag => 'div', class => 'title' );
    if ($title_div) {
        my $title = $title_div->as_text;
        $writer->dataElement( "title", $title );
        print "\n";

        # put the qrcodes in "qrdata/filename"; spaces and apostrophes are
        # dropped from the file name
        my $filename = 'qrdata/' . $title . "QR.png";
        $filename =~ s/[ ']//g;
        qrgen( $realurl, $filename );

        # store the QR code file name in a tag
        $writer->emptyTag( "QRCode", 'href' => "file://" . $filename );
        print "\n";
    }
    else {
        warn "no title! in $baseurl";
    }

    # get author; a leading "by " is dropped when present
    my $author_div = $tree->look_down( _tag => 'div', class => 'author' );
    if ($author_div) {
        my $author = $author_div->as_text;
        my ($byline) = $author =~ m/by (.*)/;
        $writer->dataElement( "author", $byline ? $byline : $author );
        print "\n";
    }
    else {
        warn "no author! in $baseurl";
    }

    # get price
    my $price_span = $tree->look_down( _tag => 'span', class => 'price' );
    if ($price_span) {
        $writer->dataElement( "price", $price_span->as_text );
        print "\n";
    }

    # get isbn
    my $isbn_span = $tree->look_down( _tag => 'span', class => 'isbn' );
    if ($isbn_span) {
        $writer->dataElement( "isbn", $isbn_span->as_text );
        print "\n";
    }

    # get filename for cover art, download the image into coverart/
    my $cover_img = $tree->look_down( _tag => 'img', class => 'bookcover' );
    if ($cover_img) {
        my $coverfile = $cover_img->attr('src');
        if ( defined $coverfile ) {

            # autocheck off: a missing image is reported, not fatal
            my $imageobj = WWW::Mechanize->new( autocheck => 0 );
            $imageobj->get( $baseurl . $coverfile );
            if ( $imageobj->success ) {
                $imageobj->save_content( 'coverart/' . $coverfile );
            }
            else {
                warn "could not fetch cover $baseurl$coverfile: "
                    . $imageobj->response->status_line;
            }
            $writer->emptyTag( "coverfile",
                'href' => "file://coverart/" . $coverfile );
            print "\n";
        }
    }

    # get the book description
    my $desc_div = $tree->look_down( _tag => 'div', class => 'bookdescription' );
    if ($desc_div) {
        my $description = $desc_div->as_text;
        $max_desc = length($description) if length($description) > $max_desc;
        $writer->dataElement( "description", $description );
        print "\n";
    }

    # get the about the author.
    # might want to try to retain formatting (i.e. italic and bold tags)
    # need to remove authors website
    my $about_div = $tree->look_down( _tag => 'div', id => 'aboutauthor' );
    if ($about_div) {
        my $aboutauth = $about_div->as_text;

        # Keep only what follows the "About the Author" heading when there
        # is one; otherwise keep the whole text.
        my ($after_heading) = $aboutauth =~ /About the Author(.*)/;
        $aboutauth = $after_heading if $after_heading;

        $max_auth = length($aboutauth) if length($aboutauth) > $max_auth;
        $writer->dataElement( "aboutauth", $aboutauth );
        print "\n";
    }

    $writer->endTag('book');
    print "\n";
    print "\n";

    $tree->delete;
    return;
}

# qrgen($url, $filename)
#
# Generates a QR code (Ecc 'L', Version 4, ModuleSize 4) for $url and writes
# it as a PNG to $filename. Failures are reported with warn and the code is
# skipped, so one bad title or URL does not stop the run. Returns true when
# the file was written.
sub qrgen {
    my ( $url, $filename ) = @_;

    # NOTE(review): GD::Barcode's documentation says new() returns undef on
    # error; confirm that this holds for QRcode when the URL does not fit.
    my $qrcode = GD::Barcode::QRcode->new( $url,
        { Ecc => 'L', Version => 4, ModuleSize => 4 } );
    unless ($qrcode) {
        warn "could not build QR code for $url";
        return;
    }

    my $fh;
    unless ( open $fh, '>', $filename ) {
        warn "could not open $filename: $!";
        return;
    }
    binmode $fh;    # PNG is binary data
    print {$fh} $qrcode->plot->png;
    unless ( close $fh ) {
        warn "could not write $filename: $!";
        return;
    }
    return 1;
}