use strict;
use HTML::TokeParser::Simple;
use warnings;

# Scrape quotationspage.com: for every letter A..Z, read that letter's
# author index page, follow each author link found inside a
# <div class="authorrow">, and collect the text of every <dt class="quote">
# on the author's page. Each quote is echoed to STDOUT and appended to
# $savePath as "QUOTE|| AUTHOR".

my @letters  = ( 'A' .. 'Z' );
my $savePath = "C:/temp/quotes.txt";
my $siteRoot = "http://www.quotationspage.com";

# Three-argument open on a lexical handle; fail loudly, because without the
# output file the whole run would be wasted.
open( my $out, '>>', $savePath )
    or die "Cannot open '$savePath' for appending: $!";

# Build a token parser for $url. Returns the parser, or nothing (after a
# warning) when the page cannot be fetched, so one bad page does not abort
# the whole crawl. Both failure styles (exception or false return) are
# handled because the constructor's behavior on fetch failure is not visible
# from this file.
sub _fetch_parser {
    my ($url) = @_;
    my $parser = eval { HTML::TokeParser::Simple->new( url => $url ) };
    return $parser if $parser;
    my $reason = $@ || "no parser returned\n";
    warn "Skipping $url: $reason";
    return;
}

# True when $token is an opening <$tag> whose class attribute is exactly
# $class. Only start tags are considered: end tags carry no attributes, and
# a missing class attribute must not be compared while undefined.
sub _is_start_with_class {
    my ( $token, $tag, $class ) = @_;
    return 0 unless $token->is_start_tag($tag);
    my $got = $token->get_attr('class');
    return ( defined $got && $got eq $class ) ? 1 : 0;
}

LETTER:
foreach my $letter (@letters) {
    my $baseUrl       = "$siteRoot/quotes/$letter.html";
    my $parent_parser = _fetch_parser($baseUrl) or next LETTER;
    my $in_author_row = 0;

  INDEX_TOKEN:
    while ( my $parent_token = $parent_parser->get_token ) {
        if ( _is_start_with_class( $parent_token, 'div', 'authorrow' ) ) {
            $in_author_row = 1;
            next INDEX_TOKEN;
        }
        if ( $parent_token->is_end_tag('div') ) {
            $in_author_row = 0;
            next INDEX_TOKEN;
        }

        # Only opening <a> tags carry an href. Matching the closing </a> as
        # well would request the bare site root once per author link.
        next INDEX_TOKEN
            unless $in_author_row && $parent_token->is_start_tag('a');

        my $href = $parent_token->get_attr('href');
        next INDEX_TOKEN unless defined $href;

        # Capture in list context so a failed match yields undef instead of
        # silently reusing $1 from the previous author.
        my ($author) = $href =~ m{/quotes/(.*?)/};
        if ( !defined $author ) {
            warn "Skipping link with no author name in href: $href\n";
            next INDEX_TOKEN;
        }
        $author =~ tr/_/ /;

        my $child_parser = _fetch_parser( $siteRoot . $href )
            or next INDEX_TOKEN;
        my $in_quote = 0;
        my $quote    = '';

      QUOTE_TOKEN:
        while ( my $child_token = $child_parser->get_token ) {
            if ( _is_start_with_class( $child_token, 'dt', 'quote' ) ) {
                $in_quote = 1;
                $quote    = '';
                next QUOTE_TOKEN;
            }

            # Ignore everything outside a quote element, including the
            # </dt> of unrelated <dt> elements, which would otherwise emit
            # empty records.
            next QUOTE_TOKEN unless $in_quote;

            if ( $child_token->is_text ) {
                $quote .= $child_token->as_is;
            }
            elsif ( $child_token->is_end_tag('dt') ) {
                $in_quote = 0;
                print "$quote|| $author\n\n";
                print {$out} "$quote|| $author\n";
                $quote = '';
            }
        }
    }
}

# Buffered write errors only surface at close, so check it.
close($out) or die "Cannot close '$savePath': $!";