use strict;
use warnings;
use HTML::TokeParser::Simple;

# Quote scraper.
#
# For every configured site and every letter A-Z this script:
#   1. fetches the index page "<index_prefix><letter>.html";
#   2. follows each <a href> found inside a <div class="authorrow">;
#   3. on the linked page, collects the text of every <dt class="quote">.
#
# Each quote is echoed to STDOUT as "<quote>|| <author>" followed by a blank
# line, and appended to the output file as "<quote>|| <author>".
#
# NOTE(review): the second site reuses the markup assumptions of the first
# (the "authorrow" div, the "quote" dt and the "/quotes/<name>/" href shape).
# Confirm those actually hold for that site before relying on its output.

# Build a parser for $url.
# Returns the parser, or nothing (after a warning) when the page could not be
# loaded, so that one bad page does not abort the whole crawl.
sub open_page {
    my ($url) = @_;

    # The constructor appears to die when the fetch fails; trap that, and
    # also cope with it simply returning a false value.
    my $parser = eval { HTML::TokeParser::Simple->new( url => $url ) };
    if ( !$parser ) {
        my $reason = $@ ? $@ : "no parser returned\n";
        warn "Skipping $url: $reason";
        return;
    }
    return $parser;
}

# True when $token is an opening <$tag> whose class attribute equals $class.
# Uses is_start_tag (is_tag would also accept the closing tag) and tolerates
# a missing class attribute.
sub is_start_of {
    my ( $token, $tag, $class ) = @_;

    return 0 unless $token->is_start_tag($tag);
    my $got = $token->get_attr('class');
    return ( defined $got && $got eq $class ) ? 1 : 0;
}

# Derive a display name from an author link.
# Takes the path segment after "/quotes/" when the href has that shape;
# otherwise falls back to the href itself so the record stays traceable
# instead of carrying an undefined or stale capture. Underscores become
# spaces in either case.
sub author_from_href {
    my ($href) = @_;

    my $author = $href;
    if ( $href =~ m{/quotes/(.*?)/} ) {
        $author = $1;
    }
    $author =~ s/_/ /g;
    return $author;
}

# Parse one author page and emit every quote found on it.
#   $author_url - page to fetch
#   $author     - name written next to each quote
#   $out        - filehandle the records are appended to
sub scrape_author_page {
    my ( $author_url, $author, $out ) = @_;

    my $parser = open_page($author_url) or return;

    my $in_quote = 0;
    my $quote    = '';
    while ( my $token = $parser->get_token ) {
        if ( is_start_of( $token, 'dt', 'quote' ) ) {
            $in_quote = 1;
            $quote    = '';
        }
        elsif ( $in_quote && $token->is_text ) {
            $quote .= $token->as_is;
        }
        elsif ( $in_quote && $token->is_end_tag('dt') ) {

            # Only a </dt> that closes a quote produces a record; other
            # <dt> elements on the page are ignored.
            $in_quote = 0;
            print "$quote|| $author\n\n";
            print {$out} "$quote|| $author\n";
            $quote = '';
        }
    }
    return;
}

# Parse one index page and visit every author link inside an "authorrow" div.
#   $index_url   - index page to fetch
#   $link_prefix - string prepended to each href to form the author URL
#   $out         - filehandle passed through to scrape_author_page
sub scrape_index_page {
    my ( $index_url, $link_prefix, $out ) = @_;

    my $parser = open_page($index_url) or return;

    my $in_author_row = 0;
    while ( my $token = $parser->get_token ) {
        if ( is_start_of( $token, 'div', 'authorrow' ) ) {
            $in_author_row = 1;
        }
        elsif ( $token->is_end_tag('div') ) {
            $in_author_row = 0;
        }
        elsif ( $in_author_row && $token->is_start_tag('a') ) {

            # Anchors without an href would otherwise resolve to the bare
            # prefix and trigger a pointless fetch.
            my $href = $token->get_attr('href');
            next unless defined $href && length $href;

            scrape_author_page( $link_prefix . $href,
                author_from_href($href), $out );
        }
    }
    return;
}

my @letters   = ( 'A' .. 'Z' );
my $save_path = "C:/temp/quotes.txt";

# Sites are crawled in this order. index_prefix is completed with
# "<letter>.html"; link_prefix is prepended to each author href.
my @sites = (
    {
        index_prefix => "http://www.quotationspage.com/quotes/",
        link_prefix  => "http://www.quotationspage.com",
    },
    {
        index_prefix => "http://en.wikipedia.org/wiki/",
        link_prefix  => "http://en.wikipedia.org/wiki",
    },
);

open( my $out, '>>', $save_path )
    or die "Cannot open $save_path for appending: $!";

foreach my $site (@sites) {
    foreach my $letter (@letters) {
        scrape_index_page( $site->{index_prefix} . "$letter.html",
            $site->{link_prefix}, $out );
    }
}

# Buffered write errors only surface here, so the close is checked too.
close($out) or die "Cannot close $save_path: $!";