#!/usr/bin/perl -w
# Global state for the crawler below. All hashes are used as sets (url => 1).

# Intended path prefix for the saved HTML pages.
my $output_folder_html = "/Users/*********html.rtf";
# URLs to crawl in the next pass, seeded with the start page.
my %follow_urls = ("https://www.dogfoodadvisor.com/dog-food-reviews/brand" => 1);
# Number of entries in %follow_urls; the crawl loop runs while this is positive.
my $no_of_follow_urls = 1;
# Pages selected for download once crawling has finished.
my %all_take_urls;
# URLs that have already been crawled, so no page is fetched twice.
my %already_followed_urls;
# Prefix prepended to the relative links extracted from crawled pages.
my $basis_url = "https://www.dogfoodadvisor.com/dog-food-reviews/";
# Number of crawl passes started so far; only used for progress output.
my $iteration_counter = 0;
# Crawl in passes: each pass fetches every URL queued in %follow_urls, records
# the links of interest found in it, and queues the follow-links that have not
# been crawled yet. The loop ends when a pass queues nothing new.
while ($no_of_follow_urls > 0) {
    # Progress output.
    $iteration_counter++;
    my $no_of_take_urls = keys %all_take_urls;
    print "------------------------------------------------------\nIteration $iteration_counter: $no_of_take_urls take-urls found so far!\n";

    # Fetch every queued page and scan it for links.
    my %new_follow_urls;
    foreach my $follow_url (keys %follow_urls) {
        print "\nAnalyzing $follow_url ...\n";
        $already_followed_urls{$follow_url} = 1;

        my $html = qx(curl "$follow_url");
        # qx yields undef when the command cannot be run; $? holds curl's
        # wait status. Skip the page instead of parsing undef/partial data.
        if (!defined $html || $? != 0) {
            warn "curl failed for $follow_url (status $?); skipping\n";
            next;
        }

        # Splitting on 'a href=' puts each link target at the start of a
        # chunk; the very first chunk is the text before the first link and
        # simply matches none of the patterns.
        foreach my $chunk (split /a href=/, $html) {
            # An optional quote is accepted so that both href=foo and
            # href="foo" are recognised. Dots are escaped to match literally.
            if ($chunk =~ m/\A["']?quedisplay\.html\?aTYPE=([0-9]+?)&aPAGE=([0-9]+)/) {
                # Listing page with an explicit page number.
                my $found_url = $basis_url . "quedisplay.html?aTYPE=" . $1 . "&aPAGE=" . $2;
                $new_follow_urls{$found_url} = 1;
            }
            elsif ($chunk =~ m/\A["']?quedisplay\.html\?aTYPE=([0-9]+)/) {
                # Listing page without a page number.
                my $found_url = $basis_url . "quedisplay.html?aTYPE=" . $1;
                $new_follow_urls{$found_url} = 1;
            }
            elsif ($chunk =~ m/\A["']?quereadisplay\.html\?0\+([0-9]+)/) {
                # Page to be downloaded later.
                my $take_url = $basis_url . "quereadisplay.html?0+" . $1;
                $all_take_urls{$take_url} = 1;
            }
        }
    }

    # Queue only the follow-links that have not been crawled yet.
    %follow_urls = ();
    print "\nnew follow links:\n";
    foreach my $candidate (keys %new_follow_urls) {
        next if exists $already_followed_urls{$candidate};
        $follow_urls{$candidate} = 1;
        print "\t$candidate\n";
    }

    # An empty queue terminates the crawl.
    $no_of_follow_urls = keys %follow_urls;
}
# Download every collected take-URL and save it as an HTML file.
# Failures (curl error, unwritable file) are reported with a warning and the
# loop continues with the next URL.
foreach my $take_url (keys %all_take_urls) {
    my $html = qx(curl "$take_url");
    # qx yields undef when the command cannot be run; $? holds curl's wait
    # status. Do not write an empty or partial file in that case.
    if (!defined $html || $? != 0) {
        warn "curl failed for $take_url (status $?); skipping\n";
        next;
    }

    # A URL contains characters such as '/', ':', '?' and '+', so it cannot
    # serve as a file name directly: keep only the part after the last '/'
    # and replace everything outside [A-Za-z0-9._-] with '_'.
    (my $file_stem = $take_url) =~ s{\A.*/}{}s;
    $file_stem =~ s/[^A-Za-z0-9._-]/_/g;

    # NOTE(review): the prefix is concatenated as-is, so it must end with a
    # path separator if it is meant to name a directory - confirm its value.
    my $output_file = $output_folder_html . $file_stem . ".html";

    # Three-argument open with a lexical handle; report errors instead of
    # silently losing the page.
    my $out;
    unless (open $out, '>', $output_file) {
        warn "cannot open $output_file for writing: $!\n";
        next;
    }
    print {$out} $html;
    # Buffered write errors only surface at close.
    close $out or warn "error closing $output_file: $!\n";
}