#!/usr/bin/perl -w

# Simple crawler: starting from a seed page it follows index ("follow")
# links, collects the URLs of detail ("take") pages and finally downloads
# every take page as an HTML file. Pages are fetched with the external
# `curl` command.

# Location the downloaded take pages are meant to be written to.
# NOTE(review): the value looks like a placeholder -- set it to the real
# output directory before running.
my $output_folder_html = "/Users/*********html.rtf";

# Follow pages that still have to be crawled (URL => 1), seeded with the
# start page, and the number of entries in that hash. The crawl loop runs
# as long as this number is greater than zero.
my %follow_urls = ("https://www.dogfoodadvisor.com/dog-food-reviews/brand" => 1);
my $no_of_follow_urls = 1;

# All take pages found so far (URL => 1); downloaded after the crawl.
my %all_take_urls;
# Follow pages that have already been fetched (URL => 1), so that no page
# is crawled twice.
my %already_followed_urls;
# Prefix that is prepended to the relative links found in a page.
my $basis_url = "https://www.dogfoodadvisor.com/dog-food-reviews/";


# Crawl loop: every iteration downloads all pending follow pages, harvests
# the links they contain and queues the follow pages that have not been
# visited yet. The loop ends when an iteration finds no new follow page.
my $iteration_counter = 0;
while ($no_of_follow_urls > 0) {

	# creating output to show progress
	$iteration_counter++;
	my $no_of_take_urls = keys %all_take_urls;
	print "------------------------------------------------------\nIteration $iteration_counter: $no_of_take_urls take-urls found so far!\n";

	# downloading follow-urls
	my %new_follow_urls;
	foreach my $follow_url (keys %follow_urls) {
		print "\nAnalyzing $follow_url ...\n";
		$already_followed_urls{$follow_url} = 1;
		my $html = qx (curl "$follow_url");

		# qx yields undef when curl could not be started at all; treat
		# that like an empty page instead of splitting undef.
		unless (defined $html) {
			warn "Could not run curl for $follow_url\n";
			$html = '';
		}

		# check each hypertext link within page
		# Each chunk starts right after "a href=", so the link target is
		# at the very beginning, usually wrapped in a quote character.
		# The patterns therefore accept an optional leading quote, match
		# the dot in ".html" literally and accept "&" as well as its
		# HTML-escaped form "&amp;".
		my @chunks = split(/a href=/, $html);
		foreach my $link (@chunks) {
			if ($link =~ m/^["']?quedisplay\.html\?aTYPE=([0-9]+)&(?:amp;)?aPAGE=([0-9]+)/) {
				# index page with explicit page number
				my ($type, $page) = ($1, $2);
				$new_follow_urls{$basis_url . "quedisplay.html?aTYPE=" . $type . "&aPAGE=" . $page} = 1;
			}
			elsif ($link =~ m/^["']?quedisplay\.html\?aTYPE=([0-9]+)/) {
				# index page without page number
				my $type = $1;
				$new_follow_urls{$basis_url . "quedisplay.html?aTYPE=" . $type} = 1;
			}
			elsif ($link =~ m/^["']?quereadisplay\.html\?0\+([0-9]+)/) {
				# detail page that is downloaded after the crawl
				my $id = $1;
				$all_take_urls{$basis_url . "quereadisplay.html?0+" . $id} = 1;
			}
		}
	}

	# check, if new follow urls have been found
	# Only pages that were never fetched before are queued for the next
	# iteration; this is what guarantees termination.
	%follow_urls = ();
	print "\nnew follow links:\n";
	foreach my $candidate_url (keys %new_follow_urls) {
		next if exists $already_followed_urls{$candidate_url};
		$follow_urls{$candidate_url} = 1;
		print "\t$candidate_url\n";
	}

	# check number of new follow pages
	$no_of_follow_urls = keys %follow_urls;

}

# download all take-files as html
# Every collected take URL is fetched with curl and stored as one file
# below $output_folder_html. Failures are reported with a warning and the
# affected page is skipped, so one bad page does not abort the whole run.
use File::Spec;

my $counter = 0;    # number of take pages saved successfully
foreach my $take_url (keys %all_take_urls) {
	my $html = qx (curl "$take_url");
	my $curl_status = $?;
	if (!defined $html || $curl_status != 0) {
		warn "Download of $take_url failed (curl status " . $curl_status . "), skipping\n";
		next;
	}

	# A URL contains characters such as ':', '/' and '?' that cannot be
	# used in a file name: drop the common prefix and replace every run of
	# unsafe characters with an underscore.
	(my $file_name = $take_url) =~ s{^\Q$basis_url\E}{};
	$file_name =~ s{[^A-Za-z0-9._-]+}{_}g;

	# saves html to file
	my $output_file = File::Spec->catfile($output_folder_html, $file_name . ".html");
	my $out;
	unless (open($out, '>', $output_file)) {
		warn "Cannot open $output_file for writing: $!\n";
		next;
	}
	print {$out} $html;
	# Buffered write errors only surface when the handle is closed.
	unless (close($out)) {
		warn "Cannot finish writing $output_file: $!\n";
		next;
	}
	$counter++;
}
print "\nSaved $counter of " . scalar(keys %all_take_urls) . " take-files.\n";