use strict;
use warnings;
use WWW::Mechanize;
use URI;
use HTML::TokeParser;
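# WWW::Mechanize fetches pages and collects their links, URI normalizes the
# address the user types in, and HTML::TokeParser walks the HTML token
# stream so the text can be separated from the markup.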
print "WEB CRAWLER AND HTML EXTRACTOR \n";
print "Please input the URL of the site to be searched \n";
print "Please use a full URL (eg. http://www.dcs.shef.ac.uk/) \n";
#Create an instance of the webcrawler
my $webcrawler = WWW::Mechanize->new();
chomp(my $url_name = <STDIN>); # The user inputs the URL to be searched
my $uri = URI->new($url_name); # Process the URL and make it a URI
#Grab the contents of the URL given by the user
$webcrawler->get($uri);
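# Note: recent versions of WWW::Mechanize enable autocheck by default, so
# get() dies with a useful message if the page cannot be fetched.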
# Use the HTML::TokeParser module to strip the markup and keep only the text
my @stripped_html;
my $x = 0;
my $content = $webcrawler->content;
my $parser = HTML::TokeParser->new(\$content);
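# get_tag() advances the parser to the next tag; get_trimmed_text() then
# returns the text between that tag and the following one, with the
# surrounding whitespace trimmed.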
$stripped_html[0] = ""; # Start empty so .= does not warn under 'use warnings'
while ($parser->get_tag) {
    $stripped_html[0] .= $parser->get_trimmed_text() . "\n";
}
$stripped_html[0] = join(" ", split " ", $stripped_html[0]); # Collapse runs of whitespace into a single space
print $stripped_html[0]."\n";
# links() returns WWW::Mechanize::Link objects; convert them to absolute URL
# strings so they can be compared against the visited list later
my @website_links = map { $_->url_abs->as_string } $webcrawler->links;
$x = $x + 1;
# Record the initial URL; every candidate link is checked against this list
# so that no page is visited twice
my @visited_urls = ($uri->as_string);
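# Design note: a %visited hash would make the membership test O(1) instead
# of a linear grep; the array is kept here only to stay close to the
# original structure.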
while (@website_links) { # While the queue still has URLs, fetch each one, collect its links, and strip its HTML
    if (grep { $_ eq $website_links[0] } @visited_urls) { # If the URL has been visited, don't visit it again
        shift @website_links; # Remove the URL currently being processed from the queue
    }
    else {
        # The URL hasn't been visited: find the links in its content and add
        # them to the queue, extract its text into a string, and then remove
        # the URL from the list of URLs to visit
        # Fetch the current URL and save the links it contains for later processing
        my $current_uri = URI->new($website_links[0]);
        $webcrawler->get($current_uri);
        my @links = map { $_->url_abs->as_string } $webcrawler->links; # links() takes no arguments; it returns the links of the page just fetched
        push @website_links, @links; # Add the newly discovered URLs to the queue
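        # If the crawl should survive dead links, the get() above could be
        # wrapped in eval {}; as written, one failed request stops the program.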
        # Strip the HTML from the content and keep only the text, exactly as was done for the first page above
        $content = $webcrawler->content;
        $parser = HTML::TokeParser->new(\$content);
        $stripped_html[$x] = "";
        while ($parser->get_tag) {
            $stripped_html[$x] .= $parser->get_trimmed_text() . "\n";
        }
        $stripped_html[$x] = join(" ", split " ", $stripped_html[$x]); # Collapse runs of whitespace into a single space
        push @visited_urls, $current_uri->as_string; # Add the link to the list of already visited URLs
        print $stripped_html[$x] . "\n"; # Print before incrementing, otherwise the index points at an empty slot
        $x = $x + 1;
        shift @website_links; # Remove the URL that has just been processed; the next one in the queue moves to the front
        sleep(10); # Pause between requests so the crawl is polite to the server
    }
}