use strict;
use warnings;
use WWW::Mechanize;
use URI;
use HTML::TokeParser;
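# WWW::Mechanize fetches pages and collects their links, URI normalizes the
# address the user types in, and HTML::TokeParser walks the HTML token
# stream so the text can be separated from the markup.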
print "WEB CRAWLER AND HTML EXTRACTOR \n";
print "Please input the URL of the site to be searched \n";
print "Please use a full URL (eg. http://www.dcs.shef.ac.uk/) \n";
#Create an instance of the webcrawler
my $webcrawler = WWW::Mechanize->new();
chomp(my $url_name = <STDIN>); # The user inputs the URL to be searched
my $uri = URI->new($url_name); # Process the URL and make it a URI
#Grab the contents of the URL given by the user
$webcrawler->get($uri);
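# Note: recent versions of WWW::Mechanize enable autocheck by default, so
# get() dies with a useful message if the page cannot be fetched.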
# Use the HTML::TokeParser module to strip the markup and keep only the text
my @stripped_html;
my $x = 0;
my $content = $webcrawler->content;
my $parser = HTML::TokeParser->new(\$content);
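# get_tag() advances the parser to the next tag; get_trimmed_text() then
# returns the text between that tag and the following one, with the
# surrounding whitespace trimmed.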
$stripped_html[0] = ""; # Start empty so .= does not warn under 'use warnings'
while ($parser->get_tag) {
    $stripped_html[0] .= $parser->get_trimmed_text() . "\n";
}
$stripped_html[0] = join(" ", split " ", $stripped_html[0]); # Collapse runs of whitespace into a single space
print $stripped_html[0]."\n";
# links() returns WWW::Mechanize::Link objects; convert them to absolute URL
# strings so they can be compared against the visited list later
my @website_links = map { $_->url_abs->as_string } $webcrawler->links;
$x = $x + 1;
# Record the initial URL; every candidate link is checked against this list
# so that no page is visited twice
my @visited_urls = ($uri->as_string);
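# Design note: a %visited hash would make the membership test O(1) instead
# of a linear grep; the array is kept here only to stay close to the
# original structure.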
while (@website_links) { # While the queue still has URLs, fetch each one, collect its links, and strip its HTML
    if (grep { $_ eq $website_links[0] } @visited_urls) { # If the URL has been visited, don't visit it again
        shift @website_links; # Remove the URL currently being processed from the queue
    }
    else {
        # The URL hasn't been visited: find the links in its content and add
        # them to the queue, extract its text into a string, and then remove
        # the URL from the list of URLs to visit
        # Fetch the current URL and save the links it contains for later processing
        my $current_uri = URI->new($website_links[0]);
        $webcrawler->get($current_uri);
        my @links = map { $_->url_abs->as_string } $webcrawler->links; # links() takes no arguments; it returns the links of the page just fetched
        push @website_links, @links; # Add the newly discovered URLs to the queue
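        # If the crawl should survive dead links, the get() above could be
        # wrapped in eval {}; as written, one failed request stops the program.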
        # Strip the HTML from the content and keep only the text, exactly as was done for the first page above
        $content = $webcrawler->content;
        $parser = HTML::TokeParser->new(\$content);
        $stripped_html[$x] = "";
        while ($parser->get_tag) {
            $stripped_html[$x] .= $parser->get_trimmed_text() . "\n";
        }
        $stripped_html[$x] = join(" ", split " ", $stripped_html[$x]); # Collapse runs of whitespace into a single space
        push @visited_urls, $current_uri->as_string; # Add the link to the list of already visited URLs
        print $stripped_html[$x] . "\n"; # Print before incrementing, otherwise the index points at an empty slot
        $x = $x + 1;
        shift @website_links; # Remove the URL that has just been processed; the next one in the queue moves to the front
        sleep(10); # Pause between requests so the crawl is polite to the server
    }
}