use strict;
use warnings;

use WWW::Mechanize;
use URI;
use HTML::TokeParser;

print "WEB CRAWLER AND HTML EXTRACTOR\n";
print "Please input the URL of the site to be searched.\n";
print "Please use a full URL (e.g. http://www.dcs.shef.ac.uk/)\n";

# Create an instance of the web crawler
my $webcrawler = WWW::Mechanize->new();

# The user inputs the URL to be searched; process it and make it a URI
my $url_name = <STDIN>;
chomp $url_name;
my $uri = URI->new($url_name);

# Grab the contents of the URL given by the user
$webcrawler->get($uri);

# Use the HTML::TokeParser module to extract the text from the page
my @stripped_html;
my $x = 0;
my $content = $webcrawler->content;
my $parser  = HTML::TokeParser->new(\$content);
$stripped_html[$x] = "";
while ($parser->get_tag) {
    $stripped_html[$x] .= $parser->get_trimmed_text() . "\n";
}
# If we have more than one whitespace character in a row, leave only one
$stripped_html[$x] = join(" ", split " ", $stripped_html[$x]);
print $stripped_html[$x] . "\n";
$x = $x + 1;

# Put the links that exist in the HTML of the start page into a queue
my @website_links = $webcrawler->links;

# The initial URL is stored in an array that is checked against each new
# link to see whether a page has been visited before
my @visited_urls = ($uri);

# While the queue still has links, fetch each page, strip its HTML,
# and add any links it contains to the queue for later processing
while (@website_links) {
    my $link     = shift @website_links;  # next link to process
    my $link_url = $link->url_abs;        # its absolute URL, as a URI object

    # If the URL has been visited already, don't visit it again
    next if grep { $_ eq $link_url } @visited_urls;

    # Fetch the page; skip it if the request fails or the result isn't HTML
    eval { $webcrawler->get($link_url) };
    next if $@ or not $webcrawler->is_html;

    # The links found on this page are added to the queue
    push @website_links, $webcrawler->links;

    # Strip the HTML and keep only the text, as was done for the start page
    $content = $webcrawler->content;
    $parser  = HTML::TokeParser->new(\$content);
    $stripped_html[$x] = "";
    while ($parser->get_tag) {
        $stripped_html[$x] .= $parser->get_trimmed_text() . "\n";
    }
    $stripped_html[$x] = join(" ", split " ", $stripped_html[$x]);

    push @visited_urls, $link_url;    # add the URL to the list of visited URLs
    print $stripped_html[$x] . "\n";  # print before $x moves to the next slot
    $x = $x + 1;
    sleep(10);                        # wait between requests to be polite
}
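# ----------------------------------------------------------------------
# A note on the duplicate check above: grep scans all of @visited_urls
# for every candidate link, so each lookup gets slower as the crawl
# grows. The idiomatic Perl alternative is a hash used as a seen-set,
# which does the same job with constant-time lookups. A minimal sketch
# of the replacement (%visited is a hypothetical variable introduced
# here, not part of the original script):
#
#   my %visited = ($uri->as_string => 1);
#   ...
#   next if $visited{$link_url->as_string};   # already crawled this URL
#   ...
#   $visited{$link_url->as_string} = 1;       # mark this URL as visited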