use strict;
use warnings;
use WWW::Mechanize;
use URI;
use HTML::TokeParser;

print "WEB CRAWLER AND HTML EXTRACTOR\n";
print "Please input the URL of the site to be searched\n";
print "Please use a full URL (e.g. http://www.dcs.shef.ac.uk/)\n";

# Create an instance of the web crawler
my $webcrawler = WWW::Mechanize->new();

my $url_name = <STDIN>;           # The user inputs the URL to be searched
chomp $url_name;
my $uri = URI->new($url_name);    # Process the URL and make it a URI

# Grab the contents of the URL given by the user
$webcrawler->get($uri);

# Use the HTML::TokeParser module to strip the tags from the page and keep only the text
my @stripped_html;
my $x = 0;
my $content = $webcrawler->content;
my $parser  = HTML::TokeParser->new(\$content);
$stripped_html[$x] = "";          # start with an empty string so split never sees undef
while ($parser->get_tag) {
    $stripped_html[$x] .= $parser->get_trimmed_text() . "\n";
}
# If we have more than one whitespace character in a row, leave only one
$stripped_html[$x] = join(" ", split " ", $stripped_html[$x]);
print $stripped_html[$x], "\n";
$x = $x + 1;

# Put the links that exist in the HTML of the URL given by the user in a queue.
# links() returns WWW::Mechanize::Link objects, so convert each one to an
# absolute URL string before storing it.
my @website_links = map { $_->url_abs->as_string } $webcrawler->links;

# The initial URL seeds the list of visited URLs, which is checked before
# every fetch so that no page is processed twice
my @visited_urls = ($uri->as_string);

while (@website_links) {    # While the queue still has URLs, check their content for links and strip the HTML
    my $next_url = shift @website_links;    # Remove the URL currently being processed from the queue

    # If the URL has been visited, don't visit it again
    next if grep { $_ eq $next_url } @visited_urls;

    # If the URL hasn't been visited, fetch it, add the links found in its
    # content to the queue for later processing, and extract its text
    my $new_uri = URI->new($next_url);
    $webcrawler->get($new_uri);
    push @website_links, map { $_->url_abs->as_string } $webcrawler->links;

    # Strip the HTML from the contents and leave only the text, in the same
    # way as was done for the initial URL above
    $content = $webcrawler->content;
    $parser  = HTML::TokeParser->new(\$content);
    $stripped_html[$x] = "";
    while ($parser->get_tag) {
        $stripped_html[$x] .= $parser->get_trimmed_text() . "\n";
    }
    $stripped_html[$x] = join(" ", split " ", $stripped_html[$x]);
    print $stripped_html[$x], "\n";

    push @visited_urls, $new_uri->as_string;    # Add the URL to the list of already visited URLs
    $x = $x + 1;
    sleep(10);    # Pause between requests so the server isn't hammered
}

####

Use of uninitialized value in split at NewWebC.pl line 83, <STDIN> line 1.
Use of uninitialized value in print at NewWebC.pl line 88, <STDIN> line 1.
Use of uninitialized value in string eq at NewWebC.pl line 63, <STDIN> line 1.
Use of uninitialized value in split at NewWebC.pl line 83, <STDIN> line 1.
Use of uninitialized value in print at NewWebC.pl line 88, <STDIN> line 1.

The warnings trace back to three bugs in the original version of the loop: printing $stripped_html[$x] after $x had already been incremented, so the element printed did not exist yet (line 88); pushing $new_uri[0] onto @visited_urls after shift(@new_uri) had already emptied that array, which let undef into the visited list and tripped the string eq comparison (line 63); and a page that yielded no text leaving $stripped_html[$x] undefined when the split ran (line 83).
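Since the visited list only ever grows, checking it with grep makes every duplicate test a linear scan, and keeping the queue, the counter, and the visited list in sync by hand is exactly what let the undef values slip in. Below is a minimal sketch of an alternative, assuming the same modules; $start is a placeholder for whatever URL the user enters, and visited URLs are kept as hash keys so each lookup is a single hash test:

    use strict;
    use warnings;
    use WWW::Mechanize;
    use HTML::TokeParser;

    my $start = 'http://www.dcs.shef.ac.uk/';    # placeholder start URL

    my $mech = WWW::Mechanize->new( autocheck => 0 );    # don't die on a failed fetch
    my %visited;                                         # visited URLs as hash keys
    my @queue = ($start);

    while (@queue) {
        my $url = shift @queue;
        next if $visited{$url}++;    # skip already-seen URLs and mark this one, in one step

        $mech->get($url);
        next unless $mech->success;    # skip pages that failed to load

        # Queue every link on the page as an absolute URL string
        push @queue, map { $_->url_abs->as_string } $mech->links;

        # Strip the tags and print the remaining text, whitespace collapsed
        my $content = $mech->content;
        my $parser  = HTML::TokeParser->new( \$content );
        my $text    = '';
        while ( $parser->get_tag ) {
            $text .= $parser->get_trimmed_text() . ' ';
        }
        print join( ' ', split ' ', $text ), "\n";

        sleep 10;    # pause between requests
    }

Turning autocheck off makes get() return normally on a failed fetch instead of dying, so the loop can simply skip that page. Note the sketch still compares raw strings, so http://example.com and http://example.com/ count as different pages.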
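One further point: the prompt asks for "the site to be searched", but links() happily returns off-site links, so the crawl above will wander to any host it finds. If it should stay on the starting host, one possible refinement is a filter built on URI's host method; on_start_host and $start here are illustrative names, not part of the original script:

    use strict;
    use warnings;
    use URI;

    my $start      = 'http://www.dcs.shef.ac.uk/';    # placeholder start URL
    my $start_host = URI->new($start)->host;

    # True if $url points at the same host as the start URL
    sub on_start_host {
        my ($url) = @_;
        my $host = eval { URI->new($url)->host };    # schemes like mailto: have no host method
        return defined $host && $host eq $start_host;
    }

    # Inside the crawl loop, filter links before queueing them:
    #   push @queue, grep { on_start_host($_) }
    #                map  { $_->url_abs->as_string } $mech->links;

The eval guard matters because URI objects for schemes without a host (mailto:, javascript:) die when host is called on them.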