#!/usr/local/bin/perl # This program uses the LWP module to read a web page # and store its URL, Title, Description and Keywords in # a database. It then follows the links on the page and # processes any other pages it finds. The following # switches are used to control how the program operates: # /u specify the url to start at # /c continue from the last saved restart point # /f follow links on page and process pages found # /r write next page to process to a restart file # /d update the database # /l produce logfile of crawl # either /u or /c must be specified. # # This is my first attempt at a reasonable program in Perl # so please feel free to criticise to your hearts content # (its the only way I will learn). use HTML::TreeBuilder; use LWP::RobotUA; use URI::URL; use DBI; # Setup user agent for robot $ua = LWP::RobotUA->new('ChilliBot/0.1.' . LWP::Version,'ChilliBot@nextmilltech.co.uk'); $ua->delay(1); print $ua->agent; print "\n"; # check if any arguements entered if(scalar(@ARGV) > 0) { my $startpage=""; local @visitedpages; local $logtofile = 0; local $drilldown = 0; local $usedatabase = 0; local $setforrestart =0; local $continue=0; local $logfilename; $|=1; # set autoflush on for logfile # check parameters and set appropriate variables. for(my $i=0;$i$logfilename") || die "Cannot open logfile $logfilename\n"; close(LOGFILE); } # we do not want to visit pages already visited so keep an array # of their URLs @visitedpages=(); if($usedatabase==1) { # if we are using the database then add all URLs from it # to the list of visited pages. print "Building visited pages list..."; my $DSN = "DSN=PageData"; my $dbh = DBI->connect("dbi:ODBC:$DSN") || die "$DBI::errstr\n"; my $sql_handle=$dbh->prepare("SELECT PageURL FROM Page") || die $dbh->errstr; $sql_handle->execute() || die $dbh->errstr; while ( @row = $sql_handle->fetchrow_array() ) { push(@visitedpages,$row[0]); } $dbh->disconnect(); print "done\n"; } if($continue==1) { # if we are continuing then find which page to continue from open(CONTINUE,"restartwith.txt") || die "Cannot open restart file\n"; my @continueeurl=; foreach (@continueeurl) { $startpage=$_; } close(CONTINUE); } if($startpage ne "") { &gethtml($startpage); } } else { # No parameters entered so printout the usage information print "Usage:\n"; print " perl robot.pl [/u start_url] [/f] [/d] [/r] [/c] [/s] [/l logfile]\n"; print " where /u - url to start crawl from\n"; print " /f - follow links on each page\n"; print " /d - add page details to database\n"; print " /r - save last accessed url for restart with /c\n"; print " /c - continue from last restart-saved url\n"; print " /l - output to logfile\n\n"; print " note: either /u or /c must be specified\n\n"; } print ("Run Complete\n"); # main routine sub gethtml { my $html; my $treeparse; my $rlink; my @linkarray; my $baseurl; my $pagealreadyvisited; my $pagetoprocess; my $rlinkarray; local $pagetitle =""; local $pagedescription = ""; local $pagekeywords=""; local $pagebaseurl=""; $pagetoprocess = $_[0]; if($setforrestart==1) { # write URL to restart file. open(RESTARTFILE,">restartwith.txt") || die "Cannot open restart file\n"; print RESTARTFILE $pagetoprocess; close(RESTARTFILE); } # check we have not already visited this page $pagealreadyvisited=0; foreach (@visitedpages) { if($_ eq $pagetoprocess) { $pagealreadyvisited=1; } } if ($pagealreadyvisited == 0) { print "Processing: $_[0]..."; # request the page $request = HTTP::Request->new('GET', $_[0]); $response = $ua->request($request); if ($response->is_success) { if($logtofile==1) { open(LOGFILE,">>$logfilename") || die "Cannot open logfile $logfilename\n"; print LOGFILE "Processing: $_[0]...Response OK\n"; close(LOGFILE); } # parse retrieved HTML @linkarray=(); $html=$response->content; $treeparse=HTML::TreeBuilder->new; $treeparse->parse($html); # extract anchor links $rlinkarray=$treeparse->extract_links("a"); # call treewalker function to check meta tags $treeparse->traverse(\&treewalker); $treeparse->delete(); $pagebaseurl=$response->base; if($logtofile==1) { open(LOGFILE,">>$logfilename") || die "Cannot open logfile $logfilename\n"; print LOGFILE " Title: $pagetitle\n"; print LOGFILE " Description: $pagedescription\n"; print LOGFILE " Keywords: $pagekeywords\n"; print LOGFILE " Base URL: $pagebaseurl\n"; close(LOGFILE); } if($usedatabase==1) { # write page details to database # DBI::ODBC falls over with any string # longer than 255 if(length($pagetitle)>255) { $pagetitle=substr($pagetitle,0,255); } if(length($pagedescription)>255) { $pagedescription=substr($pagedescription,0,255); } if(length($pagekeywords)>255) { $pagekeywords=substr($pagekeywords,0,255); } my $DSN = "DSN=PageData"; my $dbh = DBI->connect("dbi:ODBC:$DSN") || die "$DBI::errstr\n"; my $sql_handle=$dbh->prepare(q{ INSERT INTO Page (PageURL, Title, Description,Keywords) VALUES (?, ?, ?, ?) }) || die $dbh->errstr; $sql_handle->execute("$_[0]","$pagetitle","$pagedescription","$pagekeywords") || die $dbh->errstr; $dbh->disconnect(); } # add page to visited pages array push(@visitedpages,$_[0]); print "OK\n"; # convert links from a referenced array to # a normal array foreach $rlink(@$rlinkarray) { push(@linkarray,$$rlink[0]); } # create full URLs from links $baseurl=$response->base; @linkarray = map { $_= url($_, $baseurl)->abs; } @linkarray; foreach (@linkarray) { if($logtofile==1) { open(LOGFILE,">>$logfilename") || die "Cannot open logfile $logfilename\n"; print LOGFILE " $_\n"; close(LOGFILE); } } # put in seperate loop so that printout is correct foreach (@linkarray) { # if link is http and does not contain # any odd charcters then call this function # recursively passing in the link if (/http:/i) { if (/[#\@\$]/) { } else { if($drilldown == 1) { &gethtml($_); } } } } } else { print "Failed\n"; if($logtofile==1) { open(LOGFILE,">>$logfilename") || die "Cannot open logfile $logfilename\n"; print LOGFILE "Processing: $_[0]...Failed\n"; close(LOGFILE); } } } } # Used to find title tag, and description and keyword metatags. sub treewalker { my ($node, $start, $depth) = @_; if (ref $node) { my $tag = $node->tag; if ($tag eq "meta") { my $metaname=$node->attr("name"); if ($metaname ne "") { if ($metaname=~ /description/i) { my $description=$node->attr("content"); # remove CR and LF from description. $description =~ s/\n/ /sg; $description =~ s/\r/ /sg; $pagedescription = $description; } if ($metaname=~ /keywords/i) { my $keywords=$node->attr("content"); # remove CR and LF from description. $keywords =~ s/\n/ /sg; $keywords =~ s/\r/ /sg; $pagekeywords = $keywords; } } } if ($tag eq "title" && $pagetitle eq "") { my $contentnodearray=$node->content; foreach my $contentnode(@$contentnodearray) { if (not ref $contentnode) { $pagetitle=$contentnode; } } } } return 1; # This makes it recurse through all sub nodes }