nuts has asked for the wisdom of the Perl Monks concerning the following question:
#!/usr/bin/perl
#
# Minimal link spider.
#
# Usage: spider.pl inputfile outdir
#
#   inputfile - text file holding one URL per line
#   outdir    - directory that receives the downloaded pages (created if
#               it does not exist yet)
#
# Every URL that is fetched successfully is stored in outdir as
# NNNN.html (zero-padded running number).  Progress is appended to
# outdir/spiderlog.txt, and at the end all *.html files are concatenated
# into outdir/masterfile.htm.
#
use strict;
use warnings;

use IO::Handle;                 # autoflush() on the lexical log handle

require LWP::UserAgent;
require HTTP::Request;
require HTTP::Response;
use HTTP::Request::Common;

my $LOG_NAME = 'spiderlog.txt';

# Exactly two positional arguments are required; anything else is a
# usage error.
die "Usage: $0 inputfile outdir\n" unless @ARGV == 2;
my ($inputfile, $outdir) = @ARGV;

print "Welcome\n";
print "Opening inputfile... ";
open(my $link_fh, '<', $inputfile)
    or die "Couldn't open the inputfile, $!";
my @links = <$link_fh>;
close($link_fh);
print "Sucess!\n";

# Strip line endings / surrounding whitespace and ignore blank lines so
# that only real URLs are requested and counted.
s/^\s+|\s+$//g for @links;
@links = grep { length } @links;

if (-d $outdir) {
    print "Output directory exists!\n";
}
else {
    # The mode must be an octal literal: decimal 755 is octal 01363.
    mkdir($outdir, 0755) or die "Couldn't make directory, $!";
    print "Output directory created!\n";
}

print "Changing directory... ";
chdir($outdir) or die "Couldn't change directory, $!";
print "Success!\n";

# Check to see if we hung up last time.  This doesn't resume, it only
# warns that the previous run stopped somewhere: a completed run always
# ends its log with the line "Done".
if (-e $LOG_NAME) {
    if (open(my $old_log, '<', $LOG_NAME)) {
        my @spiderlog = <$old_log>;
        close($old_log);

        # chomp() returns the number of characters removed, not the
        # string, so copy the line first and chomp the copy.
        my $lastline = @spiderlog ? $spiderlog[-1] : '';
        chomp $lastline;

        if ($lastline ne "Done") {
            print "Spider not finished... \n"
                . "Last line in log says: $lastline\n";
        }
    }
    else {
        warn "Couldn't read $LOG_NAME, $!\n";
    }
}

my $filenum = 1;
my $ua      = LWP::UserAgent->new;
$ua->agent('ChrisBot/1.0');

print "Start spidering process...\n\n";
my $total = @links;
my $start = time();

open(my $log, '>>', $LOG_NAME) or die "Couldn't open $LOG_NAME, $!";
# Flush every entry immediately; otherwise a hung or killed run would
# lose the buffered entries that show where it stopped.
$log->autoflush(1);
print {$log} "Started at: $start\n\n";

for my $url (@links) {
    print "Getting $url\n";
    my $response = $ua->request(GET($url));

    if ($response->is_success) {
        my $page_id   = sprintf('%04d', $filenum);
        my $page_file = "$page_id.html";

        my $page_fh;
        unless (open($page_fh, '>', $page_file)) {
            warn "Couldn't write $page_file, $!\n";
            next;
        }
        print {$page_fh} $response->content;
        close($page_fh) or warn "Couldn't close $page_file, $!\n";

        print "$page_file generated\n\n";
        print {$log} "$page_id - $url\n";
        $filenum++;
    }
    else {
        print $response->error_as_HTML;
    }
}

my $end   = time();
my $parse = ($end - $start) || 1;    # avoid division by zero on fast runs
my $lps   = int($total / $parse);
print "$total lines in $parse seconds ($lps lines/sec)\n";
print {$log} "$total lines in $parse seconds\nFinished at $end\nDone\n";
close($log) or warn "Couldn't close $LOG_NAME, $!\n";

print "clumping files... \n";
# Fixed command string with no user input; the shell is needed for the
# glob and the redirection.
system("cat *.html > masterfile.htm") == 0
    or warn "Couldn't build masterfile.htm (cat exit status $?)\n";
print "Done!\n";
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: Adding 'referer' info to spider script
by valdez (Monsignor) on Aug 08, 2003 at 17:38 UTC | |
|
Re: Adding 'referer' info to spider script
by swiftone (Curate) on Aug 08, 2003 at 18:27 UTC |