UPDATE: NEW WORKING CODE. Thanks to Kappa for making it check itself against a visited list.

#!/usr/bin/perl -w
use strict;
use diagnostics;
use LWP::RobotUA;
use URI::URL;
#use HTML::Parser ();
use HTML::SimpleLinkExtor;

my $a = 0;
my $i;
my $links;
my $base;
my $u;

for ( $u = 1; $u < 1000000000; $u++ ) {

    # each pass reads one file of URLs and writes the links it finds into the next file
    open( FILE1, "</var/www/links/file$u.txt" ) or next;    # skip link files that do not exist yet

    while (<FILE1>) {
        chomp;    # strip the trailing newline before using the line as a URL

        my $ua = LWP::RobotUA->new( 'theusefulbot', 'bot@theusefulnet.com' );
        #my $p = HTML::Parser->new();
        $ua->delay( 10 / 600 );    # RobotUA delays are in minutes: 10/600 min = 1 second between requests

        # fetch the page and save its raw content
        my $content = $ua->get($_)->content;
        #my $text = $p->parse($content)->parse;
        open( OUTPUT, ">/var/www/data/$a.txt" );
        print OUTPUT "$content";
        close(OUTPUT);

        # extract every <a href> link, absolutize it, and add it to the next link file
        my $extor = HTML::SimpleLinkExtor->new($base);
        $extor->parse($content);
        my @links = $extor->a;
        $u++;
        open( FILE2, ">>/var/www/links/file$u.txt" );    # append, so links from every page in this level accumulate
        foreach $links (@links) {
            print FILE2 url("$links")->abs("$_");
            print FILE2 "\n";
        }
        $a++;
        $i = $a;
        $u--;
    }
    close(FILE1);
    close(FILE2);
}
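This version reads its start URLs from the first link file, so that file has to be seeded by hand before the first run. A minimal sketch of a seed file, using the path from the code above (the URL itself is only a placeholder): /var/www/links/file1.txt would contain nothing but URLs, one per line, for example:

http://www.example.com/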
The reworked version keeps a %visited hash so no URL is fetched twice:

#!/usr/bin/perl -w
use strict;
use LWP::RobotUA;
use HTML::SimpleLinkExtor;
use URI::URL;

use vars qw/$http_ua $link_extractor/;

sub crawl {
    my @queue = @_;
    my %visited;
    my $a = 0;
    my $base;

    while ( my $url = shift @queue ) {
        next if $visited{$url};    # skip anything we have already fetched

        my $content = $http_ua->get($url)->content;

        # save the page to a numbered file, with its URL on the first line
        open FILE, '>' . ++$a . '.txt';
        print FILE "$url\n";
        print FILE $content;
        close FILE;

        print qq{Downloaded: "$url"\n};

        # extract the <a href> links (resolved against the page's URL) and queue them
        push @queue, do {
            my $link_extractor = HTML::SimpleLinkExtor->new($url);
            $link_extractor->parse($content);
            $link_extractor->a;
        };

        $visited{$url} = 1;
    }
}

$http_ua = new LWP::RobotUA theusefulbot => 'bot@theusefulnet.com';
$http_ua->delay( 10 / 6000 );    # RobotUA delays are in minutes, so this is a fraction of a second

crawl(@ARGV);
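Since crawl(@ARGV) takes its seed URLs from the command line, the script is started with one or more URLs as arguments. A minimal sketch, assuming the script is saved as crawler.pl (the filename and URL are only placeholders):

perl crawler.pl http://www.example.com/

The numbered 1.txt, 2.txt, ... page files are written to the current directory, since the open uses a relative path.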
In reply to Web Crawler by mkurtis