#!/usr/bin/perl -w
#
# Original question: "For some reason this allows duplicates, any ideas? Thanks"
#
# Why duplicates got through: the visited check used to be
#     grep /\b$url\S\b/, <LINKS>
# The \S requires one more non-whitespace character after the URL, but every
# stored line is exactly "$url\n", so an identical URL could never match.
# The URL was also interpolated as a regex, so '.', '?', '+' etc. were treated
# as metacharacters.  The script now keeps an exact-match hash of visited URLs
# (seeded from the links file) instead of pattern-matching the file.
use strict;
use warnings;

use LWP::RobotUA;
use HTML::SimpleLinkExtor;
use Storable;
use HTML::Parser 3;

use vars qw/$http_ua $link_extractor/;

# One visited URL per line; shared between runs of the crawler.
use constant LINKS_FILE => '/var/www/data/links.txt';

# crawl(@start_urls)
#
# Breadth-first crawl starting from @start_urls.  Every page fetched
# successfully is written to "<n>.txt" in the current directory (first line is
# the URL, the rest is the raw response content), its <a> links are queued,
# and the URL is appended to LINKS_FILE.  URLs already present in LINKS_FILE,
# or already handled during this run, are skipped.
#
# Uses the package-level $http_ua as the user agent.  Dies if a page file or
# the links file cannot be written.  Returns nothing.
sub crawl {
    my @queue = @_;

    my %seen       = map { $_ => 1 } _read_visited(LINKS_FILE);
    my $file_count = 0;

    while (@queue) {
        my $url = shift @queue;

        # An empty link must not end the crawl; just drop it.
        next unless defined $url && length $url;

        # Exact string comparison; also catches repeats inside this run.
        next if $seen{$url}++;

        my $response = $http_ua->get($url);
        if ( !$response->is_success ) {

            # Do not store error pages (including robots.txt refusals) and do
            # not record the URL in LINKS_FILE, so a later run may retry it.
            warn qq{Skipping "$url": }, $response->status_line, "\n";
            next;
        }
        my $html = $response->content;

        my $page_file = ++$file_count . '.txt';
        open my $page_fh, '>', $page_file
            or die "Cannot write $page_file: $!";
        print {$page_fh} "$url\n", $html;
        close $page_fh
            or die "Cannot close $page_file: $!";
        print qq{Downloaded: "$url"\n};

        # Passing the page URL as base makes the extracted links absolute.
        my $extractor = HTML::SimpleLinkExtor->new($url);
        $extractor->parse($html);
        push @queue, grep { !$seen{$_} } $extractor->a;

        _record_visited( LINKS_FILE, $url );
    }

    return;
}

# Return the list of URLs stored in $path (one per line), or an empty list
# when the file does not exist yet.
sub _read_visited {
    my ($path) = @_;

    return unless -e $path;

    open my $fh, '<', $path
        or die "Cannot read $path: $!";
    chomp( my @urls = <$fh> );
    close $fh;

    return grep { length } @urls;
}

# Append $url as a new line to $path.
sub _record_visited {
    my ( $path, $url ) = @_;

    open my $fh, '>>', $path
        or die "Cannot append to $path: $!";
    print {$fh} "$url\n";
    close $fh
        or die "Cannot close $path: $!";

    return;
}

$http_ua = LWP::RobotUA->new( 'theusefulbot', 'bot@theusefulnet.com' );

# LWP::RobotUA::delay() is expressed in minutes: 10/6000 min = 0.1 second.
$http_ua->delay( 10 / 6000 );

crawl(@ARGV);

# body_text($html)
#
# Return the visible text of an HTML document: tags dropped, <script> and
# <style> contents skipped, entities and non-printable characters replaced by
# spaces, whitespace collapsed.  Returns the string 'EMPTY BODY' when $html is
# false (undef, '' or '0').
sub body_text {
    my ($content) = @_;
    return 'EMPTY BODY' unless $content;

    # HTML::Parser is broken on Javascript and styles
    # (well it leaves it in the text) so we 'fix' it....
    my $p = HTML::Parser->new(
        start_h => [
            sub {
                $_[0]->{text} .= ' ';
                $_[0]->{skip}++
                    if $_[1] eq 'script' or $_[1] eq 'style';
            },
            'self,tag'
        ],
        end_h => [
            sub {
                # For end events the 'tag' argspec carries a leading '/'.
                $_[0]->{skip}--
                    if $_[1] eq '/script' or $_[1] eq '/style';
            },
            'self,tag'
        ],
        text_h => [
            sub { $_[0]->{text} .= $_[1] unless $_[0]->{skip} },
            'self,dtext'
        ],
    )->parse($content);
    $p->eof();

    my $text = $p->{text};
    $text = '' unless defined $text;    # no tags and no text at all

    # remove escapes
    # NOTE(review): the posted code read s/ / /gi, which looks like an
    # entity-decoded "&nbsp;" pattern; restored here -- confirm.
    $text =~ s/&nbsp;/ /gi;
    $text =~ s/&[^;]+;/ /g;

    # remove non ASCII printable chars, leaves punctuation stuff
    $text =~ s/[^\040-\177]+/ /g;

    # remove any < or > in case parser choked - rare but happens
    $text =~ s/[<>]/ /g;

    # crunch whitespace
    $text =~ s/\s{2,}/ /g;
    $text =~ s/^\s+//g;

    return $text;
}
In reply to Using text files to remove duplicates in a web crawler by mkurtis
| For: | Use: |
|------|------|
| & | `&amp;` |
| < | `&lt;` |
| > | `&gt;` |
| [ | `&#91;` |
| ] | `&#93;` |