mkurtis has asked for the wisdom of the Perl Monks concerning the following question:
For some reason this allows duplicates, any ideas? Thanks#!/usr/bin/perl -w use strict; use warnings; use LWP::RobotUA; use HTML::SimpleLinkExtor; use Storable; use HTML::Parser 3; use vars qw/$http_ua $link_extractor/; my %visited; my $visited; sub crawl { #$visited = retrieve('/var/www/data/links'); #%visited = %{$visited}; my @queue = @_; my $a = 0; my $base; while ( my $url = shift @queue ) { open(LINKS,"</var/www/data/links.txt"); my @visited = grep /\b$url\S\b/, <LINKS>; next if defined $visited[0]; close(LINKS); my $response = $http_ua->get($url); my $html = $response->content; open FILE, '>' . ++$a . '.txt'; print FILE "$url\n"; print FILE $html; #print FILE body_text($html); close FILE; print qq{Downloaded: "$url"\n}; push @queue, do { my $link_extractor = HTML::SimpleLinkExtor->new($u +rl); $link_extractor->parse($html); $link_extractor->a; }; open(LINKS,">>/var/www/data/links.txt"); print LINKS $url . "\n"; close(LINKS); @visited = undef; #$visited{$url} = 1; } #store \%visited, '/var/www/data/links'; #%visited = undef; } $http_ua = new LWP::RobotUA theusefulbot => 'bot@theusefulnet.com'; $http_ua->delay( 10 / 6000 ); crawl(@ARGV); sub body_text { my $content = $_[0] || return 'EMPTY BODY'; # HTML::Parser is broken on Javascript and styles # (well it leaves it in the text) so we 'fix' it.... my $p = HTML::Parser->new( start_h => [ sub{ $_[0]->{text}.=' '; $_[0]->{skip}++ if $_[1] + eq 'script' or $_[1] eq 'style'; } , 'self,tag' ], end_h => [ sub{ $_[0]->{skip}-- if $_[1] eq '/script' or $_[ +1] eq '/style'; } , 'self,tag' ], text_h => [ sub{ $_[0]->{text}.=$_[1] unless $_[0]->{skip}}, +'self,dtext' ] )->parse($content); $p->eof(); my $text = $p->{text}; # remove escapes $text =~ s/ / /gi; $text =~ s/&[^;]+;/ /g; # remove non ASCII printable chars, leaves punctuation stuff $text =~ s/[^\040-\177]+/ /g; # remove any < or > in case parser choked - rare but happens $text =~ s/[<>]/ /g; # crunch whitespace $text =~ s/\s{2,}/ /g; $text =~ s/^\s+//g; return $text; }
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: Using text files to remove duplicates in a web crawler
by PodMaster (Abbot) on Jul 07, 2004 at 03:56 UTC | |
|
Re: Using text files to remove duplicates in a web crawler
by davido (Cardinal) on Jul 07, 2004 at 04:43 UTC | |
by matija (Priest) on Jul 07, 2004 at 06:28 UTC | |
by Scarborough (Hermit) on Jul 07, 2004 at 15:51 UTC | |
|
Re: Using text files to remove duplicates in a web crawler
by Stevie-O (Friar) on Jul 07, 2004 at 07:40 UTC |