#!/usr/local/bin/perl
#
# Little site spider.
#
# Starting from $page, fetch pages over HTTP, follow every absolute
# http:// link found in them (minus a skip list), and record each page
# whose text matches one of the search patterns.
#
# Files written (append mode, current directory):
#   LOG.borders          "<matched text>, <url>" for every pattern hit
#   LOG.visited.borders  every URL the spider attempted to fetch
#   LOG.urls.borders     every URL that was queued for fetching
#
# NOTE: nothing restricts the crawl to one site; the skip list below is
# the only way to keep the spider away from unproductive areas.

use strict;
use warnings;
use LWP::Simple;

# Package globals shared with get_urls(): it fetches $page when it is not
# handed the page text, and it appends newly discovered links to @urls.
our $page = "http://www.COMPANY_HOMEPAGE.com";
our @urls;

# Things to look for in each fetched page.  These are regular expressions,
# matched case-insensitively; the matched text is what gets logged.
my @patterns = map { qr/($_)/i } ("THING A", "THING B", "THING C", "THING D");

# Links matching this are never queued.  The trailing '#' alternative
# drops every link that carries a fragment.
my $skip_re = qr/BadThing1|BadThing2|BadThing3|#/i;

# Keys of every URL ever queued.  An exact hash lookup replaces the old
# "join everything and regex-match" test, which treated URLs as patterns
# and matched on substrings.
my %queued;

# Open the logs once, up front, so that links found on the very first
# page are logged too.
my $out_fh   = _open_log('LOG.borders');
my $visit_fh = _open_log('LOG.visited.borders');
my $log_fh   = _open_log('LOG.urls.borders');

# The start page goes through the same queue as every other page, so it is
# searched for the patterns as well as mined for links.
_enqueue_url($page);

## fetches and parses pages
# @urls grows while we walk it, so iterate by index rather than foreach.
my $next = 0;
while ($next < @urls) {
    my $url = $urls[$next++];

    print "Getting $url...\n";
    print {$visit_fh} "$url \n";

    # LWP::Simple::get returns undef when the fetch fails.
    my $html = get($url);
    if (!defined $html) {
        warn "Could not fetch $url\n";
        next;
    }

    # Hand the text we already have to get_urls() so the page is not
    # downloaded a second time.
    $page = $url;
    get_urls($html);

    for my $pattern (@patterns) {
        if ($html =~ $pattern) {
            print {$out_fh} "$1, $url\n";
        }
    }
}

# Buffered write errors only surface at close, so check each one.
close($log_fh)   or die "Cannot close LOG.urls.borders: $!\n";
close($visit_fh) or die "Cannot close LOG.visited.borders: $!\n";
close($out_fh)   or die "Cannot close LOG.borders: $!\n";

print "\nDone!!!\n";

# get_urls([$html])
#
# Find all links within a page and queue the ones not seen before.
#
#   $html  (optional) text of the page.  When omitted, the page named by
#          the global $page is fetched.
#
# Only absolute links of the form href="http://..." are recognised.
# Links matching the skip list are ignored.  New links are appended to
# @urls and written to LOG.urls.borders.  Returns nothing.
sub get_urls {
    my ($html) = @_;

    $html = get($page) unless defined $html;
    return unless defined $html;    # fetch failed: nothing to scan

    # Scan the whole document so that several links on one line, or links
    # followed by further attributes, are all found.
    while ($html =~ m{href="(http://[^"\s]+)"}gi) {
        my $link = $1;

        # I needed the script to skip certain URLs (to avoid unproductive
        # spidering, among other things).  This keeps an eye out for them.
        next if $link =~ $skip_re;

        _enqueue_url($link);
    }
    return;
}

# _enqueue_url($url)
#
# Append $url to @urls and log it, unless it has been queued before.
sub _enqueue_url {
    my ($url) = @_;

    # Scheme and host are case-insensitive; the rest of a URL is not, so
    # only that leading part is folded for the duplicate check.
    (my $key = $url) =~ s{\A(http://[^/?]+)}{\L$1}i;
    return if $queued{$key}++;

    push @urls, $url;
    print {$log_fh} "$url\n";
    return;
}

# _open_log($name)
#
# Open $name for appending and return the filehandle; dies on failure.
sub _open_log {
    my ($name) = @_;

    open(my $fh, '>>', $name) or die "Cannot append to $name: $!\n";
    return $fh;
}

Replies are listed 'Best First'.
RE: My Little Time-Saving Spyder
by kryten (Scribe) on May 11, 2000 at 20:01 UTC