#!/usr/local/bin/perl
#
# Minimal web spider.
#
# Starting from $page, follows every absolute http:// link it can find,
# breadth-first, and records:
#   LOG.urls.borders     - every link queued for crawling
#   LOG.visited.borders  - every URL actually fetched
#   LOG.borders          - "<matched text>, <url>" for each search pattern
#                          found in a fetched page
# All three files are opened in append mode, so repeated runs accumulate.

use strict;
use warnings;
use IO::Handle;
use LWP::Simple;

# State shared with get_urls().
our $page = "http://www.COMPANY_HOMEPAGE.com";    # page get_urls() harvests
our @urls;                                        # crawl queue, discovery order

my %queued;     # lc(URL) => count; everything already pushed onto @urls
my %visited;    # lc(URL) => count; everything already fetched

# Each entry is used as a case-insensitive regex against the page text.
my @patterns = ("THING A", "THING B", "THING C", "THING D");

# Open the logs once, before the first get_urls() call, so the links found
# on the start page are logged as well.
open(my $out_fh, '>>', 'LOG.borders')
    or die "Cannot open LOG.borders: $!";
open(my $visit_fh, '>>', 'LOG.visited.borders')
    or die "Cannot open LOG.visited.borders: $!";
open(my $log_fh, '>>', 'LOG.urls.borders')
    or die "Cannot open LOG.urls.borders: $!";

# Flush after every write so the logs stay current during a long crawl.
$_->autoflush(1) for ($out_fh, $visit_fh, $log_fh);

get_urls();    ## fetches and parses the start page

# Index loop on purpose: get_urls() appends to @urls while we walk it, and
# the array length is re-evaluated on every pass.
for (my $i = 0; $i < @urls; $i++) {
    my $url = $urls[$i];
    next if $visited{ lc $url }++;

    print {$visit_fh} "$url \n";
    print "Getting $url...\n";

    $page = $url;
    my $content = get($url);
    if (!defined $content) {
        warn "Could not fetch $url\n";
        next;
    }

    # Hand over the content we already have instead of fetching it twice.
    get_urls($content);

    for my $pattern (@patterns) {
        if ($content =~ /($pattern)/i) {
            print {$out_fh} "$1, $url\n";
        }
    }
}

close($log_fh)   or warn "Error closing LOG.urls.borders: $!";
close($visit_fh) or warn "Error closing LOG.visited.borders: $!";
close($out_fh)   or warn "Error closing LOG.borders: $!";

print "\nDone!!!\n";

# get_urls([$content])
#
# Finds all absolute http:// links in a page and appends the ones not seen
# before to @urls, logging each to LOG.urls.borders.
#
#   $content - optional; HTML of the page. When omitted, the page named by
#              the global $page is fetched.
#
# Returns nothing. Emits a warning and queues nothing if the page cannot
# be fetched.
sub get_urls {
    my ($doc) = @_;

    $doc = get($page) unless defined $doc;
    if (!defined $doc) {
        warn "Could not fetch $page\n";
        return;
    }

    # Scan the whole document rather than whitespace-split tokens, so links
    # followed by further attributes (href="..." class="...") are found too.
    while ($doc =~ m{href="(http://[^"\s]+)"}gi) {
        my $link = $1;

        # I needed the script to skip certain URLs (to avoid unproductive
        # spidering, among other things). This check keeps an eye out
        # for them.
        next if $link =~ /BadThing1|BadThing2|BadThing3|#/i;

        # Whole-URL, case-insensitive dedup via hash lookup.
        next if $queued{ lc $link }++;

        push @urls, $link;
        print {$log_fh} "$link\n";
    }

    return;
}