#!/usr/local/bin/perl
#
# Small web spider.  Starting from $page it follows absolute http:// links
# and, for every page it fetches, records which of the search patterns occur
# in the page text.
#
# Output files (all opened in append mode):
#   LOG.borders          "<matched text>, <url>" for every pattern hit
#   LOG.visited.borders  every URL the spider attempted to fetch
#   LOG.urls.borders     every URL added to the crawl queue

use strict;
use warnings;

use IO::Handle;
use LWP::Simple;

# Seed page for the crawl.
my $page = "http://www.COMPANY_HOMEPAGE.com";

# Search patterns, tried case-insensitively against each fetched page.
# Entries are interpolated as regular expressions, not as literal strings.
my @patterns = ("THING A", "THING B", "THING C", "THING D");

# Links matching this pattern are never queued (avoids unproductive
# spidering); in-page anchors ("#") are skipped as well.
my $skip_re = qr/BadThing1|BadThing2|BadThing3|#/i;

my @urls;      # crawl queue, in discovery order
my %queued;    # lower-cased URL => count, for every URL ever put on @urls

open(my $out_fh, '>>', 'LOG.borders')
    or die "Cannot open LOG.borders: $!\n";
open(my $visit_fh, '>>', 'LOG.visited.borders')
    or die "Cannot open LOG.visited.borders: $!\n";
open(my $log_fh, '>>', 'LOG.urls.borders')
    or die "Cannot open LOG.urls.borders: $!\n";

# Flush after every write so the logs stay usable if a long crawl is
# interrupted.
$_->autoflush(1) for ($out_fh, $visit_fh, $log_fh);

# Prime the queue with the seed page's links.  The seed itself is only
# scanned for patterns if some page links back to it.
warn "No links queued from seed page $page\n" unless get_urls();

# The queue grows while it is being processed, so consume it with shift
# instead of iterating over it with foreach.  %queued guarantees each URL
# is queued (and therefore fetched) at most once.
while (defined(my $url = shift @urls)) {
    print {$visit_fh} "$url \n";
    print "Getting $url...\n";

    # Fetch once; the same content feeds link harvesting and matching.
    my $content = get($url);
    if (!defined $content) {
        warn "Could not fetch $url\n";
        next;
    }

    get_urls($content);

    foreach my $pattern (@patterns) {
        if ($content =~ /($pattern)/i) {
            print {$out_fh} "$1, $url\n";
        }
    }
}

close($log_fh)   or warn "Error closing LOG.urls.borders: $!\n";
close($visit_fh) or warn "Error closing LOG.visited.borders: $!\n";
close($out_fh)   or warn "Error closing LOG.borders: $!\n";

print "\nDone!!!\n";

# get_urls([$html])
#
# Finds links in $html and appends every new one to the crawl queue @urls,
# logging each queued URL to LOG.urls.borders.  When called without an
# argument it fetches $page and harvests that document instead.
#
# A link is queued only if it
#   - appears as href="http://..."> (absolute http URL, with the closing
#     quote immediately followed by ">", and no whitespace inside the URL),
#   - does not match $skip_re, and
#   - has not been queued before (compared case-insensitively, exact match).
#
# Returns the number of URLs newly queued; 0 if the document could not be
# fetched or contained no new links.
sub get_urls {
    my ($doc) = @_;
    $doc = get($page) unless @_;
    return 0 unless defined $doc;

    my $added = 0;
    while ($doc =~ /href="(http:\/\/[^"\s]+)">/ig) {
        my $link = $1;    # copy now: the matches below would clobber $1
        next if $link =~ $skip_re;
        next if $queued{lc $link}++;

        push @urls, $link;
        print {$log_fh} "$link\n";
        $added++;
    }
    return $added;
}