0: #!/usr/local/bin/perl
1:
2: use LWP::Simple;
# Crawl driver.
#
# Seeds the @urls queue from the start page, then walks the queue. Every
# URL that has not been visited yet is fetched, recorded as visited, handed
# to get_urls() so the links on it get queued, and scanned for each of the
# search patterns.
#
# Output files (all opened in append mode):
#   LOG.borders          "<matched text>, <url>" for every pattern hit
#   LOG.visited.borders  every URL that was fetched
#   LOG.urls.borders     written by get_urls() through the LOG handle
#
# The code sits in a bare block so that strict/warnings apply to this part
# of the script only.
{
    use strict;
    use warnings;
    use IO::Handle;    # autoflush() on the log handles

    # Package globals shared with get_urls(): it reads $page and appends
    # to @urls. @visit keeps the list of fetched URLs.
    our ($page, @urls, @visit);

    # Each entry is interpolated into a regex and matched case-insensitively.
    my @patterns = ("THING A", "THING B", "THING C", "THING D");

    # Open the logs once, before the first get_urls() call, so the links of
    # the start page are logged as well. LOG stays a bareword handle because
    # get_urls() prints to it by that name.
    open(my $match_log, '>>', 'LOG.borders')
        or die "Cannot append to LOG.borders: $!";
    open(my $visited_log, '>>', 'LOG.visited.borders')
        or die "Cannot append to LOG.visited.borders: $!";
    open(LOG, '>>', 'LOG.urls.borders')
        or die "Cannot append to LOG.urls.borders: $!";

    # Flush after every print so an interrupted crawl keeps its logs.
    $_->autoflush(1) for ($match_log, $visited_log, \*LOG);

    $page = "http://www.COMPANY_HOMEPAGE.com";
    get_urls();

    ##fetches and parses pages
    # get_urls() appends to @urls while we walk it, so iterate by index
    # instead of with foreach.
    my %visited;
    for (my $i = 0; $i < @urls; $i++) {

        # Queue entries carry '?' written as 'Q'. Visited URLs are compared
        # in that encoded form, case-insensitively, and as whole strings.
        (my $key = lc $urls[$i]) =~ tr/?/Q/;
        next if $visited{$key}++;

        # Decode back to a fetchable URL.
        # NOTE: the encoding is lossy; a genuine 'Q' in a URL also becomes '?'.
        (my $url = $urls[$i]) =~ tr/Q/?/;

        push @visit, $url;
        print {$visited_log} "$url \n";

        print "Getting $url...\n";
        my $content = get($url);

        # Queue the links found on this page (get_urls() fetches $page itself).
        $page = $url;
        get_urls();

        if (!defined $content) {
            warn "Could not fetch $url\n";
            next;
        }

        for my $pattern (@patterns) {
            if ($content =~ /($pattern)/i) {
                print {$match_log} "$1, $url\n";
            }
        }
    }

    close(LOG)          or warn "Error closing LOG.urls.borders: $!";
    close($visited_log) or warn "Error closing LOG.visited.borders: $!";
    close($match_log)   or warn "Error closing LOG.borders: $!";
}
print "\nDone!!!\n";
36:
# get_urls
#
# Fetches the page named by the global $page and appends every new absolute
# http:// link found in an href="..." attribute to the global @urls queue.
# Each queued link is also printed to the LOG filehandle when that handle
# is open.
#
# Takes no arguments and returns nothing useful; all input and output goes
# through the package globals $page, @urls and LOG.
#
# Queue entries are stored with '?' replaced by 'Q'; the crawl loop
# translates 'Q' back to '?' before fetching.
sub get_urls {
    use strict;
    use warnings;
    our ($page, @urls);

    ##find all links within page
    my $doc = get($page);
    return unless defined $doc;    # fetch failed, nothing to scan

    # Index what is already queued. The index is rebuilt on every call
    # because @urls is a shared global that the caller may have modified.
    # Entries are compared encoded, case-insensitively, as whole strings.
    my %queued;
    for my $known (@urls) {
        (my $key = lc $known) =~ tr/?/Q/;
        $queued{$key} = 1;
    }

    # Scan the whole document so that several links without whitespace
    # between them, and links followed by further attributes, are found.
    while ($doc =~ m{href="(http://[^"\s]+)"}gi) {
        my $link = $1;    # copy at once; the next match clobbers $1

        #I needed the script to skip certain URLs
        #(to avoid unproductive spydering, among
        #other things.) The following hunklet of
        #code keeps an eye out for these.
        next if $link =~ /BadThing1|BadThing2|BadThing3|#/i;

        (my $encoded = $link) =~ tr/?/Q/;
        next if $queued{ lc $encoded }++;

        push @urls, $encoded;

        # LOG is opened by the caller; skip logging when it is not open.
        print LOG "$encoded\n" if defined fileno(LOG);
    }

    return;
}

# In reply to "My Little Time-Saving Spyder" by mcwee
| For: | Use:   |
|------|--------|
| &    | &amp;  |
| <    | &lt;   |
| >    | &gt;   |
| [    | &#91;  |
| ]    | &#93;  |