#!/usr/bin/env perl
#
# Name: TestFetch.pl
#
# Usage: TestFetch.pl <base-url>
#
# Downloads <base-url>/robots.txt into the current directory, looks for the
# first "Sitemap:" directive in it, fetches that sitemap and then downloads
# every URL found inside a tag whose name contains "loc". Each URL is stored
# in the current directory under the part of the URL after its last '/'.
#
# Requires Internet access
#
use strict;
use warnings;
use LWP::Simple;
use HTML::Parser;

# Global Variables
my $debug = 1;    # 1 = trace every parser callback on STDOUT

package MyParser;

use base qw(HTML::Parser);

# Imported by name so the functions can be called as plain functions here.
# Calling them as LWP::Simple->getstore(...) would pass the class name as
# the URL argument.
use LWP::Simple qw(getstore is_success);

# Seconds to wait after each download so the remote site is not hammered.
my $fetch_delay = 6;

# Parser state shared between the callbacks below.
my $sitetrigger    = 0;    # 1 while inside a tag whose name contains "loc"
my $lastmodtrigger = 0;    # 1 while inside a tag whose name contains "lastmod"

# Return the part of $url after its last '/', or "" when there is no '/'
# or nothing follows it.
sub _filename_from_url {
    my ($url) = @_;

    my $slash = rindex($url, '/');
    return "" if $slash == -1;
    return substr($url, $slash + 1);
}

# Start-tag callback: records which kind of element the following text
# belongs to.
sub start {
    my ($self, $tagname, $attr, $attrseq, $origtext) = @_;

    # Substring match (not equality) is kept from the original behaviour,
    # so prefixed names such as "image:loc" also trigger a fetch.
    $sitetrigger    = (index($tagname, "loc")     != -1) ? 1 : 0;
    $lastmodtrigger = (index($tagname, "lastmod") != -1) ? 1 : 0;

    if ($debug == 1) {
        print "------------START-----------\n";
        print "tagname: $tagname\n";
    }
    return;
}

# Text callback: when the text belongs to a "loc" element, treat it as a
# URL and download it.
sub text {
    my ($self, $text) = @_;

    my $filename = "";

    if ($sitetrigger == 1) {
        # Sitemaps are usually pretty-printed; strip surrounding whitespace
        # and ignore text that is nothing but whitespace.
        (my $target = $text) =~ s/^\s+|\s+$//g;

        if ($target ne "") {
            $filename = _filename_from_url($target);

            if ($filename eq "") {
                warn "skipping $target: cannot derive a file name from it\n";
            }
            else {
                print "fetching: $target into $filename\n";

                # getstore returns the HTTP status code, not a boolean.
                # A single failed download must not abort the whole run.
                my $status = getstore($target, $filename);
                if (!is_success($status)) {
                    warn "failed to fetch $target (HTTP status $status)\n";
                }
                sleep($fetch_delay);
            }
        }
    }

    if ($debug == 1) {
        print "------------TEXT-----------\n";
        print "sitetrigger: $sitetrigger\n";
        print "lastmodtrigger: $lastmodtrigger\n";
        print "filename: $filename\n";
        print "text: $text\n";
    }
    return;
}

# End-tag callback: clears the element state so that text appearing after
# the closing tag is not mistaken for another URL.
sub end {
    my ($self, $end, $origtext) = @_;

    $sitetrigger    = 0;
    $lastmodtrigger = 0;

    if ($debug == 1) {
        print "------------END-----------\n";
        print "end: $end\n";
    }
    return;
}

package main;

my $url = $ARGV[0];
if (!defined $url || $url eq "") {
    die "Usage: $0 <base-url>\n";
}
$url =~ s{/+$}{};    # avoid requesting "//robots.txt"

my $loc = $url . '/robots.txt';
if ($debug == 1) {
    print "loc: $loc\n";
}

# getstore returns the HTTP status code, which is true even for 404/500,
# so it has to be checked with is_success rather than "or die".
my $robots_status = getstore($loc, 'robots.txt');
is_success($robots_status)
    or die "Couldn't get robots.txt from $loc (HTTP status $robots_status)\n";

# Find the first "Sitemap: <url>" directive (case-insensitive).
my $siteurl;
open my $in, '<', 'robots.txt' or die "robots.txt: $!";
while (my $line = <$in>) {
    if ($line =~ /^\s*sitemap\s*:\s*(\S+)/i) {
        $siteurl = $1;
        if ($debug == 1) {
            print "siteurl: $siteurl\n";
        }
        last;
    }
}
close $in or die "IN: $!";

if (defined $siteurl) {
    my $content = get($siteurl);
    defined $content or die "Couldn't get sitemap $siteurl\n";

    my $htmlparse = MyParser->new;
    $htmlparse->parse($content);
    $htmlparse->eof;    # flush any text the parser is still buffering
}
else {
    warn "No Sitemap directive found in $loc\n";
}