# Stand-alone fragment: the "####" line below separates it from the full
# script that follows.
$html =~ s/something/something else/sgi;
print "your html is currently:\n\n$html";
####
#!/usr/bin/perl -w
#
# Fetches each page listed in @pages, reduces its HTML to the small set of
# tags named in @keepers, and writes the result to
# "$output_dir/<pagename>.html".
use strict;
use warnings;
use diagnostics;
use CGI::Carp qw(fatalsToBrowser);
use LWP::Simple;

# Flat list of (name, URL) pairs; walked two elements at a time below.
my @pages = qw(
    page1 http://www.page1.com/
    page2 http://www.page2.com/
);

# Tags that survive the clean-up; every other tag is removed.
my @keepers = qw(b blockquote br i li ol p ul);

# Directory the simplified pages are written to (and checked for a .gif).
my $output_dir = '/www/db/mysite/mydirectory';

# Proceed through the array of sites 2 by 2, using the name and URL.
# "$i < $#pages" guarantees that $pages[$i + 1] exists, so a trailing name
# without a URL is ignored instead of being fetched as undef.
for (my $i = 0; $i < $#pages; $i += 2) {
    my ($pagename, $pageurl) = @pages[ $i, $i + 1 ];

    # NOTE(review): unlike the "Writing out" message below, this string has
    # no <BR>; it is reproduced as found -- confirm whether markup was lost.
    print "accessing $pagename at $pageurl...\n\n";

    # LWP::Simple::get returns undef when the fetch fails.
    my $doc = get($pageurl);
    if (!defined $doc) {
        print "Could not fetch $pageurl, skipping $pagename...<BR>\n";
        next;
    }

    my ($pagetitle, $html) = simplify_html($doc, @keepers);

    # A page without a <TITLE> element falls back to its configured name so
    # that no undefined value is interpolated into the output.
    $pagetitle = $pagename unless defined $pagetitle;

    print "Writing out the new $pagename.html file...<BR>\n";
    write_page($output_dir, $pagename, $pagetitle, $html);

    print "Finished processing $pagename...\n\n\n\n";
}

# simplify_html($html, @keep)
#
# Returns a two-element list: the text of the first <TITLE> element (undef
# if there is none) and a copy of $html with the TITLE element, script and
# style blocks removed, images replaced by an [img] marker, every tag whose
# name is not in @keep deleted, and runs of whitespace collapsed to a single
# space. Kept tags are emitted without their attributes.
sub simplify_html {
    my ($html, @keep) = @_;

    # Non-greedy and anchored on the opening tag, so only the title text is
    # captured rather than everything that precedes </TITLE>.
    my ($title) = $html =~ m{<TITLE\b[^>]*>(.*?)</TITLE>}si;
    $html =~ s{<TITLE\b[^>]*>.*?</TITLE>}{}sgi;

    # kill any script blocks
    $html =~ s/<script[^>]*>.*?<\/script>//sgi;

    # kill any style blocks
    $html =~ s/<style[^>]*>.*?<\/style>//sgi;

    # replace images with [img], or [img:"alt text"] when ALT="..." is present
    $html =~ s/<IMG[^>]+?(?:ALT="([^"]*)"[^>]*)?>/"[img".((defined $1)?":\"$1\"":"")."]"/sgei;

    # One pass over every tag: keepers are re-emitted as a bare <tag> or
    # </tag>, anything else (including comments and doctype) is dropped.
    # Doing this in a single substitution avoids temporary placeholders,
    # which would corrupt pages whose text happens to contain them.
    my %is_keeper = map { lc($_) => 1 } @keep;
    $html =~ s{<(/?)([A-Za-z][A-Za-z0-9]*)\b[^>]*>|<[^>]*>}
              { (defined($2) && $is_keeper{ lc $2 }) ? "<$1$2>" : '' }ge;

    # tighten up the code
    $html =~ s/\s+/ /g;

    return ($title, $html);
}

# write_page($dir, $pagename, $pagetitle, $html)
#
# Writes "$dir/$pagename.html": a minimal HTML skeleton whose <TITLE> is
# $pagename, followed by $pagetitle and the already-simplified $html.
# Dies if the file cannot be opened or closed.
sub write_page {
    my ($dir, $pagename, $pagetitle, $html) = @_;

    my $outfile = "$dir/$pagename.html";
    open(my $out, '>', $outfile) or die "WTF? $!";

    # TITLE and HEAD are closed and BODY opened here; without that the whole
    # page body would be parsed as part of the title.
    print {$out} "<HTML><HEAD>\n<TITLE>$pagename</TITLE>\n</HEAD>\n<BODY>\n";

    if (-e "$dir/$pagename.gif") {
        # NOTE(review): only blank lines are written when a matching .gif
        # exists; presumably image markup belonged here -- confirm against
        # the original script.
        print {$out} "\n\n";
    }

    print {$out} "\n\n$pagetitle\n\n\n";
    print {$out} $html;
    print {$out} "\n</BODY></HTML>\n";

    # Buffered write errors only surface at close, so check it.
    close($out) or die "close $outfile: $!";
    return;
}