#!/usr/bin/perl
#
# Fetches a fixed list of news pages, pulls every <a> link out of each one,
# and writes all the links into a single HTML file (news.file.html), grouped
# under a heading per site.
#
# NOTE(review): the HTML markup inside the original string literals did not
# survive in the copy of the script this was rebuilt from; the tags emitted
# below are a minimal reconstruction -- confirm against the intended layout.

use strict;
use warnings;

use LWP::Simple;
use HTML::TokeParser;
use HTML::Entities;

# @newspages are pages I don't really want to read; I'd rather just have
# the links.
my @newspages = qw(
    http://www.surfstation.lu/00_news.asp
    http://www.cubadust.com/news.htm
    http://www.caffemocha.com/cgi-bin/index.htm
    http://www.halfproject.com/news.php
    http://www.reinvent.co.nz/v2/skins/news2002.asp
);

# The collected links are sent to this HTML file.
my $outfile = 'news.file.html';

my $body = <<'END_HTML';
<html>
<head><title>Silent11 helps out</title></head>
<body>
END_HTML

for my $page (@newspages) {
    # Use the second dot-separated piece of the URL as a short site name
    # (e.g. "surfstation" for http://www.surfstation.lu/...).  This is only
    # a rough "domain name"; fall back to the full URL if there is no dot.
    my $short = ( split /\./, $page )[1];
    $short = $page unless defined $short;

    $body .= "<h2>\n" . encode_entities($short) . "\n</h2>\n";

    # LWP::Simple::get returns undef when the fetch fails; skip that site
    # instead of handing an undefined document to the parser.
    my $content = get($page);
    unless ( defined $content ) {
        warn "Could not fetch $page; skipping\n";
        next;
    }

    my $parser = HTML::TokeParser->new( \$content );
    while ( my $token = $parser->get_tag('a') ) {
        my $url  = $token->[1]{href} || '-';
        my $text = $parser->get_trimmed_text('/a');

        # Don't grab javascript or mailto links (not perfect).
        next if $url =~ m{\A(?:mailto|javascript)}i;

        # The URL and link text come from remote pages, so escape them
        # before embedding them in our own markup.  encode_entities also
        # turns non-ASCII characters into entities, keeping the output
        # plain ASCII.
        # NOTE(review): relative hrefs are written as found, so they will
        # not resolve from the generated file -- confirm whether that matters.
        $body .= sprintf qq{<a href="%s">%s</a><br>\n},
            encode_entities($url), encode_entities($text);
    }
}

$body .= "</body>\n</html>\n";

open my $out, '>', $outfile
    or die "Cannot open $outfile for writing: $!\n";
print {$out} $body
    or die "Cannot write to $outfile: $!\n";

# Buffered write errors only surface at close, so check it too.
close $out
    or die "Cannot close $outfile: $!\n";