use strict;
use warnings;

use LWP::Simple;
use LWP::UserAgent;
use HTML::TokeParser::Simple;

# decoded_content() yields Perl character strings; without an encoding layer
# on STDOUT any non-ASCII character triggers "Wide character in print".
binmode STDOUT, ':encoding(UTF-8)';

# Shared user agent; do_GET_TXT() closes over this lexical, so it must be
# declared before the sub definition below.
my $ua = LWP::UserAgent->new();

my $text = do_GET_TXT("http://www.spacex.com/webcast");
print $text;

# do_GET_TXT($url)
#
# Downloads $url with the file-level $ua and returns the text content of the
# HTML document (markup stripped, whitespace lightly normalised).
#
# Parameters:
#   $url - address of the HTML page to fetch.
#
# Returns:
#   On success, the extracted text as a single string.
#   On an HTTP error (is_error), the numeric status code; a diagnostic with
#   the full status line is also written to STDERR.
sub do_GET_TXT {
    my ($url) = @_;

    print "Downloading and reading HTML $url...\n";
    my $response = $ua->get($url);

    if ($response->is_error) {
        warn "GET $url failed: ", $response->status_line, "\n";
        return $response->code;
    }

    # decoded_content() returns undef when the body cannot be decoded;
    # fall back to the raw bytes rather than parsing undef.
    my $html = $response->decoded_content();
    $html = $response->content unless defined $html;
    $html = '' unless defined $html;

    return _tidy_text(_extract_text($html));
}

# Concatenates every text token of $html, replacing each token's leading
# whitespace run with a single newline.
sub _extract_text {
    my ($html) = @_;

    return '' unless length $html;

    my $parser = HTML::TokeParser::Simple->new(\$html);
    my @chunks;
    while (my $token = $parser->get_token) {
        next unless $token->is_text;
        my $chunk = $token->as_is;
        $chunk =~ s/^\s+/\n/;
        push @chunks, $chunk;
    }

    return join '', @chunks;
}

# Applies layout heuristics: collapses blank lines and joins list markers
# ("1.", "(1)", "a)") with the text that follows them on the next line.
sub _tidy_text {
    my ($text) = @_;

    $text =~ s/\n+/\n/g;

    # A number standing alone on its line is joined to the following line.
    # Anchoring with ^ (under /m) keeps the newline that precedes the number,
    # so the item is not glued onto the end of the previous line.
    $text =~ s/^(\d+\.)\n/$1\t/mg;
    $text =~ s/(\(\d+\))\n/$1\t/g;
    $text =~ s/([a-z]\))\n/$1\t/g;

    return $text;
}