Take a look at the HTML parser modules on CPAN. In the past I have successfully used HTML::Parser to extract data, links, etc. It is actually very easy to use once you understand it well enough. Below I have included a simple module that fetches a page with a GET request and returns all the text snippets found within the document. It would not be hard to modify it to permit using the POST method. As for your JavaScript problem, I am not sure what you mean by "javascript which has to be parsed through, in order to reach the final page".
package ExtractText;

# Strictures: warnings are enabled alongside strict so that mistakes such as
# comparing an undefined value are reported instead of passing silently.
use strict;
use warnings;

# Exporter is loaded without importing anything; it is used purely as a base
# class (via @ISA) to provide the import() method.
use Exporter ();
use HTML::Parser;
use LWP::UserAgent;
use Carp qw( croak );

# extract_text() is exported only on request:
#     use ExtractText qw( extract_text );
our ( @ISA ) = qw( Exporter );
our ( @EXPORT_OK ) = qw( extract_text );
# extract_text( $uri )
#
# Fetches $uri with an HTTP GET (LWP::UserAgent), runs the response body
# through HTML::Parser, and returns the list of text snippets gathered by the
# _parser_text() handler (snippets flagged is_cdata are skipped there).
#
# May be called as a plain function or as a class method
# (ExtractText->extract_text( $uri )); a leading class name is discarded.
#
# Croaks if no URI is supplied or if the response is not a success.
# Returns an empty list when the document produced no text snippets.
sub extract_text {
    # Guard with defined() so a missing argument reaches the croak below
    # instead of triggering an "uninitialized value" comparison.
    shift( @_ ) if ( defined( $_[0] ) && $_[0] eq 'ExtractText' );

    my $uri = shift;
    croak( "Single parameter to extract_text() must be a URI to process" )
        unless ( defined( $uri ) );

    my $ua  = LWP::UserAgent->new();
    my $res = $ua->get( $uri );
    croak( "Fetch of '$uri' failed: ", $res->status_line() )
        unless ( $res->is_success() );

    my $parser = HTML::Parser->new(
        text_h => [ \&_parser_text, 'self,dtext,is_cdata' ],
    );
    $parser->parse( $res->content() );

    # Signal end of document so text still buffered inside the parser is
    # delivered to the handler before the results are read.
    $parser->eof();

    # The handler creates {_extracted} lazily; fall back to an empty list so
    # a document without text does not die under "strict refs".
    return( @{ $parser->{_extracted} || [] } );
}
# _parser_text( $parser, $text, $cdata_flag )
#
# Text handler registered with the parser in extract_text(). Appends $text to
# the list kept in $parser->{_extracted} (created on first use), unless
# $cdata_flag is true, in which case the snippet is dropped.
sub _parser_text {
    my ( $parser, $text, $cdata_flag ) = @_;

    # Flagged snippets are not collected.
    return $cdata_flag if $cdata_flag;

    return push @{ $parser->{_extracted} }, $text;
}
1;
Example usage:
#!/usr/bin/perl -w
use strict;
use ExtractText qw( extract_text );

# Fetch the page and print every text snippet extract_text() returns,
# back to back with no separator.
my $uri      = 'http://www.perlmonks.org';
my @snippets = extract_text( $uri );
print @snippets;
|