package ExtractText;

# ExtractText - fetch a document over HTTP and return its text segments.
#
# Exports (on request): extract_text

use strict;
use warnings;

use Exporter ();
use HTML::Parser;
use LWP::UserAgent;
use Carp qw( croak );

our ( @ISA )       = qw( Exporter );
our ( @EXPORT_OK ) = qw( extract_text );

# extract_text( $uri )
#
# Fetches $uri with LWP::UserAgent, runs the response body through
# HTML::Parser and returns the list of decoded text segments found,
# skipping segments the parser flags as CDATA.
#
# May be called as a plain function or as a class method
# (ExtractText->extract_text( $uri )).
#
# Parameters:
#   $uri - the URI to fetch (required).
#
# Returns:
#   A list of text segments; an empty list if the document has none.
#
# Throws (via croak):
#   - when no URI is supplied;
#   - when the fetch does not yield a successful (2xx) response.
sub extract_text {
    # Allow class-method invocation by dropping the leading class name.
    # The defined() guard keeps a no-argument call from comparing undef.
    shift( @_ ) if ( defined( $_[0] ) && $_[0] eq 'ExtractText' );

    my $uri = shift;
    croak( "Single parameter to extract_text() must be a URI to process" )
        unless ( defined( $uri ) );

    my $ua  = LWP::UserAgent->new();
    my $res = $ua->get( $uri );
    croak( "Fetch of '$uri' failed: ", $res->status_line() )
        unless ( $res->is_success() );

    my $parser = HTML::Parser->new(
        text_h => [ \&_parser_text, 'self,dtext,is_cdata' ],
    );

    # Start with an empty accumulator so that a document without any text
    # yields an empty list instead of dying on an undefined array reference
    # under "use strict".
    $parser->{_extracted} = [];

    # NOTE(review): content() hands the parser the raw response body;
    # switching to decoded_content() would change what callers receive,
    # so it is left as-is here.
    $parser->parse( $res->content() );

    # Signal end-of-document so the parser flushes any text it is still
    # buffering; without this the trailing text can be lost.
    $parser->eof();

    return( @{ $parser->{_extracted} } );
}

# _parser_text( $parser, $dtext, $is_cdata )
#
# HTML::Parser text handler. Appends the entity-decoded text to the
# parser's _extracted list unless the segment is flagged as CDATA.
sub _parser_text {
    my ( $self, $dtext, $is_cdata ) = @_;

    return if ( $is_cdata );

    push( @{ $self->{_extracted} }, $dtext );
    return;
}

1;