Many thanks for the feedback guys, I'd really appreciate some help with this - the script works fine apart from the closed filehandle error. As mentioned before, I'm assigning the current session name as the filehandle for printing, which I thought would be unique but I might have misunderstood the POE code. Here it is:
#!/usr/bin/perl
use warnings;
use utf8;
#use strict;
use LWP::UserAgent;
use HTTP::Request::Common qw(GET POST);
use POE qw(Component::Client::HTTP);
#capture the search term
print "Welcome, please enter your search term: \n";
chomp (my $term = <>);
my @url_list=();
if ($term){
my $browser = LWP::UserAgent->new;
$browser->agent('Mozilla/6.0');
#post the search query to Scroogle
my $response = $browser->post(
'http://www.scroogle.org/cgi-bin/nbbw.cgi',
['Gw'=>$term, 'n'=>'1', 'l'=>'en']); # set the volume and language of
+the returned pages
#build a list of returned links
if($response->is_success){
$response = $response->decoded_content;
@url_list = ($response =~ / href="(http:.*?)"/g);
%urls=();
my $count =1;
foreach (@url_list){
$urls{ $_ } = $count;
$count++;
}
}
}
#Setup the virtual browser and its configuration
POE::Component::Client::HTTP->spawn(
Alias => 'Mozilla', #alias of the url request to the web server
MaxSize => 300000, # Set the maximum HTML page size you want to r
+etrieve.
Timeout => 20, #this is the number of seconds to wait before canc
+elling retrieval
);
# Create a session for every URL taken from the search.
foreach my $url (@url_list) {
POE::Session->create(
inline_states => {
_start => sub {
my ($kernel, $heap) = @_[KERNEL, HEAP];
# Post a request to the HTTP user agent component. When the
# component has an answer (positive or negative), it will
# send back a "got_response" event with an HTTP::Response
# object.
$kernel->post(Mozilla => request => got_response => GET $url )
+;
},
# A response has arrived. Display it.
got_response => sub {
my $id = $_[SESSION]->ID;
my $filehandle = $id;
my ($heap, $request_packet, $response_packet) = @_[HEAP, ARG0,
+ ARG1];
# The original HTTP::Request object. If several requests
# were made, this can help match the response back to its
# request.
my $http_request = $request_packet->[0];
# The HTTP::Response object.
my $http_response = $response_packet->[0];
# Make the response presentable, and display it.
my $response_string = $http_response->decoded_content();
$response_string=~ s/[[:^ascii:]]/ /g; #removes non ASCII char
+acters
$response_string =~ /<title>(.*?)</; #captures the webpage tit
+le
my $rank = $urls{$url};
if ($1){
my $title = $1;
$response_string =~ s/<(?:[^>'"]*|(['"]).*?\1)*>//gs; #removes
+ the HTML
#$response_string =~s/[^a-zA-Z]|\D|[^\.'\-\$]/ /g; #replaces n
+on character text with whitespace
my @paragraphs = split /\n/, $response_string; #this splits up
+ every paragraph in the remaining text
my @results;
foreach (@paragraphs){
if (/ $term /){
unshift(@results, $_);
}
}
if (@results){
print "Content for $term found at $url\n";
open ($filehandle,">>:utf8", "c:/perl/results/$title.txt"); #c
+reates a new file
print $filehandle "$url\n$rank\n"; #prints URL and search eng
+ine ranking to the file
foreach (@results){
print $filehandle "$_\n";
}
close $filehandle;
}
else{
print "No relevant text found at $url\n";
}
}
},
},
);
}
# Run everything, and exit when it's all done.
$poe_kernel->run();
exit 0;
|