in reply to Re^2: HTTP:Async weirdness
in thread HTTP:Async weirdness
I changed the code to use SQLite (and fixed a few problems along the way, such as the fact that I could feed you SQL through my page content if I knew you were crawling my site :)), and it works just fine here. What do you mean by "jumbled" contents anyway?
Here's what works for me:
#!/usr/bin/perl
#
# Queue-driven page checker.
#
# Repeatedly reads up to 100 rows from the `queue` table, fetches every
# `page_url` in parallel with HTTP::Async, counts how often `url2` occurs in
# the returned page, stores the result (plus the base64-encoded page source)
# in the `cheker` table and finally removes the processed row from `queue`.
use strict;
use warnings;

use MIME::Base64;
use Encode qw(encode);
use DBI;
use DBI qw(:sql_types);
require HTTP::Request;
require HTTP::Response;
use HTTP::Async;
use List::MoreUtils;

# HTTP::Async timeout is broken by default. Check the CPAN page for how
# to fix. It's in the bugs page.
my $async = HTTP::Async->new( timeout => 60, slots => 100 );    # I'm on a terribly slow line

# Record our PID so the process can be found later; will run as user.
my $pidfile = "$ENV{HOME}/pidfile";
open my $pid_fh, '>', $pidfile or die "Can't write $pidfile: $!";
print {$pid_fh} $$;
close $pid_fh or die "Can't close $pidfile: $!";

# definition of variables
# (SQLite needs no host/user/password, so those settings are gone.)
my $db              = "databasename";
my $verbose_logging = -1;

# Connect once and reuse the handle; reconnecting on every pass of the main
# loop leaked one connection per iteration.
my $dbh = DBI->connect( "DBI:SQLite:$db", '', '' )
    or die "Can't connect to database: $DBI::errstr\n";

my $select_queue = $dbh->prepare(
    "SELECT `page_url`, `url2`, `idx`, `date_time`, `userid` FROM `queue` LIMIT 100"
) or die "Can't prepare queue query: ", $dbh->errstr, "\n";

my $insert_result = $dbh->prepare(
    "INSERT INTO cheker (`active`,`page_url`,`url2`,`date_time`,`userid`,`html_source`) VALUES (?,?,?,?,?,?);"
) or die "Can't prepare result insert: ", $dbh->errstr, "\n";

my $delete_queued = $dbh->prepare("DELETE FROM queue WHERE `idx` = ?")
    or die "Can't prepare queue delete: ", $dbh->errstr, "\n";

while (1) {
    sleep rand 3;

    if ( !$select_queue->execute ) {
        warn "Problem reading the queue: ", $select_queue->errstr, "\n";
        next;
    }

    my @queued_rows;
    while ( my @row = $select_queue->fetchrow_array ) {
        my %queued;
        @queued{qw(page_url url2 idx date_time userid)} = @row;
        push @queued_rows, \%queued;
    }
    warn "Problem reading the queue: ", $select_queue->errstr, "\n"
        if $select_queue->err;

    # Remember which queue row belongs to which request id.  Responses come
    # back in completion order and the ids keep counting up for the lifetime
    # of $async, so deriving the row from "$id - 1" only ever worked for the
    # very first batch and mixed up rows afterwards.
    my %row_for;
    for my $queued (@queued_rows) {
        my $request_id
            = $async->add( HTTP::Request->new( GET => $queued->{page_url} ) );
        $row_for{$request_id} = $queued;
    }

    while ( $async->not_empty ) {
        my ( $response, $id ) = $async->wait_for_next_response;
        next if !defined $response;

        print $async->info;

        my $queued = delete $row_for{$id};
        if ( !$queued ) {
            warn "Got a response for unknown request id $id\n";
            next;
        }

        # decoded_content can be undef when the body cannot be decoded.
        my $content = $response->decoded_content;
        $content = '' if !defined $content;

        # Count literal, case-insensitive occurrences of url2.  \Q...\E
        # keeps URL characters such as "." and "?" from acting as regex
        # metacharacters; an empty pattern is skipped because Perl would
        # silently reuse the last successful pattern instead.
        my $urlcount = 0;
        my $needle   = $queued->{url2};
        if ( defined $needle && $needle ne '' ) {
            $urlcount++ while $content =~ m/\Q$needle\E/gi;
        }

        $content = encode_base64( encode( "UTF-8", $content ) );

        my $inserted = $insert_result->execute(
            $urlcount,            $queued->{page_url},
            $queued->{url2},      $queued->{date_time},
            $queued->{userid},    $content,
        );
        if ( !$inserted ) {

            # Keep the row queued so it is retried on a later pass instead
            # of being deleted without a stored result.
            warn "Problem inserting result: ", $insert_result->errstr, "\n";
            next;
        }
        print "Inserted record into checker\n" if $verbose_logging >= 0;

        if ( $delete_queued->execute( $queued->{idx} ) ) {
            print "Deleted row from queue\n" if $verbose_logging >= 0;
        }
        else {
            warn "Problem deleting queue row: ", $delete_queued->errstr, "\n";
        }
    }
}
SQLite stuff:
$ sqlite3 databasename
-- Result table written by the script (one row per fetched page).
CREATE TABLE cheker (active tinyint, page_url varchar, url2 varchar, date_time datetime, userid integer, html_source text);
-- Work queue read by the script; the script deletes rows by idx, so idx
-- must be unique per row.
CREATE TABLE queue (page_url varchar, url2 varchar, idx integer, date_time datetime, userid integer);
-- Sample data. Each row gets its own idx; with a shared idx the first
-- finished request would delete every queued row.
insert into queue values ('http://google.com/', '', 1, datetime('now'), 1);
insert into queue values ('http://gmx.com/', '', 2, datetime('now'), 1);
insert into queue values ('http://twitter.com/', '', 3, datetime('now'), 1);
To test:
# Print the html_source column of every cheker row (stored base64-encoded by the script), decode it and page through the result.
sqlite3 -list databasename "select html_source from cheker"|base64 -d|less
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^4: HTTP:Async weirdness
by schnibitz (Novice) on Jan 22, 2012 at 14:54 UTC | |
|
Re^4: HTTP:Async weirdness
by schnibitz (Novice) on Jan 22, 2012 at 15:59 UTC | |
|
Re^4: HTTP:Async weirdness
by schnibitz (Novice) on Jan 22, 2012 at 22:34 UTC |