# NoXML.pm
# NoXML [pronounced "no xml"] is a dire-need drop-in
# replacement for SOAP::Lite designed for Google Web API hacking.
package NoXML;
use strict;
no strict "refs";
# LWP for making HTTP requests, XML for parsing Google SOAP
use LWP::UserAgent;
use XML::Simple;
# Create a new NoXML object.
#
# Usage:   my $search = NoXML->new;
# Returns: an empty hash reference blessed into the invocant's class.
#
# The class is taken from the invocant so that subclasses of NoXML get
# objects of their own class. Calling new() on an existing object yields a
# fresh object of that object's class; calling it as a plain function with
# no arguments falls back to the current package.
sub new {
    my ($class) = @_;
    $class = ref($class) || $class || __PACKAGE__;
    return bless {}, $class;
}
# Replacement for the SOAP::Lite-based doGoogleSearch method.
#
# Arguments (positional, after the invocant):
#   key, q, start, maxResults, filter, restrict, safeSearch, lr, ie, oe
#
# Returns a hash reference. Every simple element found outside the
# <resultElements> section of the SOAP response is stored under its element
# name, and the key "resultElements" always holds an array reference with one
# hash (element name => unescaped text) per <item> in the result set. The
# array is empty when the response contains no results or cannot be parsed.
sub doGoogleSearch {
    my ($self, %args);
    ($self, @args{qw/ key q start maxResults filter restrict
                      safeSearch lr ie oe /}) = @_;

    my $soap_request  = _build_soap_request(\%args);
    my $soap_response = _post_soap_request($soap_request);

    # Return nice, clean, usable results
    return _parse_search_response($soap_response);
}

# Fill the SOAP template stored after __DATA__ with the caller's arguments.
sub _build_soap_request {
    my ($args) = @_;

    # Grab the SOAP request from __DATA__, then rewind the handle so the
    # template can be read again on the next call.
    my $template_start = tell(DATA);
    my $soap_request   = join '', <DATA>;
    seek(DATA, $template_start, 0);

    # Interpolate $variables. Values are XML-escaped because they are placed
    # inside element content; a raw "&" or "<" in the query would otherwise
    # make the request malformed.
    my %xml_escape = ('&' => '&amp;', '<' => '&lt;', '>' => '&gt;');
    $soap_request =~ s{\$(\w+)}{
        my $name  = $1;
        my $value = defined $args->{$name} ? $args->{$name} : '';
        $value =~ s/([&<>])/$xml_escape{$1}/g;
        $value;
    }ge;

    return $soap_request;
}

# Make (POST) a SOAP-based request to Google and return the raw response
# (status line, headers and body) as a single string.
sub _post_soap_request {
    my ($soap_request) = @_;

    my $ua  = LWP::UserAgent->new;
    my $req = HTTP::Request->new(POST => 'http://api.google.com/search/beta2');
    $req->content_type('text/xml');
    $req->content_length(length $soap_request);
    $req->content($soap_request);

    return $ua->request($req)->as_string;
}

# Turn a raw SOAP response into the dataset returned by doGoogleSearch.
sub _parse_search_response {
    my ($soap_response) = @_;
    $soap_response = '' unless defined $soap_response;

    # Drop the HTTP headers and so forth until the initial xml element
    $soap_response =~ s/\A.*?(?=<\?xml)//si;

    # Drop element namespaces for tolerance of future prefix changes
    $soap_response =~ s!(<\/?)[\w-]+?:([\w-]+?)!$1$2!g;

    # Escaped HTML in the resultset is unescaped in a single pass, so that
    # "&amp;lt;" becomes "&lt;" rather than "<".
    my %unescape = (
        '&lt;'   => '<',
        '&gt;'   => '>',
        '&amp;'  => '&',
        '&quot;' => '"',
        '&apos;' => "'",
    );
    my $unescape_re = join '|', map { quotemeta } sort keys %unescape;

    # Divide the SOAP response into the results and other metadata. When no
    # <resultElements>...</resultElements> pair is present, the whole
    # response is treated as metadata and the result list stays empty.
    my ($before, $results, $after) = $soap_response =~
        m{\A(.*?)<resultElements\b[^>]*>(.*?)</resultElements\s*>(.*)\z}s;
    if (!defined $before) {
        ($before, $results, $after) = ($soap_response, '', '');
    }

    # Glean as much metadata as possible (while being somewhat lazy ;-).
    # The lookahead leaves the following "<" unconsumed so that an element
    # directly after a container's opening tag is not skipped.
    my $return   = {};
    my $metadata = $before . $after;
    while ($metadata =~ m{<([\w.-]+)[^>]*>([^<]*)(?=<)}g) {
        $return->{$1} = $2;    # pack the metadata into the return dataset
    }

    # Glean the results: one hash of element => text per <item>
    my @results;
    while ($results =~ m{<item\b[^>]*>(.*?)</item\s*>}gs) {
        my $item = $1;
        my %pairs;
        while ($item =~ m{<([\w.-]+)[^>]*>([^<]*)}g) {
            my ($element, $value) = ($1, $2);
            $value =~ s/($unescape_re)/$unescape{$1}/g;
            $pairs{$element} = $value;
        }
        push @results, \%pairs;
    }

    # Pack the results into the return dataset
    $return->{resultElements} = \@results;

    return $return;
}
1;
# This is the SOAP message template sent to api.google.com. Variables
# signified with $variablename are replaced by the values of their
# counterparts sent to the doGoogleSearch subroutine.
__DATA__
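<?xml version='1.0' encoding='UTF-8'?>
<SOAP-ENV:Envelope xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" xmlns:xsi="http://www.w3.org/1999/XMLSchema-instance" xmlns:xsd="http://www.w3.org/1999/XMLSchema">
<SOAP-ENV:Body>
<ns1:doGoogleSearch xmlns:ns1="urn:GoogleSearch" SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/">
<key xsi:type="xsd:string">$key</key>
<q xsi:type="xsd:string">$q</q>
<start xsi:type="xsd:int">$start</start>
<maxResults xsi:type="xsd:int">$maxResults</maxResults>
<filter xsi:type="xsd:boolean">$filter</filter>
<restrict xsi:type="xsd:string">$restrict</restrict>
<safeSearch xsi:type="xsd:boolean">$safeSearch</safeSearch>
<lr xsi:type="xsd:string">$lr</lr>
<ie xsi:type="xsd:string">$ie</ie>
<oe xsi:type="xsd:string">$oe</oe>
</ns1:doGoogleSearch>
</SOAP-ENV:Body>
</SOAP-ENV:Envelope>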
####
#!/usr/bin/perl
# noxml_google2csv.pl
# Google Web Search Results via NoXML ("no xml") module
# exported to CSV suitable for import into Excel
# Usage: noxml_google2csv.pl "{query}" [> results.csv]
use strict;
use warnings;
use NoXML;
use Data::Dumper;

# Your Google API developer's key
my $google_key = 'insertyourkeyrighthere';

# The query is the first (and only) command-line argument.
my $query = shift @ARGV;
defined $query && length $query
    or die qq{usage: perl noxml_google2csv.pl "{query}"\n};

# Positional arguments after the query: start, maxResults, filter, restrict,
# safeSearch, lr, ie, oe.
my $google_search = NoXML->new;
my $results = $google_search->doGoogleSearch(
    $google_key, $query, 0, 10, "false",
    "", "false", "", "latin1", "latin1"
);

my @result_elements = @{ $results->{resultElements} || [] };
@result_elements or die('No results');

# Debug dump goes to STDERR, and only on request, so that STDOUT carries
# nothing but CSV and can be redirected straight into a file.
print STDERR Dumper(\$results) if $ENV{NOXML_DEBUG};

print qq{"title","url","snippet"\n};
for my $result (@result_elements) {
    my @fields = map {
        my $field = defined $result->{$_} ? $result->{$_} : '';
        # Clean each field on its own; stripping tags from the assembled
        # line could swallow the separators between two fields.
        $field =~ s/<[^>]+>//g;    # drop all html tags
        $field =~ s/"/""/g;        # double escape marks
        qq{"$field"};
    } qw(title URL snippet);
    print join(',', @fields), "\n";
}