# NoXML.pm
# NoXML [pronounced "no xml"] is a dire-need drop-in
# replacement for SOAP::Lite designed for Google Web API hacking.
#
# It fills the SOAP request template stored after __DATA__ with the search
# arguments, POSTs it with LWP, and picks the response apart with regular
# expressions rather than a real XML parser.

package NoXML;

use strict;
use warnings;
no strict "refs";

# LWP for making HTTP requests, XML for parsing Google SOAP
use LWP::UserAgent;
use XML::Simple;    # loaded for compatibility; nothing below calls it

# Where the SOAP request is POSTed.
my $ENDPOINT = 'http://api.google.com/search/beta2';

# Positional argument names of doGoogleSearch, in call order. Each one has a
# matching $name placeholder in the __DATA__ template.
my @ARG_NAMES = qw/ key q start maxResults filter restrict
                    safeSearch lr ie oe /;

# The five predefined XML entities and the characters they stand for.
my %UNESCAPE = (
    '&lt;'   => '<',
    '&gt;'   => '>',
    '&amp;'  => '&',
    '&quot;' => '"',
    '&apos;' => "'",
);
my $UNESCAPE_RE = join '|', map { quotemeta } keys %UNESCAPE;
my %ESCAPE      = reverse %UNESCAPE;    # character => entity

# Create a new NoXML.
# Works as NoXML->new, new NoXML, $obj->new, or a plain NoXML::new() call.
sub new {
    my ($class) = @_;
    $class = ref($class) || $class;
    $class = __PACKAGE__ unless defined $class && length $class;
    return bless {}, $class;
}

# Escape the characters that are special in XML text; undef becomes ''.
sub _xml_escape {
    my ($text) = @_;
    return '' unless defined $text;
    $text =~ s/([<>&"'])/$ESCAPE{$1}/g;
    return $text;
}

# Collect every <name ...>text</name> element whose content is plain text
# (no child elements) into a hashref of name => text. When $unescape is
# true the five XML entities in each value are decoded in a single pass,
# so "&amp;lt;" correctly becomes "&lt;" rather than "<".
sub _leaf_elements {
    my ($xml, $unescape) = @_;
    my %value_of;
    while ($xml =~ m{<([\w-]+)(?:\s[^>]*)?>([^<]*)</\1\s*>}g) {
        my ($name, $value) = ($1, $2);
        $value =~ s/($UNESCAPE_RE)/$UNESCAPE{$1}/g if $unescape;
        $value_of{$name} = $value;
    }
    return \%value_of;
}

# Replacement for the SOAP::Lite-based doGoogleSearch method.
#
# Arguments (positional, after the object):
#   key, q, start, maxResults, filter, restrict, safeSearch, lr, ie, oe
#
# Returns a hashref holding one entry per text-only metadata element found
# outside the result list, plus {resultElements}: an arrayref of hashrefs,
# one per result, mapping element name to its entity-decoded text. A failed
# request emits a warning and yields an empty resultElements list.
sub doGoogleSearch {
    my ($self, @values) = @_;
    my %args;
    @args{@ARG_NAMES} = @values;

    # grab SOAP request from __DATA__, then rewind the handle so that a
    # later call can read the template again
    my $tell = tell(DATA);
    my $soap_request = join '', <DATA>;
    seek(DATA, $tell, 0);

    # interpolate variables; values are XML-escaped so that a query
    # containing & or < cannot break the request document
    $soap_request =~ s/\$(\w+)/_xml_escape($args{$1})/ge;

    # The template declares UTF-8, and Content-Length must count bytes,
    # not characters, so turn a character string into UTF-8 octets first.
    utf8::encode($soap_request) if utf8::is_utf8($soap_request);

    # Make (POST) a SOAP-based request to Google
    my $ua  = LWP::UserAgent->new;
    my $req = HTTP::Request->new(POST => $ENDPOINT);
    $req->content_type('text/xml');
    $req->content_length(length $soap_request);
    $req->content($soap_request);
    my $res = $ua->request($req);
    warn 'NoXML: request to ' . $ENDPOINT . ' failed: '
        . $res->status_line . "\n"
        unless $res->is_success;

    my $soap_response = $res->content;
    $soap_response = '' unless defined $soap_response;

    # Drop anything that precedes the initial xml declaration
    $soap_response =~ s/\A.*?(?=<\?xml)//s;

    # Drop element namespaces for tolerance of future prefix changes
    $soap_response =~ s!(<\/?)[\w-]+?:([\w-]+?)!$1$2!g;

    # Divide the SOAP response into the results and other metadata.
    # NOTE(review): the element names <resultElements> and <item> are
    # assumed from the Google SOAP response layout -- confirm against a
    # recorded response.
    my ($before, $results, $after) = $soap_response
        =~ m{\A(.*?)<resultElements\b[^>]*>(.*?)</resultElements\s*>(.*)\z}s;
    my $metadata = defined $results ? $before . $after : $soap_response;
    $results = '' unless defined $results;

    # Glean as much metadata as possible (while being somewhat lazy ;-)
    my $return = _leaf_elements($metadata, 0);

    # Glean the results, unescaping escaped HTML in each value
    my @results;
    while ($results =~ m{<item\b[^>]*>(.*?)</item\s*>}gs) {
        my $item = $1;
        push @results, _leaf_elements($item, 1);
    }

    # Pack the results into the return dataset
    $return->{resultElements} = \@results;

    # Return nice, clean, usable results
    return $return;
}

1;

# This is the SOAP message template sent to api.google.com. Variables
# signified with $variablename are replaced by the values of their
# counterparts sent to the doGoogleSearch subroutine.

__DATA__
<?xml version='1.0' encoding='UTF-8'?>

<SOAP-ENV:Envelope xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" xmlns:xsi="http://www.w3.org/1999/XMLSchema-instance" xmlns:xsd="http://www.w3.org/1999/XMLSchema">
  <SOAP-ENV:Body>
    <ns1:doGoogleSearch xmlns:ns1="urn:GoogleSearch" SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/">
      <key xsi:type="xsd:string">$key</key>
      <q xsi:type="xsd:string">$q</q>
      <start xsi:type="xsd:int">$start</start>
      <maxResults xsi:type="xsd:int">$maxResults</maxResults>
      <filter xsi:type="xsd:boolean">$filter</filter>
      <restrict xsi:type="xsd:string">$restrict</restrict>
      <safeSearch xsi:type="xsd:boolean">$safeSearch</safeSearch>
      <lr xsi:type="xsd:string">$lr</lr>
      <ie xsi:type="xsd:string">$ie</ie>
      <oe xsi:type="xsd:string">$oe</oe>
    </ns1:doGoogleSearch>
  </SOAP-ENV:Body>
</SOAP-ENV:Envelope>
#!/usr/bin/perl
# noxml_google2csv.pl
# Google Web Search Results via NoXML ("no xml") module
# exported to CSV suitable for import into Excel
# Usage: noxml_google2csv.pl "{query}" [> results.csv]

use strict;
use warnings;

use NoXML;
use Data::Dumper;

# Your Google API developer's key
my $google_key = 'insertyourkeyrighthere';

# Require a query; a defined/length test lets the literal query "0" through.
defined $ARGV[0] && length $ARGV[0]
    or die qq{usage: perl noxml_google2csv.pl "{query}"\n};

my $google_search = NoXML->new;
my $results = $google_search->doGoogleSearch(
    $google_key, shift @ARGV, 0, 10, "false", "", "false",
    "", "latin1", "latin1"
);

# Tolerate a missing result list instead of dereferencing undef.
my @elements = @{ ($results && $results->{'resultElements'}) || [] };
@elements or die('No results');

# Debugging dump goes to STDERR so that redirected STDOUT stays valid CSV.
print STDERR Dumper(\$results);

print qq{"title","url","snippet"\n};

for my $element (@elements) {
    my @fields;
    for my $key (qw(title URL snippet)) {
        my $value = defined $element->{$key} ? $element->{$key} : '';
        $value =~ s!<[^>]+>!!g;    # drop all html tags, one field at a time
        $value =~ s!"!""!g;        # double escape marks
        push @fields, qq{"$value"};
    }
    print join(',', @fields), "\n";
}
Edit (holli): Added readmore tags
In reply to NoXML Google Search API - Premature end of file by inblosam
| For: | Use: |
| & | &amp; |
| < | &lt; |
| > | &gt; |
| [ | &#91; |
| ] | &#93; |