comment on

#!/usr/bin/perl -w

#------------------------------------------------------------#
# Scrape images.google.com for images matching a specific    #
# keyword.                                                   #
#------------------------------------------------------------#
# ./imgo.pl --query "perl monks"                             #
#------------------------------------------------------------#
use HTML::Parser;
use LWP::UserAgent;
use Parallel::ForkManager;
use Getopt::Long;
use URI::Escape;
use strict;

#------------------------------------------------------------#
# Options and other variables we'll need.                    #
#------------------------------------------------------------#

# Defaults
my %opt = (
   dir   => ".",
   safe  => "0",
   procs => "20",
   ua    => "Mozilla/1.0",
   query => "",
);

# Options from the commandline.
GetOptions( 
   'verbose' => \$opt{'verbose'},
   'help'    => \$opt{'help'   },
   'safe'    => \$opt{'safe'   },
   'query=s' => \$opt{'query'  },
   'procs=i' => \$opt{'procs'  },
   'ua=s'    => \$opt{'ua'     },
   'dir=s'   => \$opt{'dir'    },
);

# Compose our base URL for images.google.com.
$opt{'query'} = uri_escape($opt{'query'});

my $url = "http://images.google.com/images" . 
   "?q=$opt{'query'}" .
   "\&safe=" . ($opt{'safe'} ? "on" : "off");

# Initial image offset (Page 1 of results)
my $start = "0";

# Validate input and display help if needed.
&help if ($opt{'help'} || !$opt{'query'});

#------------------------------------------------------------#
# Create objects we'll need.                                 #
#------------------------------------------------------------#

# LWP for HTTP requests.
my $ua = new LWP::UserAgent;
$ua->agent($opt{'ua'}); # Google doesn't like LWP.
 
# HTML::Parser for scraping HTML.
my $p = new HTML::Parser (
   api_version => 3,
   start_h     => [\&tag, "tagname, attr"],
);

# Parallel::ForkManager to handle simultaneous downloads.
my $pfm = new Parallel::ForkManager($opt{'procs'});

#------------------------------------------------------------#
# Parse each page of HTML for images.  Stored in @images.    #
#------------------------------------------------------------#
# $start will be passed to google to tell it which page of   #
# results to display.  20 images per page.                   #
#------------------------------------------------------------#
# $test is used to see if we need another page.              #
#------------------------------------------------------------#
my @images;
my $done = 0;
my $page = 1;
until ($done) {
   $opt{'verbose'} && print "Fetching page " . $page++ . " of results.
+\n";
   my $test = $start;
   my $req = HTTP::Request->new(GET => $url . "\&start=$start");
   $p->parse($ua->request($req)->content);
   $done = 1 if $test == $start;
}

#------------------------------------------------------------#
# Fetch all images stored in @images.                        #
#------------------------------------------------------------#
foreach my $img (@images) {

   # Fork a child to execute code in this loop.
   $pfm->start and next;

   # Get our image URL, refering URL and a unique filename.
   my ($imgurl, $filename, $refurl) = @$img;
   $filename = unique($filename);

   $opt{'verbose'} && print "Fetching $imgurl as $filename\n";

   # Download the image and save it to disk.
   my $req = HTTP::Request->new(GET => "http://$imgurl");
   $req->referer($refurl);
   $ua->request($req, "$opt{'dir'}/$filename");

   # Indicate this child process is finished.
   $pfm->finish;
}

#------------------------------------------------------------#
# Wait for all children to finish and exit cleanly.          #
#------------------------------------------------------------#
$pfm->wait_all_children;
exit 0;

#------------------------------------------------------------#
# tag() is our HTML::Parser callback for handling start tags #
#------------------------------------------------------------#
sub tag {
   my ($tagname, $attr) = (@_);

   #
   # If we see the "nav_next.gif" image, we know we should go
   # to the next page to collect more images.  $start is our
   # offset for the next page.
   #
   if ($attr->{'src'} && ($attr->{'src'} eq "/nav_next.gif" )) {
      $start += 20;
   }

   #
   # Look for links to "imgres".  This will show our image URL
   # and the page it's used on.  We'll use the latter to spoof
   # our refering URL in case the host doesn't allow offsite
   # image linking (tripod, etc.).
   #
   return unless ($tagname eq 'a');
   return unless (
      $attr->{'href'} =~ /imgres\?imgurl=(.*\/([^\&]*))\&imgrefurl=([^
+\&]*)\&/
   );

   #
   # We've got a real image, so we'll remember it for downloading.
   #
   push(@images, [ $1, $2, $3 ]); # imgurl, filename, refurl

}

#------------------------------------------------------------#
# unique() ensures we're not overwriting existing files by   #
# returning an unused filename based on the one provided.    #
#------------------------------------------------------------#
sub unique {
   my $f = shift;
   return $f unless -e "$opt{'dir'}/$f";

   my $count = 1;
   while (-e "$opt{'dir'}/$count.$f") {
      $count++;
   }

   return "$count.$f";
}

#------------------------------------------------------------#
# help() displays usage information.                         #
#------------------------------------------------------------#
sub help {

print <<ENDHELP

$0 scrapes images.google.com for images matching the keyword
specified on the commandline.  Images are downloaded and placed
in the current directory by default.

Usage:   $0 --query "image keyword(s)" [OPTIONS]

Options:

   --query string  Search string for images.
                   Required.  No default.

   --verbose       Show what the script is doing as it goes.
                   Defaults to off.
                
   --safe          Use google's safesearch to filter naughty pictures.
                   Defaults to off.

   --procs n       Number of simultaneous image downloads to run.
                   Defaults to 20.

   --dir path      Directory to store downloaded images to.
                   Defaults to "." (current directory)

   --ua string     images.google.com doesn't like robots.  This is
                   the user-agent string we spoof.
                   Defaults to "Mozilla/1.0"

   --help          You're looking at it, cowboy.

Notes:

   Images are given unique filenames by prepending a number.  For
   example, "10.header.jpg"

   Usage may violate Google's TOS.  Use at your own risk.

ENDHELP

}
[download]

In reply to Scrape Google's Image Search by Hutta

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&`
	<		`<`
	>		`>`
	[		`[`
	]		`]`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.