Scrape Google's Image Search

by Hutta (Scribe)
on Aug 29, 2003 at 19:10 UTC
Category: Web Stuff
Author/Contact Info Kris Gale
Description: Script to get all images in Google's image search engine matching specified keyword(s). Image downloads are done in parallel and spoof the referring URL in case the host protects against offsite linking. If you just want a single image for a background, see hossman's Random Background from Net.
#!/usr/bin/perl -w

#------------------------------------------------------------#
# Scrape images.google.com for images matching a specific    #
# keyword.                                                   #
#------------------------------------------------------------#
# ./imgo.pl --query "perl monks"                             #
#------------------------------------------------------------#
use HTML::Parser;
use LWP::UserAgent;
use Parallel::ForkManager;
use Getopt::Long;
use URI::Escape;
use strict;

#------------------------------------------------------------#
# Options and other variables we'll need.                    #
#------------------------------------------------------------#

# Defaults
my %opt = (
   dir   => ".",
   safe  => "0",
   procs => "20",
   ua    => "Mozilla/1.0",
   query => "",
);

# Options from the commandline.
GetOptions( 
   'verbose' => \$opt{'verbose'},
   'help'    => \$opt{'help'   },
   'safe'    => \$opt{'safe'   },
   'query=s' => \$opt{'query'  },
   'procs=i' => \$opt{'procs'  },
   'ua=s'    => \$opt{'ua'     },
   'dir=s'   => \$opt{'dir'    },
);

# Compose our base URL for images.google.com.
$opt{'query'} = uri_escape($opt{'query'});

my $url = "http://images.google.com/images" . 
   "?q=$opt{'query'}" .
   "\&safe=" . ($opt{'safe'} ? "on" : "off");

# Initial image offset (Page 1 of results)
my $start = "0";

# Validate input and display help if needed.
&help if ($opt{'help'} || !$opt{'query'});

#------------------------------------------------------------#
# Create objects we'll need.                                 #
#------------------------------------------------------------#

# LWP for HTTP requests.
my $ua = new LWP::UserAgent;
$ua->agent($opt{'ua'}); # Google doesn't like LWP.
 
# HTML::Parser for scraping HTML.
my $p = new HTML::Parser (
   api_version => 3,
   start_h     => [\&tag, "tagname, attr"],
);

# Parallel::ForkManager to handle simultaneous downloads.
my $pfm = new Parallel::ForkManager($opt{'procs'});

#------------------------------------------------------------#
# Parse each page of HTML for images.  Stored in @images.    #
#------------------------------------------------------------#
# $start will be passed to google to tell it which page of   #
# results to display.  20 images per page.                   #
#------------------------------------------------------------#
# $test is used to see if we need another page.              #
#------------------------------------------------------------#
my @images;
my $done = 0;
my $page = 1;
until ($done) {
   $opt{'verbose'} && print "Fetching page " . $page++ . " of results.\n";
   my $test = $start;
   my $req = HTTP::Request->new(GET => $url . "\&start=$start");
   $p->parse($ua->request($req)->content);
   $done = 1 if $test == $start;
}

#------------------------------------------------------------#
# Fetch all images stored in @images.                        #
#------------------------------------------------------------#
foreach my $img (@images) {

   # Fork a child to execute code in this loop.
   $pfm->start and next;

   # Get our image URL, referring URL and a unique filename.
   my ($imgurl, $filename, $refurl) = @$img;
   $filename = unique($filename);

   $opt{'verbose'} && print "Fetching $imgurl as $filename\n";

   # Download the image and save it to disk.
   my $req = HTTP::Request->new(GET => "http://$imgurl");
   $req->referer($refurl);
   $ua->request($req, "$opt{'dir'}/$filename");

   # Indicate this child process is finished.
   $pfm->finish;
}

#------------------------------------------------------------#
# Wait for all children to finish and exit cleanly.          #
#------------------------------------------------------------#
$pfm->wait_all_children;
exit 0;

#------------------------------------------------------------#
# tag() is our HTML::Parser callback for handling start tags #
#------------------------------------------------------------#
sub tag {
   my ($tagname, $attr) = (@_);

   #
   # If we see the "nav_next.gif" image, we know we should go
   # to the next page to collect more images.  $start is our
   # offset for the next page.
   #
   if ($attr->{'src'} && ($attr->{'src'} eq "/nav_next.gif" )) {
      $start += 20;
   }

   #
   # Look for links to "imgres".  This will show our image URL
   # and the page it's used on.  We'll use the latter to spoof
   # our referring URL in case the host doesn't allow offsite
   # image linking (tripod, etc.).
   #
   return unless ($tagname eq 'a');
   return unless (
      $attr->{'href'} =~ /imgres\?imgurl=(.*\/([^\&]*))\&imgrefurl=([^\&]*)\&/
   );

   #
   # We've got a real image, so we'll remember it for downloading.
   #
   push(@images, [ $1, $2, $3 ]); # imgurl, filename, refurl

}

#------------------------------------------------------------#
# unique() ensures we're not overwriting existing files by   #
# returning an unused filename based on the one provided.    #
#------------------------------------------------------------#
sub unique {
   my $f = shift;
   return $f unless -e "$opt{'dir'}/$f";

   my $count = 1;
   while (-e "$opt{'dir'}/$count.$f") {
      $count++;
   }

   return "$count.$f";
}

#------------------------------------------------------------#
# help() displays usage information.                         #
#------------------------------------------------------------#
sub help {

print <<ENDHELP;

$0 scrapes images.google.com for images matching the keyword
specified on the commandline.  Images are downloaded and placed
in the current directory by default.

Usage:   $0 --query "image keyword(s)" [OPTIONS]

Options:

   --query string  Search string for images.
                   Required.  No default.

   --verbose       Show what the script is doing as it goes.
                   Defaults to off.
                
   --safe          Use google's safesearch to filter naughty pictures.
                   Defaults to off.

   --procs n       Number of simultaneous image downloads to run.
                   Defaults to 20.

   --dir path      Directory to store downloaded images to.
                   Defaults to "." (current directory)

   --ua string     images.google.com doesn't like robots.  This is
                   the user-agent string we spoof.
                   Defaults to "Mozilla/1.0"

   --help          You're looking at it, cowboy.

Notes:

   Images are given unique filenames by prepending a number.  For
   example, "10.header.jpg"

   Usage may violate Google's TOS.  Use at your own risk.

ENDHELP

exit 0;
}
Re: Scrape Google's Image Search
by Anonymous Monk on Sep 07, 2004 at 12:31 UTC
    This works really well, but I found that (at least under Mac OS X) I had to change the following:
    
     # Download the image and save it to disk.
       my $req = HTTP::Request->new(GET => "http://$imgurl");
     
    to 
    
     # Download the image and save it to disk.
       my $req = HTTP::Request->new(GET => $imgurl);
      
    as the original version would just try to fetch "." and puke out. 
    
    I don't know if that's due to what's on the Mac or not, but at least for me, $imgurl already contains "http://link.to.image/image.jpg".
    
    So just make that tiny change if you're running into problems fetching stuff.
    
    A very nifty app; very fun to max out your CPU fetching an ungodly number of images!
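    
    A more defensive variant (a minimal sketch, not part of the original script) would prepend the scheme only when $imgurl doesn't already carry one, so the same loop works against both old and new result pages:
    
     # Download the image and save it to disk, tolerating either form.
       $imgurl = "http://$imgurl" unless $imgurl =~ m{^https?://}i;
       my $req = HTTP::Request->new(GET => $imgurl);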
    
Re: Scrape Google's Image Search
by abhihimself (Acolyte) on Aug 19, 2014 at 08:33 UTC
    Hello all, I tried to use this code but it's not working. After debugging, I found that the @images array never gets any values, which is why the foreach loop does nothing. Any suggestions on what I should change in the code provided? I would highly appreciate it. Thanks, Abhi

      Use the API offered by Google should you wish to use their services.

      Hello all, I tried to use this code but it's not working.

      It's been 11 years, Google changed, that's life.

Re: Scrape Google's Image Search
by abhihimself (Acolyte) on Aug 21, 2014 at 08:16 UTC
    Hello, thanks for your responses :). I will try other approaches, like the Google API, to work this out.
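    
    For reference, a minimal sketch of what that might look like against Google's Custom Search JSON API (the key, search engine ID, and response fields below are assumptions based on Google's documentation; verify them before relying on this):
    
     #!/usr/bin/perl
     use strict;
     use warnings;
     use LWP::UserAgent;
     use URI::Escape;
     use JSON::PP;   # core module since Perl 5.14
     
     # Hypothetical credentials -- supply your own.
     my $key   = "YOUR_API_KEY";
     my $cx    = "YOUR_SEARCH_ENGINE_ID";
     my $query = uri_escape("perl monks");
     
     my $ua  = LWP::UserAgent->new(agent => "Mozilla/5.0");
     my $res = $ua->get("https://www.googleapis.com/customsearch/v1"
                      . "?key=$key&cx=$cx&searchType=image&q=$query");
     die "Request failed: " . $res->status_line unless $res->is_success;
     
     # Each item carries the image URL and the page that uses it,
     # much like imgurl/imgrefurl in the scraped HTML above.
     my $data = decode_json($res->decoded_content);
     for my $item (@{ $data->{items} || [] }) {
         print "$item->{link} (from $item->{image}{contextLink})\n";
     }
    
    The returned links could then be fed to the same Parallel::ForkManager download loop as the original script.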
