Well, it's that cold and dark time of the year again — time for the Sports-Illustrated-Swimsuits auto-download. This is a pretty fair sampling. Any code improvements welcome. This could be done more easily with LWP::Simple, but I wanted to show some download progress (since each picture is ~400k) and throw in a little user-agent fakeout :-) UPDATE 1-21-05: Miscellaneous fixes for binmode and server load.
#!/usr/bin/perl
# Download the Sports Illustrated swimsuit gallery images one at a time,
# printing byte/percent progress, and pausing between requests to keep the
# server load down.
use warnings;
use strict;
use LWP::UserAgent;

# Randomize the browser version digits so the User-Agent string varies a
# little between runs (mild "fakeout" so requests look less script-like).
my $ver_minor = int rand(9);
my $ver_patch = int rand(9);
my $agent     = "Mozilla/1.$ver_minor.$ver_patch (compatible; MSIE; NT 6.0 )";

my $ua = LWP::UserAgent->new(
    env_proxy  => 0,
    timeout    => 50,
    keep_alive => 1,
    agent      => $agent,
);

# Gallery image filenames (note: 04_mmiller_01.jpg appears twice in the
# original list; the second fetch simply overwrites the first).
my @pics = qw(
    04_mandersen_01.jpg 04_mandersen_02.jpg 04_mandersen_03.jpg
    04_mandersen_04.jpg 04_mandersen_05.jpg 04_mandersen_06.jpg
    04_mandersen_07.jpg 04_alindvall_01.jpg 04_alindvall_02.jpg
    04_alindvall_03.jpg 04_alindvall_04.jpg 04_akournikova_01.jpg
    04_akournikova_02.jpg 04_akournikova_03.jpg 04_akournikova_04.jpg
    04_bhall_01.jpg 04_bhall_02.jpg 04_bhall_03.jpg 04_bhall_04.jpg
    04_bhall_05.jpg 04_bhall_06.jpg 04_cmurphy_01.jpg 04_cmurphy_02.jpg
    04_cmurphy_03.jpg 04_cmurphy_04.jpg 04_dpestova_01.jpg
    50th_dpestova_01.jpg 04_ebenitez_01.jpg 04_ebenitez_02.jpg
    04_ebenitez_03.jpg 04_ebenitez_04.jpg 04_ebenitez_05.jpg
    04_fmotta_01.jpg 04_fmotta_02.jpg 04_fmotta_03.jpg
    04_fmotta_04.jpg 04_frayder_01.jpg 04_frayder_02.jpg
    04_frayder_03.jpg 04_frayder_04.jpg 04_jvandersteen_01.jpg
    04_jvandersteen_02.jpg 04_jvandersteen_03.jpg 04_jvandersteen_04.jpg
    04_jwhite_01.jpg 04_jwhite_02.jpg 04_jwhite_03.jpg
    04_jwhite_04.jpg 04_jwhite_05.jpg 04_jwhite_06.jpg
    04_jwhite_07.jpg 04_mmiller_01.jpg 04_mmiller_02.jpg
    04_mmiller_03.jpg 04_mmiller_04.jpg 04_mmiller_05.jpg
    04_mmiller_01.jpg 50th_mmiller_01.jpg 04_mkeller_01.jpg
    04_mkeller_02.jpg 04_mkeller_03.jpg 04_mkeller_04.jpg
    04_mkeller_05.jpg 04_mkeller_06.jpg 04_msims_01.jpg
    04_msims_02.jpg 04_msims_03.jpg 04_msims_04.jpg
    04_pnemcova_01.jpg 04_pnemcova_02.jpg 04_pnemcova_03.jpg
    04_pnemcova_04.jpg 04_pnemcova_05.jpg 04_pnemcova_06.jpg
    04_vvarekova_01.jpg 04_vvarekova_02.jpg 04_vvarekova_03.jpg
    04_vvarekova_04.jpg 01_vvarekova_01.jpg 00_vvarekova_01.jpg
    99_vvarekova_01.jpg
);

foreach my $pic (@pics) {
    my $url      = "http://i.cnn.net/si/pr/subs/swimsuit/images/$pic";
    my $filename = $pic;
    my $expected_length;
    my $bytes_received = 0;

    # Probe with HEAD first so we can report (and skip) missing images
    # without opening a local file for them.
    my $head = $ua->head($url);
    if ($head->is_success) {
        # Three-arg open with a lexical handle.  The ':raw' layer replaces
        # the separate binmode() call, so JPEG data is not mangled on
        # Windows.  On open failure, skip this picture instead of silently
        # printing to an invalid handle as the original did.
        if (open my $out, '>:raw', $filename) {
            my $response = $ua->request(
                HTTP::Request->new(GET => $url),
                # Content callback: invoked per chunk so we can show progress
                # while streaming the body straight to disk.
                sub {
                    my ($chunk, $res) = @_;
                    $bytes_received += length $chunk;
                    $expected_length = $res->content_length || 0
                        unless defined $expected_length;
                    if ($expected_length) {
                        printf STDERR "%d%% - ",
                            100 * $bytes_received / $expected_length;
                    }
                    print STDERR "$bytes_received bytes received $pic\r";
                    print {$out} $chunk;
                }
            );
            print $response->status_line, "\n";
            # Buffered write errors surface at close, so check it.
            close $out or warn "close $filename: $!";
        }
        else {
            warn "cannot write $filename: $!";
        }
    }
    else {
        print "$pic ", $head->status_line, "\n";
    }

    # Random pause between requests to be gentle on the server.
    sleep(1 + rand(5));
}
exit;
Re: Swimsuits2004
by merlyn (Sage) on Jan 20, 2005 at 01:45 UTC
|
(Cue "spiderman" theme song...)
Here's my spider version for the current website, fetching everything from 1996 to 2004 (for now) and the ultimate and 50th edition images:
#!/usr/bin/perl
# Spider the SI swimsuit "collection" index: find every model page, then
# mirror each full-size image that the page's thumbnails point at.
use strict;
use warnings;
$|++;    # unbuffer STDOUT so the per-file progress shows immediately
use LWP::Simple;

-d "RESULTS" or mkdir "RESULTS", 0755 or die "cannot mkdir RESULTS: $!";

# get() returns undef on failure; the original silently did nothing in
# that case, which made network errors look like "no models found".
my $all_model_index = get "http://sportsillustrated.cnn.com/swimsuit/collection/"
    or die "cannot fetch collection index\n";

while ($all_model_index =~ m{(/swimsuit/collection/models/[-\w]+\.html)}g) {
    my $model_index = get "http://sportsillustrated.cnn.com/$1";
    next unless defined $model_index;    # skip model pages that fail to fetch
    # Thumbnails are named "<name>t.jpg"; dropping the trailing "t" yields
    # the full-size image name.
    while ($model_index =~ m{"(http://i\.cnn\.net/si/pr/subs/swimsuit/images/)([-\w]+)t\.jpg"}g) {
        my $url  = "$1$2.jpg";
        my $file = "RESULTS/$2.jpg";
        print "$url => $file: ";
        if (-e $file) {
            print "skip\n";
        }
        else {
            # mirror() only re-downloads when the remote file is newer and
            # returns the HTTP status code, which we log.
            print mirror($url, $file), "\n";
        }
    }
}
Amazingly enough, this also fetches the "private" images that you should only be able to get if you're registered. Apparently, although the HTML pages are protected with their login, the images themselves are not, and the image thumbnails give away the full image names. Cool.
And once you get the results, you can symlink them by person with this:
#!/usr/bin/perl
# Build SORTED/<person>/ directories full of symlinks back into RESULTS/,
# grouping the downloaded images by model name and expanding the two-digit
# year prefix to four digits.
use strict;
$|++;

-d "SORTED" or mkdir "SORTED" or die "mkdir SORTED: $!";

foreach my $picture (glob "RESULTS/*") {
    # File names look like RESULTS/04_mmiller_01.jpg: year, person, index.
    my ($basename, $person) = $picture =~ /RESULTS\/(.*?_(.*?)_[\db]+\.jpg)$/
        or die "$picture";

    my $dir = "SORTED/$person";
    -d $dir or mkdir $dir or die "mkdir $dir: $!";

    my $target = $basename;
    $target =~ s/^9/199/ or $target =~ s/^0/200/;    # patch up years
    $target = "$dir/$target";

    # The link lives two levels down, so point back up at RESULTS/.
    -e $target
        or symlink "../../$picture", $target
        or die "ln -s ../../$picture $target: $!";
}
| [reply] [Watch: Dir/Any] [d/l] [select] |
|
| [reply] [Watch: Dir/Any] [d/l] |
|
The 2005 results just went online... I'm fetching them with this:
#!/usr/bin/perl
# 2005 edition of the gallery spider: same approach as the "collection"
# version, pointed at the 2005 swimsuit feature pages.
use strict;
use warnings;
$|++;    # unbuffer STDOUT so progress shows immediately
use LWP::Simple;

-d "RESULTS" or mkdir "RESULTS", 0755 or die "cannot mkdir RESULTS: $!";

# get() returns undef on failure; die loudly instead of silently doing
# nothing as the original did.
my $all_model_index = get "http://sportsillustrated.cnn.com/features/2005_swimsuit/"
    or die "cannot fetch 2005 swimsuit index\n";

while ($all_model_index =~ m{(/features/2005_swimsuit/models/[-\w]+\.html)}g) {
    my $model_index = get "http://sportsillustrated.cnn.com/$1";
    next unless defined $model_index;    # skip model pages that fail to fetch
    # Thumbnails are "<name>t.jpg"; strip the trailing "t" for full size.
    while ($model_index =~ m{"(http://i\.a\.cnn\.net/si/features/2005_swimsuit/models/images/)([-\w]+)t\.jpg"}g) {
        my $url  = "$1$2.jpg";
        my $file = "RESULTS/$2.jpg";
        print "$url => $file: ";
        if (-e $file) {
            print "skip\n";
        }
        else {
            # mirror() returns the HTTP status code, which we log.
            print mirror($url, $file), "\n";
        }
    }
}
| [reply] [Watch: Dir/Any] [d/l] |
|
Mere minutes after the 2006 site went live, I reverse engineered it for this. This doesn't get the "subscription exclusive" shots as the prior versions did... they finally got smart and put it in a separate index. Nor does it grab the videos and a few of the other odd extra things. "Until next time, enjoy!"
#!/usr/bin/perl
# 2006 edition of the gallery spider.  The main index links most galleries;
# a few extra pages (all-star reunion, athletes, painting) are fetched
# explicitly because they are not discoverable from the index.
use strict;
use warnings;
$|++;    # unbuffer STDOUT so progress shows immediately
use LWP::Simple;

-d "RESULTS" or mkdir "RESULTS", 0755 or die "cannot mkdir RESULTS: $!";

my $all_model_index = get "http://sportsillustrated.cnn.com/features/2006_swimsuit/"
    or die "cannot fetch 2006 swimsuit index\n";

while ($all_model_index =~ m{(/features/2006_swimsuit/(?:allstar|models)/[-\w]+\.html)}g) {
    doit($1);
}
doit("/features/2006_swimsuit/allstar/allstar_reunion.html");
doit("/features/2006_swimsuit/athletes/");
doit("/features/2006_swimsuit/painting/");

# Fetch one gallery page (site-relative path) and mirror every full-size
# photo its thumbnails reference.
sub doit {
    my $path        = shift;
    my $model_index = get "http://sportsillustrated.cnn.com/" . $path;
    return unless defined $model_index;    # page may fail to fetch
    # Dots in the hostname are now escaped; the original's bare "." matched
    # any character (harmless here, but sloppy).
    while ($model_index =~ m{'(http://i\.a\.cnn\.net/si/features/2006_swimsuit/images/gallery/photos/)([\w.\-]+)t\.jpg'}g) {
        my $url  = "$1$2.jpg";
        my $file = "RESULTS/$2.jpg";
        # Progress line printed once instead of duplicated in each branch.
        print "$url => $file: ";
        if (-e $file) {
            print "skip\n";
        }
        else {
            print mirror($url, $file), "\n";
        }
    }
}
| [reply] [Watch: Dir/Any] [d/l] |
|
|
|
|
|
|
Merlyn, I'm curious about something...
When you're whipping up short little scripts like this, do you find yourself using syntax like /a\/b\/c\/d/g instead of m{a/b/c/d}g out of habit from the old days when there wasn't much choice, or is it a conscious choice?
| [reply] [Watch: Dir/Any] [d/l] [select] |
|
|
|
merlyn,
This certainly isn't as succinct but I wanted to offer my contribution. It gets 110 of the 118 non-exclusive pics. All 8 of Marisa Miller's are skipped. It was fun - *shrug*.
I didn't bother fixing it to get the exclusive photos, but it shouldn't be too difficult. Incidentally, there is a non-linked photo we both miss: Carolyn Murphy has a hidden #2 pic.
| [reply] [Watch: Dir/Any] [d/l] |
|
As usual, Merlyn does it best. :-) Maybe you should offer your service to SI, to better protect their "private" content? Then again, maybe not! :-)
I'm not really a human, but I play one on earth.
flash japh
| [reply] [Watch: Dir/Any] |
|
#!/usr/bin/perl
# Generate a small static HTML browser for the downloaded images:
#   HTML/index.html                  - list of groups (years and model names)
#   HTML/index_<group>.html          - models within a group, with counts
#   HTML/index_<group>_<model>.html  - the images themselves
use strict;
use warnings;
$|++;

my %files;
foreach my $file_name (glob "RESULTS/*") {
    # Names look like RESULTS/04_mmiller_01.jpg: <year>_<model>_<index>.
    my ($top_level, $model_name) = $file_name =~ m{RESULTS/(.*?)_(.*?)_.*$}
        or next;    # skip files that don't match the naming scheme
    $top_level =~ s/^9/199/ or $top_level =~ s/^0/200/;    # expand 2-digit year
    $files{$top_level}{$model_name}{$file_name} = 1;
    # Also index every file under the model's own name so each model gets
    # an "all" page spanning years.
    $files{$model_name}{'all'}{$file_name} = 1;
}

-d 'HTML' or mkdir 'HTML' or die "mkdir HTML: $!";

# Forward slashes work on both Windows and Unix; the original's literal
# backslash paths ("HTML\index.html") created oddly-named files on Unix.
# Lexical handles with 3-arg checked open replace the bareword handles,
# which were never closed (and never error-checked).
open my $index, '>', 'HTML/index.html' or die "open HTML/index.html: $!";
foreach my $top_level (sort keys %files) {
    print {$index} qq|<a href="./index_$top_level.html">$top_level</a><br/>\n|;
    open my $top_index, '>', "HTML/index_$top_level.html"
        or die "open HTML/index_$top_level.html: $!";
    foreach my $model_name (sort keys %{ $files{$top_level} }) {
        open my $model_index, '>', "HTML/index_${top_level}_${model_name}.html"
            or die "open HTML/index_${top_level}_${model_name}.html: $!";
        my $image_count = 0;
        foreach my $file_name (sort keys %{ $files{$top_level}{$model_name} }) {
            $image_count++;
            print {$model_index} qq|<img src="../$file_name"><br/>\n|;
        }
        close $model_index or die "close model index: $!";
        print {$top_index}
            qq|<a href="./index_${top_level}_${model_name}.html">$model_name ($image_count)</a><br/>\n|;
    }
    close $top_index or die "close top index: $!";
}
close $index or die "close index: $!";
| [reply] [Watch: Dir/Any] [d/l] |
|
Hmm...
Just couldn't help myself, so I created thumbnails and links...
Have fun :-) (ps. feel free to improve it... that's the real fun, isn't it?)
#!/usr/bin/perl -w
# CGI thumbnail gallery: creates (and caches) cropped thumbnails with
# Image::Magick and emits a paged HTML table linking to the full images.
use strict;
use Image::Magick;
use File::Glob qw(:globally :nocase);
use File::Basename;
use CGI;

# Configuration.  Plain single-quoted strings replace the original's
# qw( ... ) misuse -- qw() builds a list and only worked here by accident
# for single, space-free values.
my $realdir   = '/var/www/html/thumbnailer/images/';  # dir holding the original images
my @ext       = qw( jpg png gif );                    # image extensions to look for
my $savedir   = '/var/www/html/thumbnailer/thumbs/';  # dir to save thumbnails in
my $serverdir = '/thumbnailer/';                      # relative server path
my $thumbdir  = 'thumbs/';                            # holds the thumbnails for the web page
my $imagedir  = 'images/';                            # holds the real images for the web page
my $x_size    = 150;   # thumbnail width in pixels
my $y_size    = 150;   # thumbnail height in pixels
my $resize    = 1;     # resize before crop
my $aspect_S  = 0.5;   # only resize if aspect ratio is above this value, else thumbnail becomes too blurred
my $aspect_H  = 2;     # only resize if aspect ratio is below this value, else thumbnail becomes too blurred
my $overwrite = 0;     # always remake (overwrite) the thumbnails
my $cols      = 5;     # max horizontal thumbs
my $rows      = 10;    # max vertical thumbs

my $cgi = CGI->new;    # direct method call instead of indirect "new CGI" syntax
main();
cleanUp();
# Build the paged table of thumbnails and hand the finished markup to
# printHtml().  Reads the file-scoped config ($rows, $cols, $savedir, ...)
# and the global $cgi for the "p" (page number) CGI parameter.
sub main {
    my $content = "<tr>";                       # accumulated <td> cells, row by row
    my $files = readDir();                      # arrayref of image paths in $realdir
    my $thumbs_per_page = $rows * $cols;
    my $total = scalar(@$files) ? scalar(@$files) : 0;
    my $pages = $total / $thumbs_per_page;      # may be fractional; only compared against below
    my $currentPage = $cgi->param('p') ? $cgi->param('p') : 1;
    my $hasPrevious = $currentPage-1 ? 1 : 0;   # true on every page but the first
    my $hasNext = ($currentPage < $pages) ? 1 : 0 ;
    my $startImage = (($currentPage-1) * $thumbs_per_page) ;
    my $nav = "";                               # Previous/Next navigation row
    my $c = 1;                                  # column position within the current row
    my $i = 0;                                  # 1-based index of the current file
    foreach my $file (@$files) {
        $i++;
        # Build the nav row once we reach the last file (only true when
        # $i == $total, since $i never exceeds it).
        if ($i >= $total) {
            $nav .= "<tr><td align=\"center\" nowrap=\"nowrap\" colspan=\"$cols\">";
            if ($hasPrevious) { $nav .= "<a href=\"?p=" . ($currentPage - 1) . "\">Previous<\/a>\ \ "; }
            if ($hasNext) { $nav .= "<a href=\"?p=" . ($currentPage + 1) . "\">Next<\/a>"; }
            $nav .= "<\/td><\/tr>";
        }
        # Skip files outside the current page's window.
        next if ($i <= $startImage || $i > ($startImage + $thumbs_per_page));
        # Start a new table row when the current one is full.
        if ($c > $cols) {
            $content .= "<\/tr><tr>\n";
            $c = 1;
        }
        # Check if the thumbnail file already exists:
        my $filename = "thumb_" . fileparse($file);
        if (!-e $savedir . $filename || $overwrite) {
            # Make a new thumbnail...
            my $image = Image::Magick->new;
            $image->Read($file);
            my ($x,$y) = $image->Get('width', 'height');
            # Enlarge image if thumbnail > original, or resize-before-crop is enabled...
            if ($x < $x_size || $resize) {
                my $aspectratio = $y / $x;
                # Only resize if aspect ratio is between the configured borders
                # (extreme ratios would make the thumbnail too blurred).
                if ($aspectratio > $aspect_S && $aspectratio < $aspect_H || $x < $x_size) {
                    $x = $x_size;
                    $y=$x * $aspectratio;
                    $image->Resize(width => $x, height => $y, filter => 'Cubic', blur => 1);
                }
            }
            # Ensure the image is at least $y_size tall before cropping.
            if ($y < $y_size) {
                my $aspectratio = $x / $y;
                $y = $y_size;
                $x=$y * $aspectratio;
                $image->Resize(width => $x, height => $y, filter => 'Cubic', blur => 1);
            }
            # Get center (offset) of image, and crop to $x_size * $y_size.
            my $ox = ($x - $x_size) / 2;
            my $oy = ($y - $y_size) / 2;
            $image->Crop("${x_size}x${y_size}+$ox+$oy");
            $image->Write($savedir.$filename);
            # border="1" marks a freshly generated thumbnail.
            $content .= " <td> <a href=\"" . $serverdir . $imagedir . fileparse($file) . "\" > <img src=" . $serverdir . $thumbdir . $filename. " alt=\"\" border=\"1\"> <\/a><\/td> ";
        } else {
            # Thumbnail cached on disk -- skip writing; border="2" marks reuse.
            $content .= " <td> <a href=\"" . $serverdir . $imagedir . fileparse($file) . "\" > <img src=" . $serverdir . $thumbdir . $filename. " alt=\"\" border=\"2\"> <\/a><\/td> ";
        }
        $c++;
    }
    $content .= "<\/tr>\n" . $nav;
    printHtml($content);
}
# Emit the complete HTML page: CGI header, start_html boilerplate, the
# table body built by main(), and the closing markup.  Uses the
# file-scoped $cgi object.
sub printHtml {
    my ($table_body) = @_;
    print $cgi->header(-type => 'text/html', );
    print $cgi->start_html(
        -title   => 'Testpage',
        -BGCOLOR => '#ffffff',
    );
    print qq{<table border="0" cellpadding="0" cellspacing="3">\n},
          $table_body,
          qq{\n</table>\n},
          $cgi->end_html;
}
# Glob $realdir for files with any of the configured image extensions
# (@ext) and return the matches as an array reference.  Case-insensitive
# matching comes from the File::Glob :nocase import at the top of the file.
sub readDir {
    my $pattern = sprintf '%s*.{%s}', $realdir, join(',', @ext);
    return [ glob $pattern ];
}
# Release the file-scoped configuration variables at the end of the run.
# BUG FIX: the original wrote "undef $cgi, $realdir, ..." -- undef takes a
# single argument, so the comma operator meant only the FIRST variable of
# each statement was undefined and the rest were evaluated uselessly in
# void context.  Each variable is now undefined explicitly.
sub cleanUp {
    undef $cgi;
    undef $realdir;
    undef @ext;
    undef $serverdir;
    undef $savedir;
    undef $x_size;
    undef $cols;
    undef $y_size;
    undef $resize;
    undef $aspect_S;
    undef $aspect_H;
    undef $overwrite;
    undef $rows;
    undef $thumbdir;
    undef $imagedir;
}
| [reply] [Watch: Dir/Any] [d/l] |
Re: Swimsuits2004
by holli (Abbot) on Jan 20, 2005 at 20:02 UTC
|
Your code breaks on MS-Windows. But putting binmode(IN); after open( IN, ">$filename" ) or warn $!; fixes that. Btw, why do you use "IN" as a name for a handle you are actually writing to?
| [reply] [Watch: Dir/Any] [d/l] [select] |
|
I use IN, because from my view, it's an "incoming file".
I'm not really a human, but I play one on earth.
flash japh
| [reply] [Watch: Dir/Any] |
|
open my $in, '>:raw', $filename or warn $!;
| [reply] [Watch: Dir/Any] [d/l] |
Re: Swimsuits2004
by Limbic~Region (Chancellor) on Jan 19, 2005 at 22:41 UTC
|
| [reply] [Watch: Dir/Any] |
|
|