Gangabass has asked for the wisdom of the Perl Monks concerning the following question:
#!/usr/bin/perl use strict; use warnings; use WWW::Mechanize; my $main_catalogue_url = "http://somesite.ru/catalog.aspx?&cityId=3"; my $city_id = 3; my $city = "CityName"; my $mech = WWW::Mechanize->new(); #Get main links (first level) my $resfl = $mech->get( $main_catalogue_url ); my $resfl_result = $resfl->content; while ($resfl_result =~ m{<li><a href='catalog\.aspx\?rubricId=(\d+)&f +ormat=png&width=220&height=185&cityId=\d+'>([^<]+)</a></li>}g) { my ($fl_rubrik_id, $fl_rubrik_name) = ($1, $2); #get second level links my $ressl = $mech->get( "http://somesite.ru/catalog.aspx?rubricId= +$fl_rubrik_id&format=png&width=220&height=185&cityId=$city_id" ); my $ressl_result = $ressl->content; while ($ressl_result =~ m{<li><a href='catalog\.aspx\?rubricId=(\d ++)&format=png&width=220&height=185&cityId=\d+'>([^<]+)</a></li>}g) { my ($sl_rubrik_id, $sl_rubrik_name) = ($1, $2); #get third level links my $resfil = $mech->get("http://somesite.ru/catalog.aspx?rubricId= +$sl_rubrik_id&format=png&width=220&height=185&cityId=$city_id"); my $resfil_result = $resfil->content; while ($resfil_result =~ m{<li><a href='catalog\.aspx\?rubricId=(\ +d+)&format=png&width=220&height=185&cityId=\d+'>([^<]+)</a></li>}g) {
print "=" x 20, "\n"; my ($fil_rubrik_id, $fil_rubrik_name) = ($1, $2); my $offset = 0; #get firms ORG_LIST: my $resorgs = $mech->get("http://somesite.ru/catalog.aspx?rubr +icId=$fil_rubrik_id&format=png&width=220&height=185&cityId=$city_id&o +ffset=" . $offset++); my (@firms) = $resorgs->content =~ m{<li><a href='catalog\.asp +x\?firmId=(\d+)&format=png&width=220&height=185&cityId=$city_id'}g; foreach my $firm_id (@firms) { my $res = $mech->get("http://somesite.ru/catalog.aspx?firmId=$ +firm_id&format=png&width=220&height=185&cityId=$city_id"); my ($name) = $res->content =~ m{<h1>([^<]+)</h1>}s; #name #get sections with different addresses my $result = $res->content; while ($result =~ m{<p></p>(\s+<p>.+?</p>\s+<p>.+?</p>\s+<p>.+ +?</p>)}sg) { my $firm = $1; my ($address) = $firm =~ m{class="address">([^<]+)</a>}s; my ($phone) = $firm =~ m{<p>\s+<a href='map\.aspx?[^>]+>[^ +<]+</a>\s+</p>\s+<p>\s+(.+?)</p>}s; if ($phone) { $phone =~ s/\r\n/ /g; $phone =~ s/<br>/; /g; } my ($www) = $firm =~ m{<a href="http://[^"]+" target="_bla +nk">([^<]+)</a>}s; #URL my ($email) = $firm =~ m{<a href="mailto:[^"]+">([^<]+)<br +>}s; #URL foreach ($fl_rubrik_name, $sl_rubrik_name, $fil_rubrik_nam +e, $city, $name, $address, $phone, $www, $email) { if ($_) { s/^\s+//g; s/\s+$//g; s/\s+/ /g; } else { $_ = ""; } } open TEST, ">>", "_$city_id.txt" or die $!; print TEST join("\t", ($fl_rubrik_name, $sl_rubrik_name, $ +fil_rubrik_name, $city, $name, $address, $phone, $www, $email)), "\n" +; close TEST; } } if ($resorgs->content =~ m{<img src="images/but_redo\.gif" bor +der="0">}) { goto ORG_LIST; } } } }
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: Out of Memory problem
by perrin (Chancellor) on Jun 17, 2007 at 05:04 UTC | |
by Gangabass (Vicar) on Jun 17, 2007 at 14:50 UTC | |
|
Re: Out of Memory problem
by Anonymous Monk on Jun 17, 2007 at 01:54 UTC | |
by Gangabass (Vicar) on Jun 17, 2007 at 02:16 UTC | |
by naikonta (Curate) on Jun 17, 2007 at 11:04 UTC | |
by Gangabass (Vicar) on Jun 17, 2007 at 15:07 UTC |