OK, I try it. Nothing changes :-(
Here is my new code:
#!/usr/bin/perl
use strict;
use warnings;
use WWW::Mechanize;
my $city_id = 9;
my $main_catalogue_url = "http://somesite.ru/catalog.aspx?&cityId=$cit
+y_id";
my $city = "CityName";
my $mech = WWW::Mechanize->new();
#$mech->stack_depth(0); #i think this is a problem
$mech->get( $main_catalogue_url );
#getting first level links
my (@L1_list) = $mech->find_all_links( url_regex => qr/catalog\.aspx\?
+rubricId=\d+/io ); #i add /o switch. why? i'm don't know :-)
foreach my $L1_link (@L1_list) {
my $L1_rubrik = $L1_link->text(); #first level rubrik name
$mech->get( $L1_link->url() );
#getting second level links
my (@L2_list) = $mech->find_all_links( url_regex => qr/catalog\.as
+px\?rubricId=\d+/io ); #/o switch again. is this right?
foreach my $L2_link (@L2_list) {
my $L2_rubrik = $L2_link->text(); #second level rubrik name
$mech->get( $L2_link->url() );
#getting third level links
my (@L3_list) = $mech->find_all_links( url_regex => qr/catalog\.as
+px\?rubricId=\d+/io ); #yes, /o switch
foreach my $L3_link (@L3_list) {
print "=" x 20, "\n";
my $L3_rubrik = $L3_link->text(); #third level rubrik name
my $offset = 0;
#getting rubriks results
ORG_LIST:
my $firms = $mech->get( $L3_link->url() . "&offset=" . $offset
+++ );
my (@firms) = $mech->find_all_links( url_regex => qr/catalog\.
+aspx\?firmId=\d+/io );
foreach my $FIRM_link (@firms) {
my $res = $mech->get( $FIRM_link->url() );
my ($name) = $res->content =~ m{<h1>([^<]+)</h1>}so;
my $result = $res->content;
while ($result =~ m{<p></p>(\s+<p>.+?</p>\s+<p>.+?</p>\s+<p>.+
+?</p>)}sgo) {
my $firm = $1;
my ($address) = $firm =~ m{class="address">([^<]+)</a>}so;
my ($phone) = $firm =~ m{<p>\s+<a href='map\.aspx?[^>]+>[^
+<]+</a>\s+</p>\s+<p>(.+?)</p>}so;
if ($phone) {
$phone =~ s/\r\n/ /go;
$phone =~ s/<br>/; /go;
}
my ($www) = $firm =~ m{<a href="http://[^"]+" target="_bla
+nk">([^<]+)</a>}so;
my ($email) = $firm =~ m{<a href="mailto:[^"]+">([^<]+)</a
+>}so;
foreach ($L1_rubrik, $L2_rubrik, $L3_rubrik, $city, $name,
+ $address, $phone, $www, $email) {
if ($_) {
s/^\s+//go;
s/\s+$//go;
s/\s+/ /go;
}
else {
$_ = "";
}
}
open TEST, ">>", "2gis_$city_id.txt" or die $!;
print TEST join("\t", ($L1_rubrik, $L2_rubrik, $L3_rubrik,
+ $city, $name, $address, $phone, $www, $email)), "\n";
close TEST;
}
}
if ($firms->content =~ m{<img src="images/but_redo\.gif" borde
+r="0">}o) {
goto ORG_LIST;
}
}
}
}
Can you explain how usage of regex lead to out of memory (in my situation)? |