in reply to Re: Out of Memory problem
in thread Out of Memory problem

OK, I'll try $mech->links. But can you show me the right way to use regexp in my situation?

Replies are listed 'Best First'.
Re^3: Out of Memory problem
by naikonta (Curate) on Jun 17, 2007 at 11:04 UTC
    Please, try it first. The docs is full of examples. There are also perlre and perlretut. Feel free to ask after you try and something doesn't work as you expect, and don't forget to include relevant code snippet.

    Open source softwares? Share and enjoy. Make profit from them if you can. Yet, share and enjoy!

      OK, I try it. Nothing changes :-(
      Here is my new code:
      #!/usr/bin/perl use strict; use warnings; use WWW::Mechanize; my $city_id = 9; my $main_catalogue_url = "http://somesite.ru/catalog.aspx?&cityId=$cit +y_id"; my $city = "CityName"; my $mech = WWW::Mechanize->new(); #$mech->stack_depth(0); #i think this is a problem $mech->get( $main_catalogue_url ); #getting first level links my (@L1_list) = $mech->find_all_links( url_regex => qr/catalog\.aspx\? +rubricId=\d+/io ); #i add /o switch. why? i'm don't know :-) foreach my $L1_link (@L1_list) { my $L1_rubrik = $L1_link->text(); #first level rubrik name $mech->get( $L1_link->url() ); #getting second level links my (@L2_list) = $mech->find_all_links( url_regex => qr/catalog\.as +px\?rubricId=\d+/io ); #/o switch again. is this right? foreach my $L2_link (@L2_list) { my $L2_rubrik = $L2_link->text(); #second level rubrik name $mech->get( $L2_link->url() ); #getting third level links my (@L3_list) = $mech->find_all_links( url_regex => qr/catalog\.as +px\?rubricId=\d+/io ); #yes, /o switch foreach my $L3_link (@L3_list) { print "=" x 20, "\n"; my $L3_rubrik = $L3_link->text(); #third level rubrik name my $offset = 0; #getting rubriks results ORG_LIST: my $firms = $mech->get( $L3_link->url() . "&offset=" . $offset +++ ); my (@firms) = $mech->find_all_links( url_regex => qr/catalog\. +aspx\?firmId=\d+/io ); foreach my $FIRM_link (@firms) { my $res = $mech->get( $FIRM_link->url() ); my ($name) = $res->content =~ m{<h1>([^<]+)</h1>}so; my $result = $res->content; while ($result =~ m{<p></p>(\s+<p>.+?</p>\s+<p>.+?</p>\s+<p>.+ +?</p>)}sgo) { my $firm = $1; my ($address) = $firm =~ m{class="address">([^<]+)</a>}so; my ($phone) = $firm =~ m{<p>\s+<a href='map\.aspx?[^>]+>[^ +<]+</a>\s+</p>\s+<p>(.+?)</p>}so; if ($phone) { $phone =~ s/\r\n/ /go; $phone =~ s/<br>/; /go; } my ($www) = $firm =~ m{<a href="http://[^"]+" target="_bla +nk">([^<]+)</a>}so; my ($email) = $firm =~ m{<a href="mailto:[^"]+">([^<]+)</a +>}so; foreach ($L1_rubrik, $L2_rubrik, $L3_rubrik, $city, $name, + $address, $phone, $www, $email) { if ($_) { s/^\s+//go; s/\s+$//go; s/\s+/ /go; } else { $_ = ""; } } open TEST, ">>", "2gis_$city_id.txt" or die $!; print TEST join("\t", ($L1_rubrik, $L2_rubrik, $L3_rubrik, + $city, $name, $address, $phone, $www, $email)), "\n"; close TEST; } } if ($firms->content =~ m{<img src="images/but_redo\.gif" borde +r="0">}o) { goto ORG_LIST; } } } }
      Can you explain how usage of regex lead to out of memory (in my situation)?