Kanishka has asked for the wisdom of the Perl Monks concerning the following question:
#!/usr/bin/perl -w use strict; use LWP::UserAgent; use HTTP::Cookies; system 'del BechtleResult.txt'; initHttp(); login(); exit 0; sub login{ local $1; local $2; my $content; $content = sendRequest('GET', 'http://www.bechtle.be/'); $content =~ m@\<frame frameborder="0" name="[^"]*" noresize scroll +ing="auto"[ \n\r\t]*?src="([^"]*)"@s; # [ \t\n\r]*?src="([^"]*)" my $url = $1; $content = sendRequest('GET',$url); my $count = 0; while ($content=~ m@javascript:self.parent.sendRequest\('([^']*)'\ +).*?>([^<]+?)</a>@g ) { $url = $1; my $desc = $2; if ($count > 0) { #we ignore the first node since its the titl +e print "> $desc\n"; my $rs = sendRequest('GET',$url); parseMainNodes($desc, $rs); } $count ++; } } sub parseMainNodes { local $1; local $2; #my ($category, $content2) = @_; my $category = shift; my $content2 = shift; while ($content2=~ m@javascript:self.parent.sendRequest\('([^']*ex +pand=true[^']*)'\).*?>([^<]+?)</a>@g ) { my $url = $1; my $desc = $2; #print "$url\n"; print "> $category -> $desc\n"; my $rs = sendRequest('GET',$url); #print $url."\n"; parseSubNodes($category, $desc, $rs); #print $url."\n"; } } sub parseSubNodes { local $1; local $2; #my ($category, $section, $content1) = @_; my $category = shift; my $section = shift; my $content1 = shift; $content1 =~ s@<!-- Produktvergleich -->.*@@s; #remove the junk af +ter the tree section while ($content1=~ m@javascript:self.parent.sendRequest\('([^']*no +dechip_id=[^']*)'\).*?>([^<]+?)</a>@g ) { htmDump($content1); my $url = $1; my $desc = $2; print "> $category -> $section -> $desc\n"; getLstPage($url); } } sub getLstPage{ local $1; local $2; my $prdLstUrl = shift; my $rs1 = sendRequest('GET',$prdLstUrl); #lstPgDump($rs); extractPrd($rs1); } ###################### sub extractPrd{ local $1; local $2; my $lstPgDump = shift; print "fetching product page \n"; $lstPgDump =~ s!.*?Bestellen!!s; $lstPgDump =~ s!(.*Op fabrikant).*!$1!sg; my @prdLine = split (/<td style="text-align: center;">/, $lstPgDum +p); shift @prdLine; foreach my $prditem(@prdLine){ #print $prditem."\n"; processitem($prditem); } #print $lstPgDump; #htmDump($lstPgDump); } sub processitem{ my $item = shift; my @resLine; #desc if ($item =~ /title="([^"]*)"/){ my $dsc = $1; push (@resLine, $dsc); } #pnb if ($item =~ m!<span class="small">.*?<.*?>(.*?)<!s){ my $pnb = $1; push (@resLine, $pnb); } #price if ($item =~ m!<span class="price_brutto">[ \s\t]+(\&euro\;\ \ +;)*(.*?)<!s){ my $price = $2; $price =~ s/[\s]*$//; push (@resLine, $price); } my $result = (join('","',@resLine )); print "\"$result\"\n"; resDump($result); } sub shtmLoad { open OUT, '<lstPgDump.htm'; local $/ = undef; my $html = <OUT>; close OUT; return $html; } sub resDump{ my $reultLine = shift; open RESOUT, '>>BechtleResult.txt'; print RESOUT "\"$reultLine\"\n"; close RESOUT; } ################################3 sub htmDump{ my $html = shift; open OUT, '>htmlDump.htm'; print OUT $html; close OUT; } sub lstPgDump{ my $lstPg = shift; open OUT, '>lstPgDump.htm'; print OUT $lstPg; close OUT; } ############################################# my $lasturl; #last url fetched my $ua; #user agent for http requests my $cookies; #http cookies ############################################# #### #inits the useragent and inits the cookies file sub initHttp { $ua = LWP::UserAgent->new(); $ua->agent('Mozilla/5.0'); #$ua->proxy(http => 'http://mahasen:8080'); my $cookies = new HTTP::Cookies(); $ua->cookie_jar($cookies); } sub sendRequest { #print "Sleeping...\n"; #sleep(2); #print "Continuning...\n"; my ($method, $url, $content, $referer) = @_; #$method = shift; #$url = shift; #$content = shift; #$referer = shift; $referer=$lasturl unless defined($referer); my ($request, $response); $request = HTTP::Request->new($method, $url); if (defined($content)) { $request->content($content); $request->content_type("application/x-www-form-urlencoded"); $request->content_length(length($content)); } if (defined($referer)) { $request->referer($referer); } $request->header(Connection => "close"); #force no keep alive $response = $ua->request($request); #if the response has a refresh 0 then handle that as well my $refresh = $response->header("Refresh"); if (defined($refresh)) { $refresh =~ s!0; *URL=!!i; $refresh = makeurl($response->request->url, $refresh); $request = HTTP::Request->new("GET", $refresh); $response = $ua->request($request); } #if the resoponse has a Location then handle that as well $refresh = $response->header("Location"); if (defined($refresh)) { $refresh = makeurl($response->request->url, $refresh); $request = HTTP::Request->new("GET", $refresh); $response = $ua->request($request); } #print $response->as_string; # if ($response->code==207 || $response->code==200 || $response->code +==301) { if ($response->code==207 || $response->code==200) { $lasturl = $response->request->url; # return $response->as_string; return $response->content; } else { print "ERROR : Request \'$url\' Failed (".$response->status_line." +)!\n"; print "Last URL : ".$response->request->url."\n"; return undef; } } sub makeurl { my $currenturl = shift; my $newurl = shift; if ($newurl =~ /^http[s]?:/) { #make sure that the newurl is not a a +bsolute one return $newurl; } elsif ($newurl =~ /^\//) { #if the new url starts with a / then we + ignore the current path and take the hostname only $currenturl =~ m!([^:]+://[^/]+)!; $currenturl = $1.$newurl; return $currenturl; } else { #if the new url doesnt start with a / then we discard the l +ast page of current url and append the new url $currenturl =~ m!([^:]+://.+/)!; $currenturl = $1.$newurl; return $currenturl; } }
Edited by planetscape - added readmore tags
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: Is this a memory rewriting or else....!
by Tanktalus (Canon) on Jan 24, 2006 at 05:07 UTC | |
|
Re: Is this a memory rewriting or else....!
by ikegami (Patriarch) on Jan 24, 2006 at 05:49 UTC | |
by davido (Cardinal) on Jan 24, 2006 at 06:33 UTC |