#!/usr/bin/perl -w
use strict;
use LWP::UserAgent;
use HTTP::Cookies;
system 'del BechtleResult.txt';
initHttp();
login();
exit 0;
sub login{
local $1;
local $2;
my $content;
$content = sendRequest('GET', 'http://www.bechtle.be/');
$content =~ m@\([^<]+?)@g ) {
$url = $1;
my $desc = $2;
if ($count > 0) { #we ignore the first node since its the title
print "> $desc\n";
my $rs = sendRequest('GET',$url);
parseMainNodes($desc, $rs);
}
$count ++;
}
}
sub parseMainNodes {
local $1;
local $2;
#my ($category, $content2) = @_;
my $category = shift;
my $content2 = shift;
while ($content2=~ m@javascript:self.parent.sendRequest\('([^']*expand=true[^']*)'\).*?>([^<]+?)@g ) {
my $url = $1;
my $desc = $2;
#print "$url\n";
print "> $category -> $desc\n";
my $rs = sendRequest('GET',$url);
#print $url."\n";
parseSubNodes($category, $desc, $rs);
#print $url."\n";
}
}
sub parseSubNodes {
local $1;
local $2;
#my ($category, $section, $content1) = @_;
my $category = shift;
my $section = shift;
my $content1 = shift;
$content1 =~ s@.*@@s; #remove the junk after the tree section
while ($content1=~ m@javascript:self.parent.sendRequest\('([^']*nodechip_id=[^']*)'\).*?>([^<]+?)@g ) {
htmDump($content1);
my $url = $1;
my $desc = $2;
print "> $category -> $section -> $desc\n";
getLstPage($url);
}
}
sub getLstPage{
local $1;
local $2;
my $prdLstUrl = shift;
my $rs1 = sendRequest('GET',$prdLstUrl);
#lstPgDump($rs);
extractPrd($rs1);
}
######################
sub extractPrd{
local $1;
local $2;
my $lstPgDump = shift;
print "fetching product page \n";
$lstPgDump =~ s!.*?Bestellen!!s;
$lstPgDump =~ s!(.*Op fabrikant).*!$1!sg;
my @prdLine = split (/
/, $lstPgDump);
shift @prdLine;
foreach my $prditem(@prdLine){
#print $prditem."\n";
processitem($prditem);
}
#print $lstPgDump;
#htmDump($lstPgDump);
}
sub processitem{
my $item = shift;
my @resLine;
#desc
if ($item =~ /title="([^"]*)"/){
my $dsc = $1;
push (@resLine, $dsc);
}
#pnb
if ($item =~ m!.*?<.*?>(.*?)[ \s\t]+(\&euro\;\ \;)*(.*?);
close OUT;
return $html;
}
sub resDump{
my $reultLine = shift;
open RESOUT, '>>BechtleResult.txt';
print RESOUT "\"$reultLine\"\n";
close RESOUT;
}
################################3
sub htmDump{
my $html = shift;
open OUT, '>htmlDump.htm';
print OUT $html;
close OUT;
}
sub lstPgDump{
my $lstPg = shift;
open OUT, '>lstPgDump.htm';
print OUT $lstPg;
close OUT;
}
#############################################
my $lasturl; #last url fetched
my $ua; #user agent for http requests
my $cookies; #http cookies
#############################################
####
#inits the useragent and inits the cookies file
sub initHttp {
$ua = LWP::UserAgent->new();
$ua->agent('Mozilla/5.0');
#$ua->proxy(http => 'http://mahasen:8080');
my $cookies = new HTTP::Cookies();
$ua->cookie_jar($cookies);
}
sub sendRequest {
#print "Sleeping...\n";
#sleep(2);
#print "Continuning...\n";
my ($method, $url, $content, $referer) = @_;
#$method = shift;
#$url = shift;
#$content = shift;
#$referer = shift;
$referer=$lasturl unless defined($referer);
my ($request, $response);
$request = HTTP::Request->new($method, $url);
if (defined($content)) {
$request->content($content);
$request->content_type("application/x-www-form-urlencoded");
$request->content_length(length($content));
}
if (defined($referer)) {
$request->referer($referer);
}
$request->header(Connection => "close"); #force no keep alive
$response = $ua->request($request);
#if the response has a refresh 0 then handle that as well
my $refresh = $response->header("Refresh");
if (defined($refresh)) {
$refresh =~ s!0; *URL=!!i;
$refresh = makeurl($response->request->url, $refresh);
$request = HTTP::Request->new("GET", $refresh);
$response = $ua->request($request);
}
#if the resoponse has a Location then handle that as well
$refresh = $response->header("Location");
if (defined($refresh)) {
$refresh = makeurl($response->request->url, $refresh);
$request = HTTP::Request->new("GET", $refresh);
$response = $ua->request($request);
}
#print $response->as_string;
# if ($response->code==207 || $response->code==200 || $response->code==301) {
if ($response->code==207 || $response->code==200) {
$lasturl = $response->request->url;
# return $response->as_string;
return $response->content;
} else {
print "ERROR : Request \'$url\' Failed (".$response->status_line.")!\n";
print "Last URL : ".$response->request->url."\n";
return undef;
}
}
sub makeurl {
my $currenturl = shift;
my $newurl = shift;
if ($newurl =~ /^http[s]?:/) { #make sure that the newurl is not a absolute one
return $newurl;
} elsif ($newurl =~ /^\//) { #if the new url starts with a / then we ignore the current path and take the hostname only
$currenturl =~ m!([^:]+://[^/]+)!;
$currenturl = $1.$newurl;
return $currenturl;
} else { #if the new url doesnt start with a / then we discard the last page of current url and append the new url
$currenturl =~ m!([^:]+://.+/)!;
$currenturl = $1.$newurl;
return $currenturl;
}
}
|