#!/usr/bin/perl -w
#
# Extract the details of a single property listing from a saved copy of a
# realestate.com.au "rsearch" page and print them as "Label: value" lines.
#
# The page is read from $HOME/rsearch.html. The live fetch through
# WWW::Mechanize is kept below, commented out, for reference.
#
# Every mandatory page element is looked up in turn; the script dies with
# "Cannot find <name>" as soon as one of them is missing.

use strict;
use warnings;
use diagnostics;

use WWW::Mechanize;
use HTML::TreeBuilder;

# Scraped fields. They are file-scoped because print_all() reads them
# directly instead of taking arguments.
my ($propid, $address, $price, $header, $summary);
my ($status, $description, $agent);
my ($officephone, $officefax, $proptype, $bed, $bath);
my ($land, $carnumport, $carnumgar, $municipality);
my ($title);

#my $mech = WWW::Mechanize->new( autocheck => 1 );
#my $url = "http://www.realestate.com.au/cgi-bin/rsearch?a=o&id=106023887";
#$mech->get($url);

# Use local copy of page
my $file_copy = $ENV{HOME}."/rsearch.html";

# Report an unreadable file as such. Depending on the HTML::TreeBuilder
# version, the failure may otherwise only show up further down as a
# misleading "Cannot find propertyDetail".
die("Cannot read $file_copy") if !-r $file_copy;

#my $root = HTML::TreeBuilder->new_from_content($mech->{content});
my $root = HTML::TreeBuilder->new_from_file($file_copy);

# The page title is optional.
my $titleTag = $root->look_down('_tag', 'title');
$title = $titleTag ? $titleTag->as_trimmed_text() : "";

# ---------------------------------------------------------------------------
# Header: property id, address, price
# ---------------------------------------------------------------------------
my $propertyDetail = $root->look_down('_tag', 'div', 'id', 'propertyDetail');
die("Cannot find propertyDetail") if !defined $propertyDetail;

my $detailsHeader = $propertyDetail->look_down(
    '_tag', 'div', 'class', qr{\bdetailsHeader\b});
die("Cannot find detailsHeader") if !defined $detailsHeader;

my $propertyID = $detailsHeader->look_down(
    '_tag', 'p', 'class', qr{\bpropertyID\b});
die("Cannot find propertyID") if !defined $propertyID;
$propid = $propertyID->as_trimmed_text();

my $addrH1 = $detailsHeader->look_down('_tag', 'h1');
die("Cannot find addrH1") if !defined $addrH1;
$address = $addrH1->as_trimmed_text();

my $priceStrong = $detailsHeader->look_down(
    '_tag', 'strong', 'class', qr{\bprice\b});
die("Cannot find priceStrong") if !defined $priceStrong;
$price = $priceStrong->as_trimmed_text();

# ---------------------------------------------------------------------------
# Textual part: headline, summary, status, description
# ---------------------------------------------------------------------------
my $textualDiv = $propertyDetail->look_down(
    '_tag', 'div', 'class', qr{\btextual\b});
die("Cannot find textualDiv") if !defined $textualDiv;

# First <h2> of the textual div, whatever its class.
my $sizeH2 = $textualDiv->look_down('_tag', 'h2');
die("Cannot find sizeH2") if !defined $sizeH2;
$header = $sizeH2->as_trimmed_text();

my $propSumH2 = $textualDiv->look_down(
    '_tag', 'h2', 'class', qr{\bpropertySummary\b});
die("Cannot find propSumH2") if !defined $propSumH2;
$summary = $propSumH2->as_trimmed_text();

# contract/offer might be missing
my $offerH3 = $textualDiv->look_down(
    '_tag', 'h3', 'class', qr{\bhighlighted\b});
$status = $offerH3 ? $offerH3->as_trimmed_text() : "";

my $descriptionDiv = $textualDiv->look_down(
    '_tag', 'div', 'class', qr{\bdescription\b});
die("Cannot find descriptionDiv") if !defined $descriptionDiv;
$description = $descriptionDiv->as_trimmed_text();

# ---------------------------------------------------------------------------
# Property summary: a <dl> of label (<dt>) / value (<dd>) pairs
# ---------------------------------------------------------------------------
my $propertySummary = $propertyDetail->look_down(
    '_tag', 'div', 'id', 'propertySummary');
die("Cannot find propertySummary") if !defined $propertySummary;

# Property summary in a definition list, no id
my $propdl = $propertySummary->look_down('_tag', 'dl');
die("Cannot find propdl") if !defined $propdl;

# Collect all dt/dd in definition list
my @propdtdd = $propdl->look_down('_tag', qr{\b(?:dt|dd)\b});
die("Empty property details list") if scalar(@propdtdd) == 0;

# Pair every <dd> with the <dt> seen just before it. Keys are the
# lower-cased labels with the trailing colon removed.
my %prop;
my $lastdt = "";
for my $node (@propdtdd) {
    next if !ref $node;

    my $tag = $node->tag();
    if ($tag eq "dt") {
        $lastdt = $node->as_trimmed_text();
        $lastdt =~ s/\s*:$//;    # strip trailing colon
    }
    elsif ($tag eq "dd" && length $lastdt) {
        # length(), not truthiness: a label of "0" is still a label.
        $prop{lc($lastdt)} = $node->as_trimmed_text();
        $lastdt = "";            # a further <dd> without its own <dt> is ignored
    }
}

# Missing entries become "". Values such as "0" are kept as they are.
$proptype     = _or_empty($prop{category});
$bed          = _or_empty($prop{bedrooms});
$bath         = _or_empty($prop{bathrooms});
$land         = _or_empty($prop{land});
$carnumport   = _or_empty($prop{carport});
$carnumgar    = _or_empty($prop{garage});
$municipality = _or_empty($prop{municipality});

# ---------------------------------------------------------------------------
# Agent contact details
# ---------------------------------------------------------------------------
my $contactAgentDetails = $root->look_down(
    '_tag', 'div', 'id', 'contactAgentDetails');
die("Cannot find contactAgentDetails") if !defined $contactAgentDetails;

# First paragraph has name
my $firstp = $contactAgentDetails->look_down('_tag', 'p');
die("Cannot find firstp") if !defined $firstp;
$agent = $firstp->as_trimmed_text();

my $officePhone = $contactAgentDetails->look_down(
    '_tag', 'p', 'class', qr{\bofficePhone\b});
die("Cannot find officePhone") if !defined $officePhone;
$officephone = $officePhone->as_trimmed_text();

my $officeFax = $contactAgentDetails->look_down(
    '_tag', 'p', 'class', qr{\bofficeFax\b});
die("Cannot find officeFax") if !defined $officeFax;
$officefax = $officeFax->as_trimmed_text();

# All text has been copied into plain scalars, so the tree can be released
# before printing.
$root->delete();

print_all();

# Return the argument unchanged when it is defined, otherwise "".
# Unlike `$value || ""`, this keeps values that are false in Perl ("0").
sub _or_empty {
    my ($value) = @_;
    return defined $value ? $value : "";
}

# Print the scraped fields to STDOUT, one "Label: value" line each.
# Takes no arguments; reads the file-scoped variables filled in above.
# NOTE(review): $land and $municipality are collected but not printed here;
# confirm whether that omission is intended before adding them.
sub print_all {
    print "PropertyID: $propid \n";
    print "Status: $status \n";
    print "Title: $title \n";
    print "Address: $address \n";
    print "Price: $price \n";
    print "Type: $proptype \n";
    print "Bedrooms: $bed \n";
    print "Bathrooms: $bath \n";
    print "Carport: $carnumport \n";
    print "Garage: $carnumgar \n";
    print "Header: $header \n";
    print "Summary: $summary \n";
    print "Description: $description \n";
    print "Agent Details: \n";
    print "Agent: $agent \n";
    print "OfficePhone: $officephone \n";
    print "OfficeFax: $officefax \n";
}