#!/usr/bin/perl
# Demo: take a feed item whose text arrives as undecoded UTF-8 (hidden behind
# HTML entities in the teaser, raw bytes in the title), turn both fields into
# Perl character strings, and print them.
use strict;
use warnings;
use open ':locale';     # tell perl to use the $LANG environment encoding for
                        # STDOUT/IN/ERR; check 'man 3 open' for details, but
                        # this is quite important
use HTML::Entities;     # used to decode HTML &...; entities
use Encode qw(decode);  # used to decode utf8/iso bytes into perl's internal
                        # character representation

# NOTE(review): the teaser literal below looks as if it passed through an HTML
# renderer on its way here -- the stray "â"/"Â" characters were presumably
# entities (something like &acirc;&#128;&#152;) in the original feed.
# Confirm against the real data before relying on the printed output.
my $data = {
    'href'   => 'http://www.accountancyage.com/accountancyage/news/2159769/kpmg-sets-retail-think-tank',
    'teaser' => '
AccountancyAge.com, Accountancy Age, Thursday 6 July 2006 at 00:00:00
Firm forms partnership with retail research group
KPMG has launched the âRetail Think Tankâ (RTT) aimed at establishing âthe true health and status\'Â of the retail sector. The Big Four firm has joined forces with retail research group...
',
    'title'  => "KPMG sets up retail \x{e2}\x{80}\x{98}think tank\x{e2}\x{80}\x{99}",
};

my $html = $data->{teaser};
decode_entities($html);            # because it's html, we need to do this first
                                   # (void context: decodes $html in place)
$html = decode('utf8', $html);     # now 'parse' the utf8

my $title = $data->{title};        # this is 'raw' utf8; the \x{e2} sequences indicate this
$title = decode('utf8', $title);   # so just parse it

print "** $title:\n";
print "$html\n";
####
#!/usr/bin/perl
# Parse the saved observations page IDQ60606.shtml, group its table rows by
# region, dump the parsed structure, and print the average of the "rain"
# column for every region.
use warnings;
use strict;
use HTML::TreeBuilder;
use Data::Dumper;
use Scalar::Util qw(looks_like_number);

my $tree = HTML::TreeBuilder->new_from_file('IDQ60606.shtml');

# Column names, in the order the cells appear in a data row.
my @cellnames = qw(
    station time temperature dewpoint
    relhumidity deltat
    wind_dir speedkmh gustkmh speedknt gustknt
    pressure rain
);

my $region;    # text of the most recent single-cell (heading) row
my %data;      # region name => array ref of row hashes keyed by @cellnames

for my $row ($tree->look_down('_tag' => 'tr')) {
    my @cells = $row->look_down('_tag' => 'td');
    print scalar @cells, "\n";    # diagnostic: number of cells in this row

    if (@cells == 1) {
        # A row with a single cell is taken to be a region heading.
        $region = $cells[0]->as_trimmed_text;
    }
    elsif (@cells == @cellnames) {
        my %row;
        @row{@cellnames} = map { $_->as_trimmed_text } @cells;

        # A data row seen before any heading is filed under the empty
        # string instead of using undef as a hash key (which warns).
        my $key = defined $region ? $region : '';
        push @{ $data{$key} }, \%row;
    }
}

# Everything needed has been copied out as plain text; release the tree.
$tree->delete;

print "$_\n" for keys %data;
print Dumper \%data;

while (my ($name, $rows) = each %data) {
    next unless @$rows;    # nothing to average

    my $raintotal = 0;
    for my $obs (@$rows) {
        my $rain = $obs->{rain};

        # '-' and any other non-numeric cell count as zero rainfall;
        # such rows still count towards the average.
        $rain = 0 unless looks_like_number($rain);
        $raintotal += $rain;
    }

    my $rainaverage = $raintotal / @$rows;
    print "$name: $rainaverage\n";
}
##
##
#!/usr/bin/perl
# Print the trimmed text of every <td> whose class attribute is "rowlevel1"
# in the saved page IDQ60606.shtml, one cell per line.
use strict;
use warnings;
use HTML::TreeBuilder;

my $tree = HTML::TreeBuilder->new_from_file('IDQ60606.shtml');

for my $cell ( $tree->look_down( '_tag' => 'td', 'class' => 'rowlevel1' ) ) {
    print $cell->as_trimmed_text, "\n";
}
##
##
PENINSULA
GULF COUNTRY
NORTHERN GOLDFIELDS and UPPER FLINDERS
NORTH TROPICAL COAST and TABLELANDS
HERBERT and LOWER BURDEKIN
CENTRAL COAST - WHITSUNDAYS
CAPRICORNIA
CENTRAL HIGHLANDS - COALFIELDS
CENTRAL WEST
NORTHWEST
CHANNEL COUNTRY
MARANOA and WARREGO
DARLING DOWNS and GRANITE BELT
WIDE BAY and BURNETT
SOUTHEAST COAST
CORAL SEA
##
##
# All directories in our parent directory, as name => path:
#   %dirs = ( foo => '../foo', bar => '../bar', ... )
# glob's "*" does not match names starting with a dot, so hidden
# directories are not included.
my %dirs;
for my $path ( grep { -d } glob '../*' ) {
    # Everything after the last slash is the directory's own name.
    my ($name) = $path =~ m{^.*/(.*)};
    $dirs{$name} = $path;
}
##
##
# De-crapper (for use after Word HTML idiocy)
# Reads all input (files named on the command line, or STDIN) into one
# string, deletes Word-generated markup from it, and prints what is left.
#
# NOTE(review): several substitutions below have an EMPTY pattern. The
# original patterns were presumably markup-like text (comments, tags) that
# got eaten when this snippet was rendered as HTML -- recover them from the
# original before using this. In Perl an empty pattern re-uses the last
# successfully matched pattern (or matches the empty string), and since the
# replacement is also empty these lines appear to have no effect as written.
my $file = join '',<>;
$file =~ s///gms;
$file =~ s///gms;
# Remove opening and closing tags in the "o:" namespace (<o:p>, </o:p>, ...).
$file =~ s/<\/?o:.*?>//gms;
$file =~ s///gms;
# As written this deletes every newline; NOTE(review): probably the tail of
# a longer pattern whose leading part was lost -- confirm.
$file =~ s/
//gms;
$file =~ s///gms;
# Remove opening and closing <span ...> tags (their contents are kept).
$file =~ s/<\/?span.*?>//gms;
print $file;
####
# Password generator
# Builds a random password of 8 to 11 characters drawn from a few
# punctuation marks, the digits and the ASCII letters.
#
# NOTE(review): rand() is not cryptographically secure; fine for throw-away
# passwords, but use a proper random source for anything that matters.
my @chars = ('.','!','#','@','$','/',0..9,'A'..'Z','a'..'z');

# Truncate explicitly so $length really is the password length (8..11)
# instead of a fractional value that the range operator silently rounds down.
my $length = 8 + int rand 4;

# One uniformly chosen character per position; int() makes the index a
# whole number rather than relying on implicit truncation in the subscript.
my $pw = join '', map { $chars[ int rand @chars ] } 1 .. $length;
##
##
# Java namestyle to SQL namestyle regex
# NOTE(review): the statement below is truncated -- only the start of the
# substitution survives and it has no closing delimiters, so it does not
# parse as written. The rest of the pattern (apparently beginning with a
# "(?<...") was presumably swallowed when the snippet was rendered as HTML.
# Restore it from the original source; the intended regex cannot be
# determined from what is left here.
s/(?##
# Environment dumper
# Prints every environment variable as NAME=>VALUE, one per line, in the
# order each() hands them out.
while ( my ( $name, $value ) = each %ENV ) {
    printf "%s=>%s\n", $name, $value;
}