| Category: | Information retrieval |
| Author/Contact Info | Juerd |
| Description: | Two years ago, I complained that http://www.ns.nl/ didn't work with my user agent and wrote a screen scraping script. They recently updated their site, which now works perfectly with both Firefox and Konqueror, but now I had another reason for complaining: my script no longer worked! Fortunately, in the past two years, WWW::Mechanize was made (a very good thing, as their new site uses sessions!) and the NS learned to write HTML, so I can use a parser. The code is almost the same size, but a lot better readable. |
#!/usr/bin/perl -w
use strict;
use WWW::Mechanize;
use HTML::TreeBuilder;
use POSIX qw(strftime);
use Text::Table;
$| = 1;
our $VERSION = '2.00';
my $batch = $ENV{BATCH};
my $be = $batch ? "...\n" : "";
my $defaultfrom = 'Dordrecht';
sub mydie {
print STDERR @_;
exit 255;
}
$SIG{__DIE__} = sub {
local $_ = "@_";
/look_down/ and mydie "Unexpected HTML output.\n";
mydie $_;
};
sub input {
my ($prompt, $default, $pattern, $force) = @_;
my ($input, @input);
if (defined $force){
$force = $default if $force eq '';
unless (@input = $force =~ $pattern){
mydie "'$force' is an invalid value for '$prompt'.\n";
}
return @input;
}
do{
print "$prompt [$default]: ";
$input = <STDIN>;
mydie "Aborted.\n" if not defined $input;
chomp $input;
$input = $default if $input eq '';
} until @input = $input =~ $pattern;
return @input;
}
my @param = @ARGV && $ARGV[0] eq '-p' ? <STDIN> : @ARGV;
chomp @param;
if ((@param or $batch) and @param != 5) {
print <<" EOU";
Dutch Railways train schedule information, version $VERSION.
Made by Juerd <juerd\@juerd.nl>
Use this at your own risk.
Usage: $0
Usage: echo -e "\$FROM\\n\$TO\\n\$DATE\\n\$TIME\\n\$DEPARTURE"
+ | $0 -p
Usage: $0 \$TO \$FROM \$DATE \$TO \$DEPARTURE
Example: $0 'Rotterdam Centaal' 'Den Helder' '' 17:00 A
(Empty parameters take defaults, date/time default to
current local.)
EOU
exit 1;
}
my ($from) = input('From', $defaultfrom, qr/^(.+)$/, $param[0]);
my ($to) = input('To', '', qr/^(.+)$/, $param[1]);
my ($dd, $mm, $yyyy) =
input(
'Date (dd/mm/yyyy)',
strftime('%d/%m/%Y', localtime),
qr/^(3[01]|[012]?\d)\W(1[012]|0?\d)(?:\W((?:\d\d)?\d\d))?$/,
$param[2]
);
my ($hh, $MM) =
input(
'Time',
strftime('%H:%M', localtime),
qr/^(2[0-3]|[01]?\d)\W?([0-5]\d)$/,
$param[3]
);
my ($da) = input('(D)eparture or (A)rrival', 'D', qr/^([VDAvda01])/,$p
+aram[4]);
$da = $da =~ /[Aa1]/ ? 0 : 1;
$yyyy ||= 1900 + (localtime)[5];
$yyyy = "20$yyyy" if length($yyyy) == 2;
my $mech = WWW::Mechanize->new(onerror => \&mydie);
print "* Getting first page$be";
$mech->get('http://www.ns.nl/');
print ".\n" unless $batch;
if ($mech->content =~ /"0;URL=([^"]+)"/) {
print "* Following META redirect$be";
$mech->get($1);
print ".\n" unless $batch;
}
print "* Submitting form$be";
$mech->submit_form(
form_name => 'snelPlanner',
fields => {
vanStation => $from,
naarStation => $to,
reisdatumDag => $dd,
reisdatumMaand => $mm,
reisdatumJaar => $yyyy,
reisdatumUur => $hh,
reisdatumMinuut => $MM,
reisdatumVertrekAankomst => ($da ? 'true' : 'false'),
}
);
print ".\n" unless $batch;
if (my ($form) = grep { $_->attr('name') || '' eq 'thisForm' } $mech->
+forms) {
print "* Fixing input: ";
$mech->form_name('thisForm');
for (qw/vanStation naarStation/) {
my ($input) = $form->find_input($_);
my @pv = $input->possible_values or next;
my @vn = $input->value_names;
my $i = 0;
my %vn = map { $_ => $i++ } @vn;
my ($name) = grep /centraal/i, @vn;
my $value = defined $name ? $pv[$vn{$name}] : '';
$value ||= $pv[0];
print "selecting $vn[defined $name ? $vn{$name} : 0], ";
$mech->field($_, $value);
}
print "submitting$be";
$mech->submit;
print ".\n" unless $batch;
}
my $content = $mech->content;
if ($content =~ /<!-- Error: (.*) -->/) {
mydie "Error from server: $1\n";
}
print "\n";
print $content =~ /(Reisadvies van .*)/, "\n";
print $content =~ /((?:Vertrek|Aankomst): .*)/, "\n\n";
my $tree = HTML::TreeBuilder->new_from_content($content);
my $table = $tree->look_down(_tag => 'table', class => 'advice');
my @table =
grep "@$_" =~ /[^\s\240]/,
map [ map $_->as_text, $_->look_down(_tag => qr/^t[hd]$/) ],
$table->look_down(_tag => 'tr');
pop @table;
for (@table) {
splice @$_, 2, 1;
splice @$_, 0, 1;
splice @$_, 1, 0, splice @$_, 2, 1;
}
my $tt = Text::Table->new(@{ shift @table });
$tt->load(@table);
print $tt->title, $tt->body, "\n";
$table = $tree->look_down(_tag => 'table', class => 'prices');
@table =
map [ map $_->as_text, $_->look_down(_tag => qr/^t[hd]$/) ],
$table->look_down(_tag => 'tr');
splice @{ $table[0] }, 2, 0, '';
splice @{ $table[0] }, 4, 0, '';
$tt = Text::Table->new(@{ shift @table });
$tt->load(@table);
print $tt->title, $tt->body;
|
|
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: Dutch Railways train schedule information 2.00
by Joost (Canon) on Jul 15, 2004 at 15:03 UTC | |
by Juerd (Abbot) on Jul 15, 2004 at 15:24 UTC |