Category: Information retrieval
Author/Contact Info Juerd
Description:

Two years ago, I complained that http://www.ns.nl/ didn't work with my user agent and wrote a screen scraping script. They recently updated their site, which now works perfectly with both Firefox and Konqueror, but now I had another reason for complaining: my script no longer worked!

Fortunately, in the past two years, WWW::Mechanize was made (a very good thing, as their new site uses sessions!) and the NS learned to write HTML, so I can use a parser. The code is almost the same size, but a lot better readable.

#!/usr/bin/perl -w
use strict;
use WWW::Mechanize;
use HTML::TreeBuilder;
use POSIX qw(strftime);
use Text::Table;

$| = 1;

our $VERSION = '2.00';

my $batch = $ENV{BATCH};
my $be = $batch ? "...\n" : "";
my $defaultfrom = 'Dordrecht';

sub mydie {
    print STDERR @_;
    exit 255;
}

$SIG{__DIE__} = sub {
    local $_ = "@_";
    /look_down/ and mydie "Unexpected HTML output.\n";
    mydie $_;
};


sub input {
    my ($prompt, $default, $pattern, $force) = @_;
    my ($input, @input);
    if (defined $force){
        $force = $default if $force eq '';
        unless (@input = $force =~ $pattern){
            mydie "'$force' is an invalid value for '$prompt'.\n";
        }
        return @input;
    }
    do{
        print "$prompt [$default]: ";
        $input = <STDIN>;
        mydie "Aborted.\n" if not defined $input;
        chomp $input;
        $input = $default if $input eq '';
    } until @input = $input =~ $pattern;
    return @input;
}

my @param = @ARGV && $ARGV[0] eq '-p' ? <STDIN> : @ARGV;
chomp @param;
if ((@param or $batch) and @param != 5) {
    print <<"    EOU";
    Dutch Railways train schedule information, version $VERSION.
    Made by Juerd <juerd\@juerd.nl>
    Use this at your own risk.

        Usage: $0
        Usage: echo -e "\$FROM\\n\$TO\\n\$DATE\\n\$TIME\\n\$DEPARTURE"
+ | $0 -p
        Usage: $0 \$TO \$FROM \$DATE \$TO \$DEPARTURE

        Example: $0 'Rotterdam Centaal' 'Den Helder' '' 17:00 A
        (Empty parameters take defaults, date/time default to 
current local.)
    EOU
    exit 1;
}

my ($from) = input('From', $defaultfrom, qr/^(.+)$/, $param[0]);
my ($to)   = input('To', '', qr/^(.+)$/, $param[1]);
my ($dd, $mm, $yyyy) =
    input(
        'Date (dd/mm/yyyy)',
        strftime('%d/%m/%Y', localtime),
        qr/^(3[01]|[012]?\d)\W(1[012]|0?\d)(?:\W((?:\d\d)?\d\d))?$/,
        $param[2]
    );
my ($hh, $MM) =
    input(
        'Time',
        strftime('%H:%M', localtime),
        qr/^(2[0-3]|[01]?\d)\W?([0-5]\d)$/,
        $param[3]
    );
my ($da) = input('(D)eparture or (A)rrival', 'D', qr/^([VDAvda01])/,$p
+aram[4]);
$da = $da =~ /[Aa1]/ ? 0 : 1;

$yyyy ||= 1900 + (localtime)[5];
$yyyy = "20$yyyy" if length($yyyy) == 2;

my $mech = WWW::Mechanize->new(onerror => \&mydie);

print "* Getting first page$be";
$mech->get('http://www.ns.nl/');
print ".\n" unless $batch;

if ($mech->content =~ /"0;URL=([^"]+)"/) {
    print "* Following META redirect$be";
    $mech->get($1);
    print ".\n" unless $batch;
}

print "* Submitting form$be";
$mech->submit_form(
    form_name => 'snelPlanner',
    fields => {
        vanStation => $from,
        naarStation => $to,
        reisdatumDag => $dd,
        reisdatumMaand => $mm,
        reisdatumJaar => $yyyy,
        reisdatumUur => $hh,
        reisdatumMinuut => $MM,
        reisdatumVertrekAankomst => ($da ? 'true' : 'false'),
    }
);
print ".\n" unless $batch;

if (my ($form) = grep { $_->attr('name') || '' eq 'thisForm' } $mech->
+forms) {
    print "* Fixing input: ";
    $mech->form_name('thisForm');
    for (qw/vanStation naarStation/) {
        my ($input) = $form->find_input($_);
        my @pv = $input->possible_values or next;
        my @vn = $input->value_names;
        my $i = 0;
        my %vn = map { $_ => $i++ } @vn;
        my ($name) = grep /centraal/i, @vn;
        my $value = defined $name ? $pv[$vn{$name}] : '';
        $value ||= $pv[0];
        print "selecting $vn[defined $name ? $vn{$name} : 0], ";
        $mech->field($_, $value);
    }
    print "submitting$be";
    $mech->submit;
    print ".\n" unless $batch;
}

my $content = $mech->content;

if ($content =~ /<!-- Error: (.*) -->/) {
    mydie "Error from server: $1\n";
}

print "\n";
print $content =~ /(Reisadvies van .*)/, "\n";
print $content =~ /((?:Vertrek|Aankomst): .*)/, "\n\n";

my $tree = HTML::TreeBuilder->new_from_content($content);

my $table = $tree->look_down(_tag => 'table', class => 'advice');
my @table = 
    grep "@$_" =~ /[^\s\240]/,
    map [ map $_->as_text, $_->look_down(_tag => qr/^t[hd]$/) ],
    $table->look_down(_tag => 'tr');

pop @table;

for (@table) {
    splice @$_, 2, 1;
    splice @$_, 0, 1;
    splice @$_, 1, 0, splice @$_, 2, 1;
}

my $tt = Text::Table->new(@{ shift @table });
$tt->load(@table);
print $tt->title, $tt->body, "\n";

$table = $tree->look_down(_tag => 'table', class => 'prices');
@table = 
    map [ map $_->as_text, $_->look_down(_tag => qr/^t[hd]$/) ],
    $table->look_down(_tag => 'tr');

splice @{ $table[0] }, 2, 0, '';
splice @{ $table[0] }, 4, 0, '';

$tt = Text::Table->new(@{ shift @table });
$tt->load(@table);
print $tt->title, $tt->body;
Replies are listed 'Best First'.
Re: Dutch Railways train schedule information 2.00
by Joost (Canon) on Jul 15, 2004 at 15:03 UTC

      Nice! I would like to see a range of departure-arrival times, though (as they do on the site). Sometimes a slightly earlier or later departure time will get you a much better (shorter) traveling time.

      Patches welcome! (I live next to the train station here, and always just want to know when the next one leaves.)

      Juerd # { site => 'juerd.nl', plp_site => 'plp.juerd.nl', do_not_use => 'spamtrap' }