#!/usr/bin/perl
use LWP::Simple;
use HTML::TokeParser::Simple;
use Data::Dumper;
use strict;
use warnings;
# Stage 1: fetch the staff index page and collect one { name, url } record for
# every anchor whose href contains "/cv/".
my $start = 'http://155.69.224.75:8000/eeepeople/AcadStaff.asp';
my $file = './index.html';

# mirror() returns the HTTP status code: 2xx means a fresh copy was written,
# 304 means the existing local copy is still current, anything else failed.
my $index_status = LWP::Simple::mirror($start, $file);
unless (($index_status >= 200 and $index_status < 300) or $index_status == 304) {
    # Without any local copy there is nothing to parse, so stop with a clear
    # message; with a stale copy, keep going on a best-effort basis.
    die "Cannot fetch $start (HTTP status $index_status) and no local copy at $file\n"
        unless -e $file;
    warn "Cannot refresh $start (HTTP status $index_status); using existing $file\n";
}

# The constructor yields a false value when the file cannot be opened; fail
# here rather than on the first get_token call.
my $p = HTML::TokeParser::Simple->new($file)
    or die "Cannot open $file for parsing: $!\n";

# Two-state scanner: 0 = looking for a CV link, 1 = inside a CV link and
# accumulating its text.
my $state = 0;
my ($url, $name, @teachers) = ('', '');
while (my $t = $p->get_token) {
    if ($state == 0) {
        next unless $t->is_start_tag('a');
        my $attr = $t->return_attr;
        if (exists $attr->{href} and $attr->{href} =~ m{/cv/}) {
            # NOTE(review): the href is stored exactly as written in the page.
            # If the page uses relative links they would need resolving
            # against $start before they can be fetched -- confirm.
            $url = $attr->{href};
            $state = 1;
        }
    }
    elsif ($t->is_end_tag('a')) {
        # Collapse runs of whitespace (including newlines from the markup)
        # and trim both ends so the name is usable for display and filenames.
        (my $clean_name = $name) =~ s/\s+/ /g;
        $clean_name =~ s/^ //;
        $clean_name =~ s/ $//;
        push @teachers, {
            name => $clean_name,
            url  => $url,
        };
        ($url, $name) = ('', '');
        $state = 0;
    }
    elsif ($t->is_text) {
        # A link's text may arrive as several text tokens (e.g. around inline
        # tags), so append rather than assign.
        $name .= $t->as_is;
    }
}
print Dumper(\@teachers), "\n";
# Stage 2: for each collected teacher, mirror the CV page to a local file
# named after the teacher, extract the publication list items and print them.
foreach my $teacher (@teachers) {
    my $teacher_name = defined $teacher->{name} ? $teacher->{name} : '';

    # Build the local cache filename from the name.  The name comes from
    # remote HTML, so neutralise path separators and NUL bytes to keep the
    # file inside the current directory.
    my $filename = lc $teacher_name;
    $filename =~ s/\s+/_/g;
    $filename =~ s{[/\\\0]}{_}g;
    if ($filename eq '') {
        # Every nameless entry would share the same cache file, so skip it.
        warn "Skipping $teacher->{url}: no name to derive a filename from\n";
        next;
    }
    $filename .= '.html';

    # mirror() returns the HTTP status code: 2xx = downloaded, 304 = local
    # copy still current.  On failure fall back to an older copy if there is
    # one, otherwise skip this teacher instead of aborting the whole run.
    my $status = LWP::Simple::mirror($teacher->{url}, $filename);
    unless (($status >= 200 and $status < 300) or $status == 304) {
        warn "Cannot fetch $teacher->{url} (HTTP status $status)\n";
        next unless -e $filename;
    }

    my $p = HTML::TokeParser::Simple->new($filename);
    unless ($p) {
        warn "Cannot open $filename for parsing: $!\n";
        next;
    }

    my @publications = _collect_publications($p);
    print $teacher_name ." published:\n";
    print Dumper(\@publications), "\n";
}

# Walk the token stream of one CV page and return the text of every list item
# that follows a text token mentioning "publication" (case-insensitive).
#
# States:
#   heading - waiting for text that mentions "publication"
#   list    - inside the publication section, waiting for the next <li>
#   item    - inside an <li>, accumulating its text
#
# Takes an HTML::TokeParser::Simple parser; returns a list of strings.
sub _collect_publications {
    my ($parser) = @_;

    my $state = 'heading';
    my $pub = '';
    my @publications;

    while (my $t = $parser->get_token) {
        if ($state eq 'heading') {
            $state = 'list' if $t->is_text and $t->as_is =~ /publication/i;
        }
        elsif ($state eq 'list') {
            if ($t->is_start_tag('li')) {
                $state = 'item';
            }
            elsif ($t->is_end_tag('ul') or $t->is_end_tag('ol')) {
                # End of the list: go back to looking for another heading.
                $state = 'heading';
            }
        }
        else {
            if ($t->is_text) {
                $pub .= $t->as_is;
            }
            elsif ($t->is_start_tag('li')) {
                # </li> is optional in HTML, so a new <li> implicitly closes
                # the current item.
                push @publications, $pub;
                $pub = '';
            }
            elsif ($t->is_end_tag('li')) {
                push @publications, $pub;
                $pub = '';
                $state = 'list';
            }
            elsif ($t->is_end_tag('ul') or $t->is_end_tag('ol')) {
                # The list closed without an explicit </li> for the last item.
                push @publications, $pub;
                $pub = '';
                $state = 'heading';
            }
        }
    }

    # The document ended in the middle of an item: keep what was collected.
    push @publications, $pub if $state eq 'item';

    return @publications;
}