#!/usr/bin/perl

# Reads tab-separated author records (name, DBLP entry, home page, email)
# from the files named on the command line or from STDIN.  Records that
# already carry a home page are passed through unchanged.  For every other
# record the author's name is submitted to the DBLP search form; if a DBLP
# entry and a "home page" link are found, the home page is fetched and one
# output row is written per distinct 'mailto' address found on it.
#
# Output (STDOUT) is tab-separated with a header row; progress and errors
# go to STDERR.

use strict;
use warnings;
use diagnostics;

use Set::Scalar;
use WWW::Mechanize;
use URI;

# URL for DBLP search
my $dblpURL = 'http://www.informatik.uni-trier.de/~ley/db/indices/a-tree/';

# Delay time in seconds: each lookup is preceded by a pause of roughly
# $delayMean, jittered by about +/- $delayVar/2.
my ($delayMean, $delayVar) = (5, 3);

# print headers of output
printRow('name', 'DBLP entry', 'home page', 'email');

# iterate over list of author's names
my $counter = 0;

LINE:
while (my $line = <>) {
    $counter++;

    # strip leading blanks and trailing blanks / line terminators
    $line =~ s/\A[ ]+//;
    $line =~ s/[ \r\n]+\z//;

    my ($name, $dblpEntry, $homePage, $email) = split(/\t/, $line);

    # a blank line carries no author; looking it up would be meaningless
    next LINE if !defined $name || $name eq '';

    print STDERR "now handling record $counter: '$name'\n";

    # records that already have a home page are copied through as they are
    if (defined $homePage && $homePage ne '') {
        printRow($name, $dblpEntry, $homePage, $email);
        next LINE;
    }

    # since one should be polite (or stealthy), sleep for a while; this is
    # only done when a request is actually going to be made
    sleep($delayMean + int($delayVar/2 - rand($delayVar)));

    # A failed request must not abort the whole batch: report it and emit
    # the record with empty result columns, so that a later run (which
    # re-processes records without a home page) will try it again.
    my @rows;
    my $ok = eval { @rows = lookupAuthor($name); 1 };
    if (!$ok) {
        my $error = (defined $@ && $@ ne '') ? $@ : 'unknown error';
        $error =~ s/\s+\z//;
        print STDERR "lookup failed for '$name': $error\n";
        @rows = ([$name, '', '', '']);
    }

    printRow(@{$_}) for @rows;
}

# Prints one tab-separated output row terminated by a newline.  Undefined
# fields (e.g. a missing email column in the input) are printed as empty
# strings instead of triggering an "uninitialized value" warning.
sub printRow {
    my @fields = map { defined $_ ? $_ : '' } @_;
    print join("\t", @fields), "\n";
    return;
}

# Looks up one author on DBLP and collects e-mail addresses from the
# author's home page.
#
# Parameter: $name - the author's name as it is typed into the search form.
#
# Returns a list of array references, one per output row, each of the form
# [name, DBLP entry flag, home page URL, email]:
#   - [name, 0, '', '']     if the result page's title lacks the name
#   - [name, 1, '', '']     if the result page has no "home page" link
#   - [name, 1, url, addr]  one row per distinct 'mailto' address
#   - [name, 1, url, '']    if the home page has no 'mailto' link
#
# Dies if a request fails or the search form cannot be found; the caller
# is expected to trap that.
sub lookupAuthor {
    my ($name) = @_;

    # autocheck is requested explicitly so that failed requests throw
    # regardless of the default of the installed WWW::Mechanize version
    my $mech = WWW::Mechanize->new(autocheck => 1);
    $mech->agent_alias('Windows IE 6');

    # get DBLP search page
    $mech->get($dblpURL);

    # insert author's name in search page and submit
    $mech->form_number(1);
    $mech->field('author', $name);
    $mech->submit();

    # if author has a DBLP entry, the resulting page will have his name as
    # title; the name is quoted so that characters such as '.' or '(' in
    # it are matched literally instead of being interpreted as a pattern
    my $title = $mech->title();
    if (!defined $title || $title !~ /\Q$name\E/) {
        return [$name, 0, '', ''];
    }

    # search for a link whose text is "home page" or "homepage" in any
    # capitalisation; find_link() returns undef when there is none, so a
    # missing link is not treated as an error even with autocheck on
    my $homeLink = $mech->find_link(text_regex => qr/\A home [ ]? page \z/xmsi);
    if (!defined $homeLink) {
        return [$name, 1, '', ''];
    }
    $mech->get($homeLink->url_abs());

    # the URL actually reached, i.e. after any redirects
    my $homePage = $mech->uri()->as_string();

    # retrieve all links on the author's home page and keep only those
    # that are 'mailto' URLs, each address once
    my $addresses = Set::Scalar->new();
    my @rows;
    foreach my $link ($mech->links()) {
        my $url = $link->url();
        next if !defined $url;

        my $uri    = URI->new($url);
        my $scheme = $uri->scheme();
        next if !defined $scheme || $scheme ne 'mailto';

        my $address = $uri->opaque();
        next if $addresses->contains($address);

        $addresses->insert($address);
        push @rows, [$name, 1, $homePage, $address];
    }

    # no address found: still report the home page
    push @rows, [$name, 1, $homePage, ''] if !@rows;

    return @rows;
}