#!/usr/bin/perl
# This script scrapes co-author info from DBLP for a list of people.
# Revision 1: intended use was to supply an author name on the command line,
#             with a list of co-authors provided as output, one per line.
# Revision 2: methodizes the crawler to provide multi-level crawling.
# Revision 3: encapsulates the crawler in a loop to run against a list of
#             authors taken from the header row of the input CSV.
# Status output is written to STDOUT, errors (such as unfound authors) are
# written to STDERR, and co-author info is appended to the output file
# configured below.
#
# NOTE(review): 'use strict; use warnings;' are not enabled here because the
# remainder of the file is outside this review and may rely on their absence.
# Enable them once the whole file has been checked.

# Import Perl's WWW library for quick & easy web retrieval.
# utf8 allows unicode characters in this script; HTML::Entities provides the
# HTML unicode conversion methods.
use utf8;
use LWP::Simple;
use HTML::Entities;

# Inits; $sleep is the number of seconds to wait between each author crawl.
# %conflicts and $base_url are file-scoped because Crawl() reads them.
my %conflicts = ();
my $base_url  = 'http://www.sigmod.org/dblp/db/indices/a-tree/';
my $sleep     = 5;
my $infile    = 'PapersReviewers.csv';
my $outfile   = 'conflicts.txt';

# 1-based position (in sorted order) of the first reviewer to process.
# Raise it to resume an interrupted run; 0 processes everybody.
my $resume_at = 0;

# DBLP full names for some authors otherwise not found in catalog.
my %fullnames = ('List of names snipped');

# Read the header row of the input data; every column after the first holds
# a reviewer name.
open(my $in, '<', $infile) or die "Can't open $infile: $!";
my $line = <$in>;
close $in;
die "Empty input file." unless defined $line && $line =~ /\S/;
chomp $line;
$line =~ s/\r\z//;    # Tolerate CRLF line endings.

my (undef, @reviewers) = split ',', $line;

# Trim padding around each name and drop empty columns, so that name lookup
# and name formatting see clean values.
for my $entry (@reviewers) {
    $entry =~ s/^\s+//;
    $entry =~ s/\s+$//;
}
@reviewers = grep { length } @reviewers;
my $num_reviewers = scalar @reviewers;

# Make sure the output file is writable before spending time crawling.
# Append mode continues an earlier run; truncate the file by hand to restart.
open(my $probe, '>>', $outfile) or die "Can't open $outfile for appending: $!";
close $probe;

# Disable buffering on STDOUT so the progress log shows up in realtime.
select((select(STDOUT), $|=1)[0]);

# Loop over all reviewers, formatting each name and calling the crawler.
my $count = 0;
foreach my $reviewer (sort @reviewers) {
    # Loop inits; clear the conflicts hash for this reviewer.
    $count++;
    next if $count < $resume_at;    # Skip ahead when resuming.
    %conflicts = ();
    print 'Working on ', $reviewer, ', # ', $count, ' of ', $num_reviewers, '... ';

    # Use alternative DBLP name if it exists. Work on a copy so the loop
    # variable (an alias into the sorted list) is left untouched.
    my $name = $reviewer;
    if (exists $fullnames{$name}) {
        $name = $fullnames{$name};
        print 'using alternative form of name ', $name, "... \n";
    }

    # The name written to the output file is the one actually looked up.
    my $orig_name = $name;

    # Format reviewer name to match DBLP specs: entity-encode, replace every
    # character that is neither a word character nor whitespace with '=', then
    # build 'Last:First_Middle...' with the final token as the surname.
    my $encoded = encode_entities($name);
    $encoded =~ s/[^\w\s]/=/g;
    my @tokens = split ' ', $encoded;    # ' ' also ignores leading whitespace.

    my $formatted;
    if (@tokens >= 2) {
        my $surname = pop @tokens;
        $formatted = $surname . ':' . join('_', @tokens);
    }
    else {
        # Single-token name: keep the historical ':Name' form.
        $formatted = ':' . (defined $tokens[0] ? $tokens[0] : '');
    }

    # Call the crawler method with formatted name.
    #Crawl('Fox:Edward_A=');
    Crawl($formatted);

    # Results of 1st level crawl are crawled in turn (2nd lvl). Snapshot the
    # keys first: %conflicts is the only place the results written below can
    # come from, so Crawl is expected to add to it while we iterate.
    my @first_level = sort keys %conflicts;
    foreach my $conflict (@first_level) {
        Crawl($conflict);
    }

    # Output the results as one line: name, then every conflict value, each
    # followed by a comma.
    open(my $out, '>>', $outfile) or die "Can't open $outfile for appending: $!";
    print {$out} join(',', $orig_name, map { $conflicts{$_} } sort keys %conflicts), ",\n";
    close $out or die "Can't close $outfile: $!";

    # Finished with this reviewer, wait $sleep seconds before starting next.
    print 'done.', "\n";
    sleep $sleep;
}

# Returns a list of co-authors from DBLP.
sub Crawl {
    # Compose author name for retrieval.
    my $name = shift || die "Bad usage of method Crawl.";
    my $category = lc(substr($name, 0, 1));

    # Construct author URL and retrieve summary page.
    # NOTE(review): when get() fails, '|| warn' assigns warn's true return
    # value to $page, so the defined-check below never triggers - confirm and
    # fix together with the rest of this sub.
    my $url = $base_url.$category.'/'.$name.'.html';
    my $page = get($url) || warn "Couldn't get ${url}: $!";
    return () unless defined $page;

    # Find co-authors list at bottom and parse out all names & URLs.
    while ($page =~ m{\d+<\/td> # First two lines match style code.