#!/usr/bin/perl

# This script scrapes co-author info for a specific person from DBLP.
# Revision 1 intended use is to supply author name on cmd line,
#   with a list of co-authors provided as output, one per line.
# Revision 2 methodizes the crawler to provide multi-lvl crawling.
# Revision 3 encapsulates the crawler in a loop to run against a list of authors.
# Status output is written to STDOUT, errors (such as unfound authors) written
#   to STDERR, and coauthor info is written to the file specified below.

# Import Perl's WWW library for quick & easy web retrieval.
#    utf8 allows unicode char in this script, and also import HTML unicode conversion methods.
use utf8;
use LWP::Simple;
use HTML::Entities;

# Inits; $sleep indicates time (in seconds) to wait between each author crawl.
# %conflicts is the shared accumulator filled by Crawl() and emptied by the
# main loop before each reviewer.
my %conflicts = ();
my $base_url  = 'http://www.sigmod.org/dblp/db/indices/a-tree/';
my $sleep     = 5;
my $infile    = 'PapersReviewers.csv';
my $outfile   = 'conflicts.txt';

# DBLP full names for some authors otherwise not found in catalog.
# NOTE(review): the single string below looks like a placeholder left where
#   the real list was removed; the main loop uses this hash as a map from
#   reviewer name (as written in the input file) to the DBLP form of the name.
my %fullnames = ('List of names snipped');

# Open the input data and read its first line, which carries the reviewer
# names.  The line must come from the file handle: a bare `shift` at file
# scope would pull from @ARGV and never touch the file.
open my $in, '<', $infile or die "Can't open $infile: $!";
my $line = <$in>;
close $in;
die "Empty input file." unless defined $line && $line =~ /\S/;

# Strip the line terminator, tolerating CRLF so that the last reviewer name
# does not keep a stray carriage return.
$line =~ s/\r?\n\z//;

# The first comma-separated field is discarded; the remaining fields are the
# reviewer names.
my (undef, @reviewers) = split ',', $line;
my $num_reviewers = scalar @reviewers;

# Make sure the output file exists and is writable before any crawling starts.
# Append mode keeps results from earlier runs; use '>' here to start over.
open my $init, '>>', $outfile or die "Can't open $outfile for append: $!";
close $init or die "Can't close $outfile: $!";

# Disable buffering on STDOUT (the currently selected handle) so the progress
# log shows up in real time.
$| = 1;

# Loop over all reviewers (in sorted order), formatting each name as a DBLP
# page key, crawling two levels of co-authors, and appending one result line
# per reviewer to $outfile.
my $count = 0;
foreach my $reviewer (sort @reviewers)
{
   # Loop inits; clear the conflicts hash.
   $count++;

   # Manual resume hook: with a threshold of 0 nothing is ever skipped; raise
   # it to skip reviewers already handled by an earlier, interrupted run.
   next if ($count < 0);
   %conflicts = ();
   print 'Working on ', $reviewer, ', # ', $count, ' of ', $num_reviewers, '...  ';

   # Use alternative DBLP name if it exists.
   if (exists $fullnames{$reviewer})
   {
      $reviewer = $fullnames{$reviewer};
      print 'using alternative form of name ', $reviewer, '... ';
   }

   # Format reviewer name to match DBLP specs: HTML-encode it, then turn every
   # character that is neither a word character nor whitespace into '='.
   my $orig_name = $reviewer;
   $reviewer = encode_entities($reviewer);
   $reviewer =~ s/[^\w\s]/=/g;

   # split ' ' ignores leading whitespace, so a name written as " First Last"
   # does not produce an empty leading token.
   my @tokens = split ' ', $reviewer;
   unless (@tokens)
   {
      warn "Skipping blank reviewer name (entry # $count).\n";
      print 'skipped.', "\n";
      next;
   }

   # The key is Last:First_Middle...; the final token is taken as the surname
   # and every preceding token is kept, joined by '_'.  Two- and three-word
   # names yield the same keys as before (Last:First, Last:First_Middle).
   my $surname   = pop @tokens;
   my $formatted = $surname.':'.join('_', @tokens);

   # Call the crawler method with formatted name.
   #Crawl('Fox:Edward_A=');
   Crawl($formatted);

   # Results of 1st level crawl are crawled in turn (2nd lvl).  The key list
   # is snapshotted first because Crawl() keeps adding to %conflicts.
   my @first_level = sort keys %conflicts;
   foreach my $conflict (@first_level)
   {  Crawl($conflict);  }

   # Output the results: reviewer name, then each co-author name, every field
   # followed by a comma, one reviewer per line.
   # NOTE(review): co-author names come from decode_entities and may contain
   #   non-ASCII characters; no encoding layer is set on this handle -- confirm
   #   the intended output encoding.
   open my $out, '>>', $outfile or die "Can't open $outfile for append: $!";
   print {$out} join(',', $orig_name, map { $conflicts{$_} } sort keys %conflicts), ",\n";
   close $out or die "Can't close $outfile: $!";

   # Finished with this $reviewer, wait $sleep seconds before starting next.
   print 'done.', "\n";
   sleep $sleep;
}

# Fetches one author's DBLP summary page and records every co-author found on
# it in the file-level %conflicts hash (DBLP page key => display name).
#
# Parameters:
#   $name - DBLP-formatted author key, e.g. 'Fox:Edward_A='; dies if missing.
# Returns:
#   0 once a page has been fetched and scanned; an empty list, after a
#   warning on STDERR, when the page could not be retrieved.
sub Crawl
{
  # Compose author name for retrieval; pages are grouped in directories named
  # after the lower-cased first character of the key.
  my $name     = shift || die "Bad usage of method Crawl.";
  my $category = lc(substr($name, 0, 1));

  # Construct author URL and retrieve summary page.  get() returns undef on
  # failure, so test for that explicitly: chaining `|| warn` would store
  # warn's return value in $page and defeat the check.  $! is not reported
  # because get() does not set it.
  my $url  = $base_url.$category.'/'.$name.'.html';
  my $page = get($url);
  unless (defined $page)
  {
     warn "Couldn't get ${url}";
     return ();
  }

  # Find co-authors list at bottom and parse out all names & URLs.
  while ($page =~ m{\d+<\/td>    # First two lines match style code.
                  <td\salign="right"\sbgcolor="[^"]+">
                  <a\shref="([^"]+)">            # Matches relative link to coauthor page.
                  ([^>]+)<\/a>                      # Matches co-author name.
                  }mgx)
  {
     my ($href, $coauth_name) = ($1, $2);

     # The co-author key is the file name of the relative link (its last path
     # segment) without the literal ".html" suffix.
     my $coauth = (split '/', $href)[-1];
     next unless defined $coauth;
     $coauth =~ s/\.html\z//;
     next if $coauth eq '';

     # Save this co-author under a human-readable (entity-decoded) name.
     $conflicts{$coauth} = decode_entities($coauth_name);
  }

  return 0;
}