Re: MedlineParser: to parse and load MEDLINE into a RDBMS

Ah - someone else is interested in this sort of stuff, too!
Here is a perl program that queries the PubMed site and adds it to a BibTeX database. It would be simple enough to add a DBI backend. I also found the source code above turgid. Using better data structures, the following program I wrote several years ago weighs in at 427 lines. If I was rewriting it these days, I would probably add a Perl/Tk interface.
#!/usr/bin/perl -w
#
# mk_pm2bib.pl - save pubmed searches to a bibtex database
#
# This is a heavy reworking of a script originally written by Dave
# Blake and modified by Katrin Schenk.

use strict;
use Getopt::Std;
use Text::Wrap;
use vars qw|$opt_u $opt_a $store_abstract $ans $made_backup|;

getopts('au')  or die "bad options!";
# -u - display usage
# -a - store abstract

usage()    if $opt_u;
$store_abstract = 1 if $opt_a;
$made_backup = 0;

$Text::Wrap::columns = 74;
my $bib_entries = {};
my $bibtex_file = "";
if (($ans = get_action()) eq 'o') { # append to an existing database
   print "Which bibtex file would you like to add entries to? ";
   $bibtex_file = <STDIN>;
   chomp $bibtex_file;
   print "Reading in file $bibtex_file...";
   $bib_entries = parse_bibtex( $bibtex_file);
   print "done.\n";
}
elsif ($ans eq 'c') { # create a new database
   print "Name of the new bibtex file to add entries to? ";
   $bibtex_file = <STDIN>;
   chomp $bibtex_file;
}
else { # quit
   exit(0);
}
# main event loop: query, convert, add, store
while (1) {
   my ($q_response, $data) = query_pubmed();
   last if $q_response =~ /q/i;
   my ($c_response, $pubmed_entries) = convert_pubmed( $data);
   if (scalar keys %$pubmed_entries) {
      print "Merging chosen entries into database...";
      $bib_entries = add_entries( $bib_entries, $pubmed_entries);
      print "done.\nSaving database to $bibtex_file...";
      write_bibtex( $bibtex_file, $bib_entries);
      print "done.\n";
   }
   last if $c_response =~ /q/i;
}
exit(0);

### Subroutines

sub usage
{
   print STDOUT<<"EoF";

usage: mk_pm2bib.pl [-u] [-a]

   -u  display usage
   -a  store abtract, if any, in bibtex file

This script allows users to search the PubMed database, screen the
results, and add desired entries into a new or pre-existing bibtex
file.  When choosing to add entries to a pre-existing bibtex file this
script reads in this file and parses it such that duplicates are not
allowed and bibtex handles are unique.
EoF
   exit(0);
}

# Prompt user for mode of use

sub get_action
{
   my $append;

   do {
      print STDOUT <<"EoF";
What would you like to do?

         (o) Open an existing bibtex file.
         (c) Create a new bibtex file.
         (q))Quit.
EoF
      print "Choice: ";
      $append = <STDIN>;
   } while ($append !~ /([ocq])/);

   return $1;
}

# Read in and parse bibtex file. This parser would fail on a general
# bibtex file, but is good enough for our machine generated entries.
# For entries with identical keys, only last one seen is saved.

sub parse_bibtex
{
   my $bibtex_file = shift;
   my $parse = {};
   my $key = "initial";

   open( BIBTEX, "<$bibtex_file")
      or die "Can't open $bibtex_file for reading: $!\n";

   while (<BIBTEX>) {
      next if /^\s*$/;
      if (/^\@preamble\s*([({]?)(.*?)([)}]?)\s*$/i) { # @preamble
         my $open  = $1;
         my $value = defined $2 ? $2 : "";
         my $close = defined $3 ? $3 : "";
         $parse->{preamble} = $value;
         until (   ($open eq '(' && $close eq ')')
                || ($open eq '{' && $close eq '}'))
         {
            my $line = <BIBTEX>;
            die "Syntax error, runaway preamble: $parse->{preamble}"
               if $line =~ /^\@/;
            $line =~ /^(.*?)([)}]?)\s*$/;
            $value = defined $1 ? $1 : "";
            $close = defined $2 ? $2 : "";
            $parse->{preamble} .= "\n$value";
         }
      }
      elsif (/^\@string\s*([({])\s*(\w+)\s*=\s*(["{]?)(.*?)(["}]?)([)}
+])\s*$/i)
{ # assume a single line string entry
         my $field = $2;
         my $value = defined $4 ? $4 : "";
         $parse->{strings}{$field} = $value;
      }
      elsif (/^\@comment\s*\{(.*?)(\}?)\s*$/i) { # @comment{...}
         my $value = defined $1 ? $1 : "";
         my $close = defined $2 ? $2 : "";
         $parse->{comment}{$key} = $value;
         until ($close eq '}') {
            my $line = <BIBTEX>;
            die "Syntax error, runaway comment: $parse->{comment}{$key
+}"
               if $line =~ /^\@/;
            $line =~ /^(.*?)(\}?)\s*$/;
            $value = defined $1 ? $1 : "";
            $close = defined $2 ? $2 : "";
            $parse->{comment}{$key} .= "\n$value";
         }
      }
      elsif (/^\@(\w+)\s*\{\s*(.*),$/) { # start of regular entry
         $key = $2;
         delete $parse->{$key} if $parse->{$key}{entry_type};
         $parse->{$key}{entry_type} = $1;
      }
      elsif (/^\s*(\w+)\s*=\s*(["{]?)(.*?)(["}]?,?)\s*$/) { # field = 
+value
         my $field = $1;
         my $open  = defined $2 ? $2 : "";
         my $value = defined $3 ? $3 : "";
         my $close = defined $4 ? $4 : "";
         $parse->{$key}{$field} = $value;
         until (   ($open eq '"' && $close =~ /^",?$/)
                || ($open eq '{' && $close =~ /^\},?$/)
                || ($open eq ''  && $close =~ /^,?$/) )
         {
            my $line = <BIBTEX>;
            die "Syntax error, runaway field: $field = $parse->{$key}{
+$field}"
               if $line =~ /^\@/;
            $line =~ /^\s*(.*?)(["}]?,?)\s*$/;
            $value = defined $1 ? $1 : "";
            $close = defined $2 ? $2 : "";
            $parse->{$key}{$field} .= " $value";
         }
         $parse->{$key}{$field} =~ s/^\s*(.+?)\s*$/$1/s;
         $parse->{$key}{$field} =~ s/\s+/ /gs;
      }
      elsif (/^\s*\}\s*$/) { # end of entry
         # do nothing
      }
      else {  # safety check
         die "Could not parse the line\n   $_";
      }
   }
   close BIBTEX;
   return $parse;
}

# query Pubmed and return raw output.

sub query_pubmed
{
   my $search_string = 'wget -O - "http://www.ncbi.nlm.nih.gov:80/entr
+ez/' .
      'query.fcgi?cmd=Search&db=PubMed&dispmax=100&doptcmdl=MEDLINE&te
+rm=';
   my $terms = "";

   print STDOUT<<"EoF";

ENTER SEARCH CRITERIA: A search string is just search terms separated
by spaces. Authors are formatted like "doupe aj". Keyword search is by
default in the whole entry. To search for a keyword in just a title,
qualify it with the location in brackets, e.g., spectrum[TITLE]. Note
that "and" is implictly assumed between search terms.

    Example 1:  Search for papers by both A. J. Doupe and T. Troyer

                Enter: doupe aj troyer t

    Example 2: Search for papers by both A. J. Doupe and T. Troyer wit
+h
               the "birdsong" somewhere in the abstract and/or title.

               Enter: doupe aj troyer t birdsong

    Example 3: Search for papers with "zebra" in the title only.

               Enter: zebra[TI]

You may also quit by entering a single 'q' and carraige return.
EoF
   # Read in search terms and convert spaces to URL encoding
   do {
      print "Search: ";
      $terms = <STDIN>;
   } while ($terms =~ /^\s*$/);

   return ( "q", "") if $terms =~ /^q$/i;
   $terms =~ s/^\s*(.+?)\s*$/$1/s; # trim space
   $terms =~ s/\s+/\%20/g;         # encode spaces

   # Add terms to search string, ignore stderr output
   $search_string .= "$terms \" 2>/dev/null";

   # Execute wget command and capture output
   print "Getting info from Pubmed...";
   my @output = `$search_string`;
   print "done.\n";

   return ( "yes", \@output);
}

# convert raw pubmed output to bibtex format. %map shows what to extra
+ct.

sub convert_pubmed
{
   my $data = shift;
   my $entries = {};
   my $i = 0;   # line index
   my $pm = {}; # pubmed entry info
   my %map = (AB => "abstract", TI => "title", AU => "author", TA => "
+journal",
              VI => "volume", IP => "number", DP => "year", PG => "pag
+es");
   my $choices = join '|', keys %map;
   my $answer = "";
   my $key;

   # extract entries from raw text
 MAINLOOP:
   while ($i < @$data) {
      next unless $data->[$i++] =~ /^PMID-\s*(\d+)\s*$/; # start of en
+try
      my $pmid = $1;
      while ($i < @$data) {
         if ($data->[$i] =~ /^($choices)\s*-\s*(.+?)\s*$/) {
            my $type = $map{$1};
            if ($type eq "author") { # could have multiple authors
               push @{$pm->{$pmid}{author}}, $2;
               $i++;
            }
            else { # single field, possibly multiple lines
               $pm->{$pmid}{$type} = $2;
               while ($data->[++$i] =~ /^\s+( .+)$/) {
                  $pm->{$pmid}{$type} .= $1;
               }
            }
         }
         elsif ($data->[$i] =~ /^SO/) { # end of entry
            $answer = entry_wanted( $pm, $pmid);
            delete $pm->{$pmid}, last MAINLOOP if $answer =~ /q|n/;
            delete $pm->{$pmid} unless $answer == 1;
            $i++;
            last;
         }
         else {
            $i++;
         }
      }
   }

   # convert the selected entries to bibtex format
   foreach my $pmid (keys %$pm) {
      # create a bibtex key
      my $auth = $pm->{$pmid}{author};
      $pm->{$pmid}{year} =~ /((19|20)\d\d)/;
      my $year = $1;
      if ( @{$auth} == 1) { # author0:year
         $auth->[0] =~ /^(\w+)/;
         $key = lc "$1:$year";
      }
      elsif ( @{$auth} == 2) { # author0_author1:year
         $auth->[0] =~ /^(\w+)/;
         $key = lc "$1_";
         $auth->[1] =~ /^(\w+)/;
         $key .= lc "$1:$year";
      }
      else { # author0_etal:year
         $auth->[0] =~ /^(\w+)/;
         $key = lc "$1_etal:$year";
      }
      # populate bibtex entry - assume an article XXX
      $entries->{$key}{entry_type} = "article";
      @$auth = map {my ($sur, $ini) = /^(.+?)\s+(\w+)$/;
                    $ini =~ s/(\w)/$1./g; "$ini $sur";  }  @$auth;
      $entries->{$key}{author} = join " and ", @$auth;
      $entries->{$key}{author} =~ s/^\s*(.+?)\s*$/$1/s;
      $entries->{$key}{author} =~ s/\s+/ /gs;
      foreach my $field (values %map) {
         next if $field eq "author";
         next unless exists $pm->{$pmid}{$field};
         my $value = $pm->{$pmid}{$field};
         $value =~ s/&quot;/"/g;  # decode HTML escapes
         $value =~ s/&gt;/>/g;
         $value =~ s/&lt;/</g;
         $value =~ s/^\s*(.+?)\s*$/$1/s;  # remove extra space
         $value =~ s/\s+/ /gs;
         $value =~ s/\.$//s if $field eq "title";
         $entries->{$key}{$field} = $value;
      }
   }
   return ($answer, $entries);
}

# Does user want to add entry, skip, new search, quit?

sub entry_wanted
{
   my ($pm, $pmid) = @_;
   my $authors = join "  ", @{$pm->{$pmid}{author}};
   my $answer = "";

   do {
      print "\n";
      print wrap( "", " " x 10, "Authors: $authors ($pm->{$pmid}{year}
+).\n");
      print wrap( "", " " x 8,  "Title: $pm->{$pmid}{title}\n");
      print wrap( "", " " x 12,  "Reference: $pm->{$pmid}{journal} ($p
+m->{$pmid}
{volume}):$pm->{$pmid}{pages}.\n");
      print wrap( "", " " x 10, "Abstract: $pm->{$pmid}{abstract}\n")
         if exists $pm->{$pmid}{abstract};
      print STDOUT<<"EoF2";

Now what?
   (a) Aadd entry to bibtex database
   (s) Skip entry.
   (n) New search.
   (q) Quit.
EoF2
      print "Choice: ";
      $answer = <STDIN>;
   } while ($answer !~ /[asqn]/i);

   return $answer if $answer =~ /q|n/i;
   return 1       if $answer =~ /a/i;
   return 0;
}

# merge new entries into existing bib database

sub add_entries
{
   my ($old, $new) = @_;

 new_key:
   foreach my $new_key (keys %$new) {
      my @close_keys = grep {/^$new_key[a-z]?$/} keys %$old;
      unless (@close_keys) { # no collision, just add it
         $old->{$new_key} = $new->{$new_key};
      }
      else { # collision, see if it is a duplicate
      close_key:
         foreach my $c_key ( @close_keys) {
            foreach my $field (keys %{$old->{$c_key}}) {
               next close_key unless exists $new->{$new_key}{$field}
                  && $new->{$new_key}{$field} eq $old->{$c_key}{$field
+};
            }
            next new_key; # found a duplicate, skip it
         } # all @close_keys entries are different, add new key
         my $uniq_key = $new_key . chr(ord('a') + @close_keys - 1);
         $old->{$uniq_key} = $new->{$new_key};
      }
   }
   return $old;
}

# Parse bibtex format and write out to file.

sub write_bibtex
{
   my ($bibtex_file, $bib) = @_;
   my @format = qw|author title journal volume number pages year|;
   push @format, "abstract" if $store_abstract;

   rename $bibtex_file, "$bibtex_file.bak" unless $made_backup;
   $made_backup = 1;

   unless (open BIBOUT, ">$bibtex_file") {
      use File::Copy; # be conservative
      copy( "$bibtex_file.bak", $bibtex_file);
      die "Could not open $bibtex_file for writing: $!";
   }
   print BIBOUT "\@comment\{$bib->{comment}{initial}\}\n\n"
      if exists $bib->{comment}{initial};
   print BIBOUT "\@preamble\{$bib->{preamble}\}\n\n"
      if exists $bib->{preamble};
   if (exists $bib->{strings}) {
      foreach my $field (keys %{$bib->{strings}}) {
         print BIBOUT "\@string\{$field = \"$bib->{strings}{$field}\"\
+}\n\n";
      }
   }
   foreach my $key (sort keys %$bib) {
      next if $key =~ /preamble|strings|comment/;  # treated elsewhere
      print BIBOUT "\@$bib->{$key}{entry_type}\{ $key,\n";
      foreach my $field (@format) {
         next unless exists $bib->{$key}{$field};
         my $text = $bib->{$key}{$field};
         $text = "$field = " .($text =~ /^\d+$/ ? "$text," : "\"$text\
+",");
         print BIBOUT wrap( "   ", " " x (7 + length $field), "$text\n
+");
      }
      print BIBOUT "\}\n\n";
      print BIBOUT "\@comment\{$bib->{comment}{$key}\}\n\n"
         if exists $bib->{comment}{$key};
   }
   close BIBOUT;
}
[download]
-Mark
Comment on Re: MedlineParser: to parse and load MEDLINE into a RDBMS Download Code