in reply to MedlineParser: to parse and load MEDLINE into a RDBMS

Ah - someone else is interested in this sort of stuff, too!

Here is a perl program that queries the PubMed site and adds it to a BibTeX database. It would be simple enough to add a DBI backend. I also found the source code above turgid. Using better data structures, the following program I wrote several years ago weighs in at 427 lines. If I was rewriting it these days, I would probably add a Perl/Tk interface.

#!/usr/bin/perl -w # # mk_pm2bib.pl - save pubmed searches to a bibtex database # # This is a heavy reworking of a script originally written by Dave # Blake and modified by Katrin Schenk. use strict; use Getopt::Std; use Text::Wrap; use vars qw|$opt_u $opt_a $store_abstract $ans $made_backup|; getopts('au') or die "bad options!"; # -u - display usage # -a - store abstract usage() if $opt_u; $store_abstract = 1 if $opt_a; $made_backup = 0; $Text::Wrap::columns = 74; my $bib_entries = {}; my $bibtex_file = ""; if (($ans = get_action()) eq 'o') { # append to an existing database print "Which bibtex file would you like to add entries to? "; $bibtex_file = <STDIN>; chomp $bibtex_file; print "Reading in file $bibtex_file..."; $bib_entries = parse_bibtex( $bibtex_file); print "done.\n"; } elsif ($ans eq 'c') { # create a new database print "Name of the new bibtex file to add entries to? "; $bibtex_file = <STDIN>; chomp $bibtex_file; } else { # quit exit(0); } # main event loop: query, convert, add, store while (1) { my ($q_response, $data) = query_pubmed(); last if $q_response =~ /q/i; my ($c_response, $pubmed_entries) = convert_pubmed( $data); if (scalar keys %$pubmed_entries) { print "Merging chosen entries into database..."; $bib_entries = add_entries( $bib_entries, $pubmed_entries); print "done.\nSaving database to $bibtex_file..."; write_bibtex( $bibtex_file, $bib_entries); print "done.\n"; } last if $c_response =~ /q/i; } exit(0); ### Subroutines sub usage { print STDOUT<<"EoF"; usage: mk_pm2bib.pl [-u] [-a] -u display usage -a store abtract, if any, in bibtex file This script allows users to search the PubMed database, screen the results, and add desired entries into a new or pre-existing bibtex file. When choosing to add entries to a pre-existing bibtex file this script reads in this file and parses it such that duplicates are not allowed and bibtex handles are unique. EoF exit(0); } # Prompt user for mode of use sub get_action { my $append; do { print STDOUT <<"EoF"; What would you like to do? (o) Open an existing bibtex file. (c) Create a new bibtex file. (q))Quit. EoF print "Choice: "; $append = <STDIN>; } while ($append !~ /([ocq])/); return $1; } # Read in and parse bibtex file. This parser would fail on a general # bibtex file, but is good enough for our machine generated entries. # For entries with identical keys, only last one seen is saved. sub parse_bibtex { my $bibtex_file = shift; my $parse = {}; my $key = "initial"; open( BIBTEX, "<$bibtex_file") or die "Can't open $bibtex_file for reading: $!\n"; while (<BIBTEX>) { next if /^\s*$/; if (/^\@preamble\s*([({]?)(.*?)([)}]?)\s*$/i) { # @preamble my $open = $1; my $value = defined $2 ? $2 : ""; my $close = defined $3 ? $3 : ""; $parse->{preamble} = $value; until ( ($open eq '(' && $close eq ')') || ($open eq '{' && $close eq '}')) { my $line = <BIBTEX>; die "Syntax error, runaway preamble: $parse->{preamble}" if $line =~ /^\@/; $line =~ /^(.*?)([)}]?)\s*$/; $value = defined $1 ? $1 : ""; $close = defined $2 ? $2 : ""; $parse->{preamble} .= "\n$value"; } } elsif (/^\@string\s*([({])\s*(\w+)\s*=\s*(["{]?)(.*?)(["}]?)([)} +])\s*$/i) { # assume a single line string entry my $field = $2; my $value = defined $4 ? $4 : ""; $parse->{strings}{$field} = $value; } elsif (/^\@comment\s*\{(.*?)(\}?)\s*$/i) { # @comment{...} my $value = defined $1 ? $1 : ""; my $close = defined $2 ? $2 : ""; $parse->{comment}{$key} = $value; until ($close eq '}') { my $line = <BIBTEX>; die "Syntax error, runaway comment: $parse->{comment}{$key +}" if $line =~ /^\@/; $line =~ /^(.*?)(\}?)\s*$/; $value = defined $1 ? $1 : ""; $close = defined $2 ? $2 : ""; $parse->{comment}{$key} .= "\n$value"; } } elsif (/^\@(\w+)\s*\{\s*(.*),$/) { # start of regular entry $key = $2; delete $parse->{$key} if $parse->{$key}{entry_type}; $parse->{$key}{entry_type} = $1; } elsif (/^\s*(\w+)\s*=\s*(["{]?)(.*?)(["}]?,?)\s*$/) { # field = +value my $field = $1; my $open = defined $2 ? $2 : ""; my $value = defined $3 ? $3 : ""; my $close = defined $4 ? $4 : ""; $parse->{$key}{$field} = $value; until ( ($open eq '"' && $close =~ /^",?$/) || ($open eq '{' && $close =~ /^\},?$/) || ($open eq '' && $close =~ /^,?$/) ) { my $line = <BIBTEX>; die "Syntax error, runaway field: $field = $parse->{$key}{ +$field}" if $line =~ /^\@/; $line =~ /^\s*(.*?)(["}]?,?)\s*$/; $value = defined $1 ? $1 : ""; $close = defined $2 ? $2 : ""; $parse->{$key}{$field} .= " $value"; } $parse->{$key}{$field} =~ s/^\s*(.+?)\s*$/$1/s; $parse->{$key}{$field} =~ s/\s+/ /gs; } elsif (/^\s*\}\s*$/) { # end of entry # do nothing } else { # safety check die "Could not parse the line\n $_"; } } close BIBTEX; return $parse; } # query Pubmed and return raw output. sub query_pubmed { my $search_string = 'wget -O - "http://www.ncbi.nlm.nih.gov:80/entr +ez/' . 'query.fcgi?cmd=Search&db=PubMed&dispmax=100&doptcmdl=MEDLINE&te +rm='; my $terms = ""; print STDOUT<<"EoF"; ENTER SEARCH CRITERIA: A search string is just search terms separated by spaces. Authors are formatted like "doupe aj". Keyword search is by default in the whole entry. To search for a keyword in just a title, qualify it with the location in brackets, e.g., spectrum[TITLE]. Note that "and" is implictly assumed between search terms. Example 1: Search for papers by both A. J. Doupe and T. Troyer Enter: doupe aj troyer t Example 2: Search for papers by both A. J. Doupe and T. Troyer wit +h the "birdsong" somewhere in the abstract and/or title. Enter: doupe aj troyer t birdsong Example 3: Search for papers with "zebra" in the title only. Enter: zebra[TI] You may also quit by entering a single 'q' and carraige return. EoF # Read in search terms and convert spaces to URL encoding do { print "Search: "; $terms = <STDIN>; } while ($terms =~ /^\s*$/); return ( "q", "") if $terms =~ /^q$/i; $terms =~ s/^\s*(.+?)\s*$/$1/s; # trim space $terms =~ s/\s+/\%20/g; # encode spaces # Add terms to search string, ignore stderr output $search_string .= "$terms \" 2>/dev/null"; # Execute wget command and capture output print "Getting info from Pubmed..."; my @output = `$search_string`; print "done.\n"; return ( "yes", \@output); } # convert raw pubmed output to bibtex format. %map shows what to extra +ct. sub convert_pubmed { my $data = shift; my $entries = {}; my $i = 0; # line index my $pm = {}; # pubmed entry info my %map = (AB => "abstract", TI => "title", AU => "author", TA => " +journal", VI => "volume", IP => "number", DP => "year", PG => "pag +es"); my $choices = join '|', keys %map; my $answer = ""; my $key; # extract entries from raw text MAINLOOP: while ($i < @$data) { next unless $data->[$i++] =~ /^PMID-\s*(\d+)\s*$/; # start of en +try my $pmid = $1; while ($i < @$data) { if ($data->[$i] =~ /^($choices)\s*-\s*(.+?)\s*$/) { my $type = $map{$1}; if ($type eq "author") { # could have multiple authors push @{$pm->{$pmid}{author}}, $2; $i++; } else { # single field, possibly multiple lines $pm->{$pmid}{$type} = $2; while ($data->[++$i] =~ /^\s+( .+)$/) { $pm->{$pmid}{$type} .= $1; } } } elsif ($data->[$i] =~ /^SO/) { # end of entry $answer = entry_wanted( $pm, $pmid); delete $pm->{$pmid}, last MAINLOOP if $answer =~ /q|n/; delete $pm->{$pmid} unless $answer == 1; $i++; last; } else { $i++; } } } # convert the selected entries to bibtex format foreach my $pmid (keys %$pm) { # create a bibtex key my $auth = $pm->{$pmid}{author}; $pm->{$pmid}{year} =~ /((19|20)\d\d)/; my $year = $1; if ( @{$auth} == 1) { # author0:year $auth->[0] =~ /^(\w+)/; $key = lc "$1:$year"; } elsif ( @{$auth} == 2) { # author0_author1:year $auth->[0] =~ /^(\w+)/; $key = lc "$1_"; $auth->[1] =~ /^(\w+)/; $key .= lc "$1:$year"; } else { # author0_etal:year $auth->[0] =~ /^(\w+)/; $key = lc "$1_etal:$year"; } # populate bibtex entry - assume an article XXX $entries->{$key}{entry_type} = "article"; @$auth = map {my ($sur, $ini) = /^(.+?)\s+(\w+)$/; $ini =~ s/(\w)/$1./g; "$ini $sur"; } @$auth; $entries->{$key}{author} = join " and ", @$auth; $entries->{$key}{author} =~ s/^\s*(.+?)\s*$/$1/s; $entries->{$key}{author} =~ s/\s+/ /gs; foreach my $field (values %map) { next if $field eq "author"; next unless exists $pm->{$pmid}{$field}; my $value = $pm->{$pmid}{$field}; $value =~ s/&quot;/"/g; # decode HTML escapes $value =~ s/&gt;/>/g; $value =~ s/&lt;/</g; $value =~ s/^\s*(.+?)\s*$/$1/s; # remove extra space $value =~ s/\s+/ /gs; $value =~ s/\.$//s if $field eq "title"; $entries->{$key}{$field} = $value; } } return ($answer, $entries); } # Does user want to add entry, skip, new search, quit? sub entry_wanted { my ($pm, $pmid) = @_; my $authors = join " ", @{$pm->{$pmid}{author}}; my $answer = ""; do { print "\n"; print wrap( "", " " x 10, "Authors: $authors ($pm->{$pmid}{year} +).\n"); print wrap( "", " " x 8, "Title: $pm->{$pmid}{title}\n"); print wrap( "", " " x 12, "Reference: $pm->{$pmid}{journal} ($p +m->{$pmid} {volume}):$pm->{$pmid}{pages}.\n"); print wrap( "", " " x 10, "Abstract: $pm->{$pmid}{abstract}\n") if exists $pm->{$pmid}{abstract}; print STDOUT<<"EoF2"; Now what? (a) Aadd entry to bibtex database (s) Skip entry. (n) New search. (q) Quit. EoF2 print "Choice: "; $answer = <STDIN>; } while ($answer !~ /[asqn]/i); return $answer if $answer =~ /q|n/i; return 1 if $answer =~ /a/i; return 0; } # merge new entries into existing bib database sub add_entries { my ($old, $new) = @_; new_key: foreach my $new_key (keys %$new) { my @close_keys = grep {/^$new_key[a-z]?$/} keys %$old; unless (@close_keys) { # no collision, just add it $old->{$new_key} = $new->{$new_key}; } else { # collision, see if it is a duplicate close_key: foreach my $c_key ( @close_keys) { foreach my $field (keys %{$old->{$c_key}}) { next close_key unless exists $new->{$new_key}{$field} && $new->{$new_key}{$field} eq $old->{$c_key}{$field +}; } next new_key; # found a duplicate, skip it } # all @close_keys entries are different, add new key my $uniq_key = $new_key . chr(ord('a') + @close_keys - 1); $old->{$uniq_key} = $new->{$new_key}; } } return $old; } # Parse bibtex format and write out to file. sub write_bibtex { my ($bibtex_file, $bib) = @_; my @format = qw|author title journal volume number pages year|; push @format, "abstract" if $store_abstract; rename $bibtex_file, "$bibtex_file.bak" unless $made_backup; $made_backup = 1; unless (open BIBOUT, ">$bibtex_file") { use File::Copy; # be conservative copy( "$bibtex_file.bak", $bibtex_file); die "Could not open $bibtex_file for writing: $!"; } print BIBOUT "\@comment\{$bib->{comment}{initial}\}\n\n" if exists $bib->{comment}{initial}; print BIBOUT "\@preamble\{$bib->{preamble}\}\n\n" if exists $bib->{preamble}; if (exists $bib->{strings}) { foreach my $field (keys %{$bib->{strings}}) { print BIBOUT "\@string\{$field = \"$bib->{strings}{$field}\"\ +}\n\n"; } } foreach my $key (sort keys %$bib) { next if $key =~ /preamble|strings|comment/; # treated elsewhere print BIBOUT "\@$bib->{$key}{entry_type}\{ $key,\n"; foreach my $field (@format) { next unless exists $bib->{$key}{$field}; my $text = $bib->{$key}{$field}; $text = "$field = " .($text =~ /^\d+$/ ? "$text," : "\"$text\ +","); print BIBOUT wrap( " ", " " x (7 + length $field), "$text\n +"); } print BIBOUT "\}\n\n"; print BIBOUT "\@comment\{$bib->{comment}{$key}\}\n\n" if exists $bib->{comment}{$key}; } close BIBOUT; }

-Mark