#!/usr/bin/perl
#
# parse publications strings
#
# Reads a list of publication entries from the file named as the first
# command line argument, or from STDIN when no argument or "-" is given.
# Entries are separated by empty lines and may be wrapped over several lines;
# lines starting with "#" are skipped.  Every entry is echoed on one line,
# followed by "J:<journal>" and an empty line when its citation was parsed
# completely.
#
# NOTE(review): the variable name $omimf suggests the input is an OMIM
# reference list -- confirm against the data this script is run on.
#

use warnings;
use strict;

use File::Basename;
use Data::Dumper;

# use open IN => ":any", OUT=> ":utf8";

# Keys of the hash returned by parse_pub2().
my $TITLE      = 'title';
my $YEAR       = 'year';
my $START_PAGE = 'start_page';
my $END_PAGE   = 'end_page';
my $JOURNAL    = 'journal';
my $TYPE       = 'type';
my $AUTHORS    = 'authors';
my $VOLUME     = 'volume';

# Parse one publication entry of the form
#
#   <number>. <authors>: <title> [(<type>) ]<journal> <volume>: <first>-<last>, <year>.
#
# where the title ends in ".", "?" or "!" and the journal starts with a
# capital letter followed by at least one more word character.
#
# Parameters:
#   $string - the complete entry on a single line.
#
# Returns a hash as a flat list.  It is empty when the entry does not have
# the overall shape shown above.  Otherwise it contains:
#   authors, title - always;
#   type           - only when a parenthesised type precedes the journal
#                    (the value keeps the trailing space, e.g. "(Letter) ");
#   journal, volume, start_page, end_page, year
#                  - when the tail of the citation could be split up;
#   journal        - alone, holding the whole unparsed tail, when it could not.
#
# The ($) prototype is kept so existing call sites keep parsing the same way.
sub parse_pub2 ($) {
    my $string = shift @_;
    my %ret    = ();

    # The pattern is anchored at the start of the string, so it can match at
    # most once; a plain conditional is all that is needed.
    return %ret
        unless $string
        =~ m/^\d+\. ([^:]+): (.+?[.?!]) (\(\w+.?\) )?(?=[A-Z]\w+[. ])([A-Z].+)$/;

    # Copy the captures right away: the next match below overwrites $1..$4.
    my ($authors, $title, $type, $rest) = ($1, $2, $3, $4);

    $ret{$AUTHORS} = $authors;
    $ret{$TITLE}   = $title;

    # The type group is optional; only report it when it was really there.
    $ret{$TYPE} = $type if defined $type;

    if ($rest =~ m/^([^:]+) ([\w()]+): (\d+)-(\d+), (\d+)\./) {
        $ret{$JOURNAL}    = $1;
        $ret{$VOLUME}     = $2;
        $ret{$START_PAGE} = $3;
        $ret{$END_PAGE}   = $4;
        $ret{$YEAR}       = $5;
    }
    else {
        # Tail not understood: hand it back unsplit as the journal.
        $ret{$JOURNAL} = $rest;
    }

    return %ret;
}

# Echo one joined entry and, when journal, volume, year and first page were
# all recognised, print "J:<journal>" followed by an empty line.
sub _report_entry {
    my ($entry) = @_;

    print "$entry\n";

    my %pub = parse_pub2($entry);
    my @missing =
        grep { !defined $pub{$_} } ($VOLUME, $YEAR, $START_PAGE, $JOURNAL);
    print "J:$pub{$JOURNAL}\n\n" unless @missing;

    return;
}

# Read the input and report every entry.
#
# Parameters:
#   @args - command line arguments; the first one is the input file name,
#           with a missing or empty name and "-" both meaning STDIN.
#
# Returns 0, to be used as the exit status.  Dies when the file cannot be
# opened.
sub main {
    my @args = @_;

    my $omimf = shift @args;
    $omimf = '-' unless defined $omimf && length $omimf;

    # Three-argument open never interprets mode characters in the file
    # name, but it also does not treat "-" as STDIN, so do that here.
    my $in;
    if ($omimf eq '-') {
        $in = \*STDIN;
    }
    else {
        open($in, '<', $omimf) or die "Unable to open '$omimf': $!";
    }

    my @pending;    # lines of the entry being collected
    while (my $line = <$in>) {
        next if $line =~ m/^#/;
        chomp $line;
        $line =~ s/\r$//;

        # Only a really empty line separates entries; a line holding just
        # "0" is content.
        if ($line ne '') {
            push @pending, $line;
            next;
        }

        # Repeated separators must not produce empty entries.
        next unless @pending;

        _report_entry(join ' ', @pending);
        @pending = ();
    }

    # The last entry need not be followed by an empty line.
    _report_entry(join ' ', @pending) if @pending;

    close $in if $omimf ne '-';

    return 0;
}

# Run only when executed as a script, so the parser can be loaded on its own.
exit(main(@ARGV)) unless caller();

1;