110. Wunder, E.; Burghardt, U.; Lang, B.; Hamilton, L.: Fanconi's anemia: anomaly of enzyme passage through the nuclear membrane? Anomalous intracellular distribution of topoisomerase activity in placental extracts in a case of Fanconi's anemia. Hum. Genet. 58: 149-155, 1981.And I'm trying to seperate the journal name, which in this case is 'Hum. Genet.'.
#!/usr/bin/perl # # parse publications strings # use warnings; use strict; use File::Basename; use Data::Dumper; # use open IN => ":any", OUT=> ":utf8"; my $TITLE = 'title'; my $YEAR = 'year'; my $START_PAGE = 'start_page'; my $END_PAGE = 'end_page'; my $JOURNAL = 'journal'; my $TYPE = 'type'; my $AUTHORS = 'authors'; my $VOLUME = 'volume'; sub parse_pub2 ($) { my $string = shift @_; local $_; my %ret = (); # use re "debug"; # pos($string) = 0; # if ($string =~ m/^\d+\.([^:]+): ?((?:[^.]+\([^\)]+\)[^.?!]+|[^.?!] ++)[.?!]) ?(\([\w.]+\))? (.+)$/i) { # if ($string =~ m/^([^:]+): (.+?[.?!]) (\(\w+\.?\) )?([A-Z](?=\w*[. + ]).+)$/i) { # while ($string =~ m/\G^\d+\. ([^:]+): (.+?[.?!]) (\(\w+.?\) )?(?=[ +A-Z]\w+[. ])([A-Z].+)$/g) { while ($string =~ m/^\d+\. ([^:]+): (.+?[.?!]) (\(\w+.?\) )?(?=[A-Z] +\w+[. ])([A-Z].+)$/g) { my $authors = $1; $ret{$TITLE} = $2; if ($4) { $ret{$TYPE} = $3; $_ = $4; } else { $_ = $3; } if (m/^([^:]+) ([\w()]+): (\d+)-(\d+), (\d+)\./) { $ret{$JOURNAL}=$1; $ret{$VOLUME}=$2; $ret{$START_PAGE}=$3; $ret{$END_PAGE}=$4; $ret{$YEAR}=$5; my @array = split (/ /,$ret{$JOURNAL}); # last if (10 > scalar(@{[split (/ /,$ret{$JOURNAL})]})); last if (scalar(@array) < 10); } else { $ret{$JOURNAL}=$_; last; } } return %ret; } my $omimf = shift @ARGV || "-"; open (INF,"$omimf") or die "Unable to open '$omimf': $!"; my $i = 1; # line number my $space = 0; # was last line space my $extra = ""; # some entries are in multiple lines while (<INF>) { m/^#/ && next; chomp; s/\r$//; if (!$_) { $space = 1; } else { $space = 0; } if ($space && $extra) { chop ($extra); print "$extra\n"; my %pub = parse_pub2($extra); #print Dumper(%pub); if (defined($pub{$VOLUME}) && defined($pub{$YEAR}) && defined($pub{$START_PAGE}) && defined($pub{$JOURNAL})) { #print "${pub{$JOURNAL}}[JO] AND ${pub{$YEAR}}[DP] AND ", # "${pub{$VOLUME}}[VI] AND ${pub{$START_PAGE}}[PG]\n"; print "J:$pub{$JOURNAL}\n\n"; } else { #print "NOT PMID($i): "; #foreach (sort keys %pub) { printf "$_ -> %s,",defined($pub{$_}) + ? $pub{$_} : "undef" } #print "\n"; } $extra = ""; } else { $extra .= "$_ "; } } #continue { printf ("\r%d", $i++); } print "\n"; continue { $i++ } exit;
#!/usr/bin/perl # # parse publications strings # use warnings; use strict; use Data::Dumper; my $TITLE = 'title'; my $YEAR = 'year'; my $START_PAGE = 'start_page'; my $END_PAGE = 'end_page'; my $JOURNAL = 'journal'; my $TYPE = 'type'; my $AUTHORS = 'authors'; my $VOLUME = 'volume'; sub parse_pub ($) { my $string = shift @_; local $_; my %ret = (); # pos($string) = 0; # if ($string =~ m/^\d+\.([^:]+): ?((?:[^.]+\([^\)]+\)[^.?!]+|[^.?!] ++)[.?!]) ?(\([\w.]+\))? (.+)$/i) { # if ($string =~ m/^([^:]+): (.+?[.?!]) (\(\w+\.?\) )?([A-Z](?=\w*[. + ]).+)$/i) { # while ($string =~ m/\G^\d+\. ([^:]+): (.+?[.?!]) (\(\w+.?\) )?(?=[ +A-Z]\w+[. ])([A-Z].+)$/g) { while ($string =~ m/^\d+\. ([^:]+): (.+?[.?!]) (\(\w+.?\) )?(?=[A-Z] +\w+[. ])([A-Z].+)$/g) { my $authors = $1; $ret{$TITLE} = $2; if ($4) { $ret{$TYPE} = $3; $_ = $4; } else { $_ = $3; } if (m/^([^:]+) ([\w()]+): (\d+)-(\d+), (\d+)\./) { $ret{$JOURNAL}=$1; $ret{$VOLUME}=$2; $ret{$START_PAGE}=$3; $ret{$END_PAGE}=$4; $ret{$YEAR}=$5; my @array = split (/ /,$ret{$JOURNAL}); # last if (10 > scalar(@{[split (/ /,$ret{$JOURNAL})]})); last if (scalar(@array) < 10); } else { last; } } return %ret; } my $line = "110. Wunder, E.; Burghardt, U.; Lang, B.; Hamilton, L.: Fa +nconi's anemia: anomaly of enzyme passage through the nuclear membran +e? Anomalous intracellular distribution of topoisomerase activity in +placental extracts in a case of Fanconi's anemia. Hum. Genet. 58: 149 +-155, 1981."; print "$line\n"; my %pub = parse_pub($line); #print Dumper(%pub); print "J:$pub{$JOURNAL}\n\n"; exit;
What I would want it to print out is110. Wunder, E.; Burghardt, U.; Lang, B.; Hamilton, L.: Fanconi's anem +ia: anomaly of enzyme passage through the nuclear membrane? Anomalous + intracellular distribution of topoisomerase activity in placental ex +tracts in a case of Fanconi's anemia. Hum. Genet. 58: 149-155, 1981. J:Anomalous intracellular distribution of topoisomerase activity in pl +acental extracts in a case of Fanconi's anemia. Hum. Genet.
Now obviously as I've set it to find shortest string ending with one of '.!?' the match is correct. However later I test if the journal name (comes from second pattern match) is longer than 10 words. If it is 10 or more, I would like it to continue from where it left off and try to find more. I tried with while(m//g) {}, but no luck. Also using \G I failed. So any idea how to do this?110. Wunder, E.; Burghardt, U.; Lang, B.; Hamilton, L.: Fanconi's anem +ia: anomaly of enzyme passage through the nuclear membrane? Anomalous + intracellular distribution of topoisomerase activity in placental ex +tracts in a case of Fanconi's anemia. Hum. Genet. 58: 149-155, 1981. J:Hum. Genet.
In reply to Enforcing growth of regex by Hena
| For: | Use: | ||
| & | & | ||
| < | < | ||
| > | > | ||
| [ | [ | ||
| ] | ] |