# is_ms_num( $string )
#
# Decide whether $string looks like a manuscript number.  Returns the
# string with its trailing line ending removed when it does, and undef
# when it does not (or when no string was passed).
#
# Layout that is checked (0-based character positions):
#   0-1    two ASCII letters, the journal designation
#   2-3    "0" followed by one digit
#   4-7    four digits, the actual manuscript number
#   8      PR's checksum character -- not validated, and may be absent
#   9-14   only permitted when the designation is exactly "JA" (JACS),
#          in which case they must read -NN-NN
# Anything from position 15 onwards is ignored.
#
# The check is done with anchored patterns instead of interpolated array
# slices, so neither $" nor $^W is modified; a "no match" return can
# therefore never leave those globals changed for the caller.
sub is_ms_num {
    my $string = shift;
    return undef unless defined $string;

    # Remove the line ending *before* looking at character positions, so
    # that a trailing newline is not mistaken for an extra character at
    # position 9.
    chomp $string;

    # Designation, "0N" group and four-digit number.  $rest receives
    # everything that follows (checksum character onwards) and may be
    # empty.  /s lets '.' cover a newline embedded in the string.
    my ($journal, $rest) =
        $string =~ /\A([A-Za-z]{2})0[0-9][0-9]{4}(.*)\z/s;
    return undef unless defined $journal;

    # Nothing beyond the (optional) checksum character: accept.
    return $string if length($rest) <= 1;

    # JACS is allowed to have additional characters, but nobody else.
    # This tests for presence rather than truth, because the extra
    # character might very well be "0".
    return undef if $journal ne 'JA';

    # Verify that the JACS additional characters are kosher: -NN-NN
    # directly after the checksum character.
    return undef if $rest !~ /\A.-[0-9]{2}-[0-9]{2}/s;

    # Looks like it's good, return it.
    return $string;
}
####
# atomize( $string )
#
# Split $string on single spaces into "atoms" and build "molecules" by
# gluing neighbouring atoms together with no separator.  Returns two array
# references: (\@atoms, \@molecules).
#
# One molecule is produced for every atom except the last; it combines, at
# most, word -1, word, and word +1.  The first atom has no predecessor, so
# its molecule is just the first two atoms.  (A literal index of -1 would
# wrap round to the *last* atom, which is not a neighbour.)
#
# NOTE(review): the last atom is never used as the centre of a window, so
# the final two atoms on their own are never combined.  That is unchanged
# from the previous behaviour; confirm it is intended.
sub atomize {
    my $string = shift;
    $string = '' unless defined $string;

    my @atoms = split / /, $string;
    my @molecules;

    for my $i (0 .. $#atoms - 1) {
        # Clamp the window start so it never goes negative.
        my $start = $i > 0 ? $i - 1 : 0;

        # join '' keeps the list separator $" out of the picture entirely.
        push @molecules, join '', @atoms[ $start .. $i + 1 ];
    }

    return (\@atoms, \@molecules);
}
# search_ms_nums( @candidates )
#
# Walk the candidates in order and hand back the first one that
# is_ms_num() accepts.  The candidate is returned exactly as it was passed
# in.  Returns undef when none of them qualifies.
sub search_ms_nums {
    for my $candidate (@_) {
        next unless is_ms_num($candidate);
        return $candidate;
    }
    return undef;
}
##
##
# Only a header that starts with "Subject" / "subject" is searched.
if ($header =~ /^[Ss]ubject/) {
    # atomize() returns two array refs: the individual words and the
    # strings made by joining neighbouring words.
    my ($single_words, $joined_words) = atomize($header);

    # Prefer a hit among the individual words; fall back to the joined
    # strings only when that search comes up empty.
    $key = search_ms_nums(@$single_words);
    $key = search_ms_nums(@$joined_words) unless $key;
}
##
##
# is_ms_num_re( $rval )
#
# Single-pattern manuscript number check.  Returns $rval unchanged when the
# whole string is a manuscript number, undef otherwise (including when no
# value was passed).
#
# Accepted form:
#   - one of the journal codes IC, JA, CM, OL, OM
#   - six digits
#   - one checksum character (word character, "+" or "-")
#   - optionally a "-N-N" suffix, each N being up to three digits
#   - optionally one trailing newline
#
# The pattern is anchored at both ends; without anchors the optional
# suffix could never influence the result and any string that merely
# contained a manuscript number would be accepted.  The digit counts are
# written {0,3} because the {,3} spelling is only understood as a
# quantifier by newer perls and is taken literally by older ones.
sub is_ms_num_re {
    my $rval = shift;
    return undef unless defined $rval;

    if ($rval =~ /\A(?:IC|JA|CM|OL|OM)\d{6}[\w+-](?:-\d{0,3}-\d{0,3})?\n?\z/) {
        return $rval;
    }

    return undef;
}