# is_ms_num( $string )
#
# Checks whether $string is laid out like a manuscript number, looking at
# it one character at a time:
#   chars 0-1    two letters (either case)
#   chars 2-3    a '0' followed by a digit
#   chars 4-7    four digits
#   char  8      PR's checksum character (not validated here)
#   chars 9-14   only allowed when chars 0-1 are 'JA'; must then be -NN-NN
# Characters past index 14 are not examined.
#
# Returns the chomped string when it passes, undef otherwise.
sub is_ms_num {
  my $string = shift;
  return undef unless defined $string;

  # Strip the line ending *before* splitting, so that a trailing newline is
  # not mistaken for an extra character at position 9.
  chomp($string);
  my @test = split //, $string;

  # field separator, normally ' ', modified for slice comparison.
  # see perldoc perlvar.
  # Slices that run past the end of a short string contain undef, so
  # warnings are silenced too.  Both variables are localized so that every
  # return path (not just the successful one) puts the caller's values back.
  local $" = '';
  local $^W = 0;

  # check to see if we have the two letter designation
  return undef if "@test[0,1]" !~ /[A-Za-z][A-Za-z]/;

  # the next two must be a '0' followed by a digit
  return undef if "@test[2,3]" !~ /0[0-9]/;

  # the next four are the actual manuscript number, and should be numeric
  return undef if "@test[4,5,6,7]" !~ /[0-9]{4}/;

  # char 8 is PR's checksum

  my $is_jacs   = "@test[0,1]" =~ /JA/;
  my $has_extra = defined $test[9];    # defined, not true: it may be '0'

  # JACS is allowed to have an additional six chars, but nobody else...
  return undef if $has_extra and not $is_jacs;

  # verify that the JACS additional chars are kosher
  return undef
    if $has_extra
    and $is_jacs
    and "@test[9,10,11,12,13,14]" !~ /-[0-9]{2}-[0-9]{2}/;

  # looks like it's good, return it.
  return $string;
}

##

##

# atomize( $string )
#
# Splits $string on single spaces into words ("atoms") and builds
# "molecules" by gluing each word to its immediate neighbours with nothing
# in between.  One molecule is produced for every word except the last:
# the first is word 0 . word 1, every later one is
# word i-1 . word i . word i+1.
#
# Returns two array references: (\@atoms, \@molecules).
sub atomize {
  my $string = shift;
  $string = '' unless defined $string;

  my @atoms = split / /, $string;

  # the most we are going to combine is word -1, word, and word +1.

  my @molecules;

  for my $i ( 0 .. $#atoms - 1 ) {
    # There is no word before the first one.  Asking for $atoms[$i-1] at
    # $i == 0 would be $atoms[-1], which Perl resolves to the *last* word,
    # so the left neighbour is simply left out there.
    my @parts = $i > 0
      ? @atoms[ $i - 1, $i, $i + 1 ]
      : @atoms[ $i, $i + 1 ];

    # join() instead of fiddling with $" keeps the global list separator
    # untouched.
    push @molecules, join( '', @parts );
  }

  return (\@atoms, \@molecules);
}

# search_ms_nums( @words )
#
# Walks the given words in order and hands each one to is_ms_num().
# Returns the first word that is_ms_num() accepts (the word itself, exactly
# as it was passed in), or undef when none is accepted.
sub search_ms_nums {
  my @candidates = @_;

  while (@candidates) {
    my $candidate = shift @candidates;
    next unless is_ms_num( $candidate );
    return $candidate;
  }

  # nothing qualified
  return undef;
}

##

##

      # Only Subject headers are searched for a manuscript number.
      if ($header =~ /^[Ss]ubject/) {
        my ($atoms, $molecules) = atomize( $header );

        # Try the individual words first; fall back to the glued-together
        # neighbours only when no single word qualifies.
        $key = search_ms_nums(@$atoms);
        $key = search_ms_nums(@$molecules) unless $key;
      }

##

##

# is_ms_num_re( $rval )
#
# Regex-based manuscript number check.  $rval passes when it contains,
# anywhere, one of the journal codes IC, JA, CM, OL or OM followed by six
# digits and one more character that is a word character, '+' or '-'.
# An optional "-N-N" suffix (up to three digits per group) may follow;
# because the pattern is not anchored, the suffix never changes the outcome.
# NOTE(review): confirm whether the pattern was meant to be anchored.
#
# Returns $rval unchanged on a match, undef otherwise.
sub is_ms_num_re {
  my $rval = shift;
  return undef unless defined $rval;

  # {0,3} rather than {,3}: an empty lower bound is only understood as a
  # quantifier by perl 5.34 and later; older perls take "{,3}" literally.
  return $rval
    if $rval =~ /(?:IC|JA|CM|OL|OM)\d{6}[\w+-](?:-\d{0,3}-\d{0,3})?/;

  return undef;
}