#!/usr/bin/perl
use strict;
use warnings;

# Extracts every capitalised word from a sample string and prints the words
# space-separated on a single line.

# capitalized_words($text)
#
# Returns, in order of appearance, every substring of $text that begins at a
# word boundary with one uppercase letter A-Z followed by zero or more
# lowercase letters a-z.
#
# The pattern has no trailing boundary, so a match stops at the first
# character that is not a-z: "McDonald" yields "Mc" and "ABC" yields "A".
#
# Returns an empty list when $text is undefined or contains no match.
sub capitalized_words {
    my ($text) = @_;
    return () unless defined $text;

    # A /g match in list context returns every capture in one pass.
    my @words = $text =~ /\b([A-Z][a-z]*)/g;
    return @words;
}

# Runs the demonstration on the sample string and prints the result.
sub main {
    my $data = "Antler embedded in mound at South Street, Avebury, Wiltshire, England. Comment (lab): Collagen fraction used";
    my @result = capitalized_words($data);
    print join(' ', @result), "\n";
    return;
}

# Run only when executed as a script, so the file can also be loaded
# (require/do) for its subs without printing anything.
main() unless caller;

####
# The regular expression:
#
# (?-imsx:\b([A-Z][a-z]*))
#
# matches as follows:
#
# NODE                     EXPLANATION
# ----------------------------------------------------------------------
# (?-imsx:                 group, but do not capture (case-sensitive)
#                          (with ^ and $ matching normally) (with . not
#                          matching \n) (matching whitespace and #
#                          normally):
# ----------------------------------------------------------------------
#   \b                       the boundary between a word char (\w) and
#                            something that is not a word char
# ----------------------------------------------------------------------
#   (                        group and capture to \1:
# ----------------------------------------------------------------------
#     [A-Z]                    any character of: 'A' to 'Z'
# ----------------------------------------------------------------------
#     [a-z]*                   any character of: 'a' to 'z' (0 or more
#                              times (matching the most amount
#                              possible))
# ----------------------------------------------------------------------
#   )                        end of \1
# ----------------------------------------------------------------------
# )                        end of grouping
# ----------------------------------------------------------------------