use strict;
use warnings;

# Sample text to scan for repeated words and phrases.
my $text = <<'END_TEXT';
Leonard of Quirm, a character in the Discworld series of novels, is
based largely on Leonardo da Vinci.
Leonardo da Vinci died at Clos Lucé, France, on 2nd May, 1519.
END_TEXT

# Matches a word or whitespace-separated run of words that appears again,
# as whole words, later in the string.  The phrase is left in capture
# group 1.  Because the word run is greedy, the longest repeated phrase
# starting at a given position is the one reported.
my $REPEATED_PHRASE_RX = qr/
    \b
    (                   # group 1: the candidate phrase
        \w+             #   first word
        (?: \s+ \w+ )*  #   any number of further words
    )
    \b
    .+?                 # at least one character of intervening text
    \b \1 \b            # the same phrase again, on word boundaries
/xms;

# repeated_phrases($text)
#
# Returns the list of words and phrases in $text that occur again later in
# the text, ordered by the position of their first occurrence.  For every
# word start, only the longest repeated phrase beginning there is reported,
# so sub-phrases starting at later words (e.g. "da Vinci" inside
# "Leonardo da Vinci") are reported in their own right.
#
# Whitespace runs (including newlines) are collapsed to single spaces on a
# private copy first, so a phrase broken across lines still equals its
# single-line twin.  Returns the empty list for undefined or empty input.
sub repeated_phrases {
    my ($text) = @_;
    return if !defined $text;

    # Work on a copy so the caller's string is left untouched.
    ( my $normalized = $text ) =~ s/\s+/ /g;

    my @phrases;
    while ( $normalized =~ /$REPEATED_PHRASE_RX/g ) {
        push @phrases, $1;

        # A successful match consumes everything up to the end of the
        # second occurrence.  Rewind to one character past the START of
        # this match so that phrases beginning inside it are found too.
        pos($normalized) = $-[0] + 1;
    }

    return @phrases;
}

print "<$_>\n" for repeated_phrases($text);