use warnings; use strict; my $rx_comma = qr{ \s* , \s* }xms; my $rx_date = qr{ \d{4} - \d\d - \d\d }xms; my $rx_name = qr{ [[:alpha:]] (?: '? [[:alpha:]]+)? }xms; my $rx_hyphenate = qr{ - $rx_name }xms; my $rx_surname = qr{ $rx_name $rx_hyphenate? }xms; my $rx_initial = qr{ [[:alpha:]] \. }xms; my $rx_givenname = qr{ $rx_initial | $rx_surname }xms; my $rx_prof = qr{ $rx_surname (?: $rx_comma (?: \s* $rx_givenname )+ )? }xms; # avoid polluting namespace with a bunch of common variable names. my $rx_facility = do { my $room = qr{ \d{3,4} [[:alpha:]]? }xms; my $range = qr{ $room (?: \s* - \s* $room)? }xms; my $rooms = qr{ $range (?: $rx_comma $range)* }xms; my $function = qr{ \( [^)]+ \) }xms; qr{ $rooms \s* $function }xms; # final regex }; $/ = ""; # paragrep mode while (my $entry = ) { my ($date, $prof, @facilities) = $entry =~ m{ $rx_date | $rx_prof | $rx_facility }xmsg; print <## >perl regex_parse_fields_1.pl date: '2006-01-01' prof: 'O'Reilly,Watson B.' facilities: '406,560(centrifuge,refrig.)' '569b,607(dark room)' '210-211,101(ultracentrifuge)' '104-105(crystal growth rooms)' '660(centrifuge, refrig.)' date: '2007-02-02' prof: 'Olsen, Alfa-Betty Z.' facilities: '102a-102c, 104(media lab)' '101(writer's lounge)' date: '2008-03-04' prof: 'Peebles, P.J.E.' facilities: '1000a - 9999z (physical cosmology lab.)' '000-001 (computational cosmology lab)'