#!/usr/bin/perl
use strict;
use warnings;

# Parses a two-column phone list (the table after __END__) and prints one
# line per person in the form "(LAST), (FIRST) (EXT) (ROOM) (ORG)".
#
# Each input line holds two side-by-side entries.  An entry is either a
# section heading such as "- A -" or "LAST, FIRST  EXT  ROOM  [ORG]".
# Lines that do not consist of exactly two such entries (the column header
# and the dashed ruler, for example) are silently skipped.
#
# NOTE: column positions are not used, so when only one of RM#/ORG is
# filled in, the pattern cannot tell which column the value came from.

# First, we allow no digits anywhere in a name; this lets us detect the
# extension after the name.  Second, we only allow single spaces in a name
# (and it can't start or end with a space).  Third, names must contain a
# comma (but not in front).
my $name = qr{
    ( [^\s\d,] (?: \s? [^\d\s]+ )* )    # last name
    ,
    ( (?: \s? [^\d\s]+ )* )             # first name, keeps its leading space
}x;

# Org can only have single spaces and no commas, but is optional.  The
# trailing lookahead forces an org to end at whitespace or end of string;
# without it the pattern could stop in the middle of the next entry's last
# name and treat the remainder as a (shorter) name.
my $org = qr{
    (?:
        [^\s,]+ (?: \s [^\s,]+ )*
        (?! \S )
    )?
}x;

# Heading must have "-" on each end and just one word between:
my $head = qr{ -\s+[a-z]+\s+- }ix;

# One entry: a heading (no captures) or five captures:
# last, first, ext, room, org.
my $entry = qr{
    \s*
    (?: $head
      | $name \s+ (\d+) \s+ (\S*) \s+ ($org)
    )
}x;

# parse_line($line)
#   Matches one input line against two consecutive entries.
#   Returns a list of array refs [last, first, ext, room, org], one per
#   person found (0, 1 or 2).  Headings and non-matching lines contribute
#   nothing; fields that did not take part in the match become "".
sub parse_line {
    my ($line) = @_;

    # In list context a failed match yields an empty list, so every slice
    # below is then all-undef and no entry is returned.
    my @matches = $line =~ m/^$entry$entry\s*$/;

    my @entries;
    for my $slots ( [ 0 .. 4 ], [ 5 .. 9 ] ) {
        my @fields = map { defined $_ ? $_ : "" } @matches[@$slots];
        push @entries, \@fields if "" ne $fields[0];
    }
    return @entries;
}

# In the top-level script, text after __END__ is readable through DATA.
while ( my $line = <DATA> ) {
    for my $fields ( parse_line($line) ) {
        my ( $last, $first, $ext, $room, $org_name ) = @$fields;
        print "($last), ($first) ($ext) ($room) ($org_name)\n";
    }
}

__END__
NAME                EXT  RM#   ORG     NAME                EXT  RM#   ORG
-------------------------------------  --------------------------------------
- A -                                  BASILE, YYYY        5555 1H08  IAMG
ABEND, YYYYYY       5555 2014  CE      BATES, YYYY         5555 4832  BT
ABRAMS, YYYYY       5555 C-07          BATHERSFIELD, YY    5555 B-39  CE
ADAMS, YYYY         5555 255   OTC     BAXTER, YYYY        5555 A-43
ADAMS, YYYY         5555 149   BT      BEAR, YYYYYY        5555 H42   ATO
ADAMS, YYYYYYY      5555 A-16          BEASLEY, YYY        5555 D-79
ADUAKA, YYYYYYYY    5555 A-52          BEATTY, YY          5555 4832  TAG
AHMED, YYYYYY       5555 C-63          BECHTLE, YYYY       5555 D-26
AHMED, C. YYYYYY    5555 D-69  SOMEU   BEDOYA, YYYYYYYY    5555       CE