in reply to Help formatting text to delimited text in file
Here's another approach. The module is extremely simple: no exportation, no OO. Invocation must be by fully-qualified function names, but that's part of the simplicity.
The idea is that the regexes used for record field validation and extraction can be adapted very precisely to the data. (These regexes are currently rather general and naive; names and titles and such can be very tricky.) No effort is made to check for play index number/name duplication. The test script should be expanded to cover many edge cases. Note that the "Polonius" output field of play 46005 has been "fixed": The trailing space present in the OP has been removed; if this whitespace needs to remain, you need to specify under what conditions trailing whitespace is preserved.
Test file ParsePlaybill.t:# ParsePlaybill.pm 20apr19waw package ParsePlaybill; use warnings; use strict; # use Data::Dump qw(dd); # for debug # parsing regexes. # general record format. my $rx_intro = qr{ \A \^ }xms; # start of record my $rx_outro = qr{ \s* \+ \Z }xms; # end of record # index number field and separator from title field. my $rx_index = qr{ \b \d+ \b }xms; # index number my $rx_index_sep = qr{ \s* % \s* }xms; # index number separator # general identifier regex. used extensively: may or may not be # appropriate for all dependent patterns (play titles, role titles, # actor names, etc). my $rx_ident = qr{ [[:alpha:]] [-'.[:alnum:]]* }xms; # play title field and separator from first role. my $rx_title = qr{ \b $rx_ident (?: \s+ $rx_ident)* \b }xms; my $rx_title_sep = qr{ \s* \Q.|\E \s* }xms; # role character name and name of actor in the role. my $rx_char = qr{ \b $rx_ident (?: \s+ $rx_ident)* \b }xms; my $rx_player = qr{ \b $rx_ident (?: \s+ $rx_ident)* \b }xms; # role character/actor compound field. my $rx_role_sep = qr{ \s* - \s* }xms; my $rx_role_term = qr{ \s* ; \s* }xms; my $rx_role = qr{ $rx_char $rx_role_sep $rx_player $rx_role_term }xms; # record parsing and formatting functions. sub rec { my ($record, ) = @_; my $parsed = my ($n_index, $title, $role_list) = $record =~ m{ $rx_intro ($rx_index) $rx_index_sep # must have index number ($rx_title) $rx_title_sep # must have play title ($rx_role+) # must have at least one role $rx_outro }xms; die "bad record '$record'" unless $parsed; my @roles; # character/actor fields as array refs. push @roles, [ $1, $2 ] while $role_list =~ m{ ($rx_char) $rx_role_sep ($rx_player) }xmsg; return $n_index, $title, @roles; } sub fmt { my ($n_index, $title, @roles, # array of array refs.: character/actor pairs ) = @_; my $head = qq{"$n_index","$title"}; # put header on each character/actor role pair. @roles = map qq{$head,"$_->[0]","$_->[1]"}, @roles; # terminate every headed role with a newline. return join "\n", @roles, ''; } 1; # flag successful module inclusion
# ParsePlaybill.t 20apr19waw use warnings; use strict; use Test::More 'no_plan'; use Test::NoWarnings; # use Data::Dump qw(dd); # for debug BEGIN { use_ok 'ParsePlaybill'; } my @Tests = ( [ "^46004 % Tamerlane.| Tamerlane - Sheridan; Bajazet - Barry; Mones +es - A Gentleman; Arpasia - Mrs. Furnival; Selima - Mrs. Elmy; +\n", <<'EOT', "46004","Tamerlane","Tamerlane","Sheridan" "46004","Tamerlane","Bajazet","Barry" "46004","Tamerlane","Moneses","A Gentleman" "46004","Tamerlane","Arpasia","Mrs. Furnival" "46004","Tamerlane","Selima","Mrs. Elmy" EOT ], 'Polonius "fixed" in this test case', [ "^46005 % Hamlet.| Hamlet - Sheridan; Polonius - J. Morris; Laerte +s- Lacy; Ophelia- Mrs. Storer; Queen - Mrs. Furnival; +\n", <<'EOT', "46005","Hamlet","Hamlet","Sheridan" "46005","Hamlet","Polonius","J. Morris" "46005","Hamlet","Laertes","Lacy" "46005","Hamlet","Ophelia","Mrs. Storer" "46005","Hamlet","Queen","Mrs. Furnival" EOT ], 'one-character play', [ "^1 % Mark Twain Tonight .| Mark Twain -Hal Holbrook;+\n", <<'EOT', "1","Mark Twain Tonight","Mark Twain","Hal Holbrook" EOT ], ); VECTOR: for my $ar_vector (@Tests) { if (not ref $ar_vector) { note $ar_vector; next VECTOR; } my ($record, $expected) = @$ar_vector; my @fields = ParsePlaybill::rec $record; my $got = ParsePlaybill::fmt @fields; ok $got eq $expected, "$record" ; } # end for VECTOR
Run under Perl version 5.8.9.c:\@Work\Perl\monks\jcg3525>perl ParsePlaybill.t ok 1 - use ParsePlaybill; ok 2 - ^46004 % Tamerlane.| Tamerlane - Sheridan; Bajazet - Barry; Mon +eses - A Gentleman; Arpasia - Mrs. Furnival; Selima - Mrs. Elmy; + # # Polonius "fixed" in this test case ok 3 - ^46005 % Hamlet.| Hamlet - Sheridan; Polonius - J. Morris; Laer +tes- Lacy; Ophelia- Mrs. Storer; Queen - Mrs. Furnival; + # # one-character play ok 4 - ^1 % Mark Twain Tonight .| Mark Twain -Hal Holbrook;+ # ok 5 - no warnings 1..5
Update: As it stands, this code has some (small, I hope) problems:
Give a man a fish: <%-{-{-{-<
|
|---|