Ms Foo
Mr Bar
Mss Toto
Dr Tata
####
Ms Foo
Mr Bar
Mss Toto
Dr Tata
##
##
my %change=( 'CommitteeList' => 'person', # CommitteeList tag should become person
'Ch_Chair' => 'person[officer]'); # Ch_Chair tag should become person too
# but consecutive officers should then be
# wrapped in an officer tag
my %wrap= ( officers => 'person[officer]+', # this defines the wrapping
perslist => 'officers?, person+'); # then we wrap the whole list in a perslist tag
my @wrap=( 'officers', 'perslist'); # to process them in the proper order
##
##
my %wrapper; # stores the regular expression generated from %wrap
local $/=""; # I have previously inserted those tags so I can split my file in independant chunks
while(my $bit= <$infile>)
{ # the chunk tag is removed here, amongst other irrelevant things
foreach my $tag ( keys %change)
{ change_tag( $bit, $tag, $change{$tag}); } # looks easy isn't it?
foreach my $tag (@wrap)
{ wrap( $bit, $tag, $wrap{$tag}); } # easy too!
# remove the extra er attributes (amongst other things) here
print $bit; # spit it out
}
# this one is easy
# change source_tag into target_tag, adding an er attribute if specified
sub change_tag($$$)
{ my $source_tag = $_[1] || warn "no source_tag";
my $target_tag= $_[2] || warn "no target_tag";
my $target_att='';
# check if an attribute should be included (in brackets)
if( $target_tag=~ /(\w+)\[(\w+)\]/)
{ $target_tag=$1; $target_att=$2 };
# replace opening tags, existing attributes are untouched
if( $target_att)
{ $_[0]=~ s{<$source_tag\b} {<$target_tag er="$target_att"}gs; }
else
{ $_[0]=~ s{<$source_tag\b} {<$target_tag}gs; }
# replace end tags
$_[0]=~ s{$source_tag>} {$target_tag>}gs;
}
# this one is a little scarier (especially the last line!)
sub wrap($$$)
{ my $tag= $_[1];
my $expr=$_[2];
$wrapper{$tag}||= make_wrapper( $expr, $tag);
&{$wrapper{$tag}}; # Gee, this looks weird! Did I unknowingly use the
# "call this function with the same @_" trick? I guess so
}
# this one is the one that does the real work
sub make_wrapper($$)
{ my( $expr, $tag)= @_;
my $att= '';
my $subr;
# figure out whether an attribute should be included
if( $tag=~ /(\w+)\[(\w+)\]/)
{ $tag=$1; $att=$2 };
# build regexp from the nicer syntax
$expr=~ s{(\w+)\b(?![\[\]])}{(<$1.*?$1>\\\s*)}g; # no attribute given
$expr=~ s{(\w+)\[(\w+)\]}{(<$1 er=\"$2\".*?$1>\\\s*)}g; # attribute given
$expr=~ s{,\s*} {\\\s*}g;
# now build the wrapper subroutine, replacing the expression by the tag
if( $att)
{ $subr= "{ ".'$_[0]'."=~ s{($expr)}{<$tag er=\"$att\">\n".'$1'."\n$tag>}sgo;} "; }
else
{ $subr= "{ ".'$_[0]'."=~ s{($expr)}{<$tag>\n".'$1'."\n$tag>}sgo;} "; }
return eval "sub { $subr }";
}