in reply to Replacing left angle bracket with HTML entity when between two backtick characters
I posted a similar piece of code recently, but because of some shortcomings (inexplicable use of '&lgt;' instead of '<' as replacement string; inability to handle multiple '<' characters per backtick group; no handling of escaped '<' characters — the latter two features inspired by the postings of Corion and tybalt89), I've decided to post an update. This is still essentially a programming exercise; I wouldn't necessarily recommend my approach for production code, for which see the efforts of the aforementioned monks. However, even though it handles more features, it is IMHO slightly less hairy regex-wise.
File repl_lt_entity_4.pl:
use 5.010; # needs perl version 5.10 regex extensions use warnings; use strict; use Test::More 'no_plan'; use Test::NoWarnings; my @tests = ( 'no changes to these', [ ('') x 2 ], # unchanged pair [ ('`') x 2 ], [ ('``') x 2 ], [ ('```') x 2 ], [ ('<') x 2 ], [ ('<<') x 2 ], [ ('<<<') x 2 ], [ ('``<>``<>``') x 2 ], [ ('is `not <this> one ') x 2 ], [ ('is not <this> ` one ') x 2 ], [ ('is `not ` <this> one ') x 2 ], [ ('is not <this> ` one`') x 2 ], [ ('is not ``<this> ` one`') x 2 ], [ ('is `not` <this> ` one`') x 2 ], [ ('is \\\\`not <this> one ') x 2 ], [ ('is not <this> \\\\` one ') x 2 ], [ ('is \\\\`not ` <this> one ') x 2 ], [ ('is `not \\\\` <this> one ') x 2 ], [ ('is not <this> \\\\` one `') x 2 ], [ ('is not <this> ` one \\\\`') x 2 ], [ ('``<A>``<B>``<C>``<D>`') x 2 ], 'all these should change', [ '`<A>``<B>``<C>``<D>`', '`<A>``<B>``<C>``<D>`', ], [ '`<A><B><C><D>`', # new: multiple < in bt group '`<A><B><C><D>`', ], [ '`<A>\<B>\\\\<C><D>`', # new: < may be escaped '`<A>\<B>\\\\<C><D>`', ], [ '`<A>``<B>``<C>``<D>``', '`<A>``<B>``<C>``<D>``', ], [ '`<A> <a a>``<B> \<b b>``<C> \\\\<c c>``<D> <d d>``', '`<A> <a a>``<B> \<b b>``<C> \\\\<c c>``<D> < +d d>``', ], [ '```<A>``<B>``<C>``<D>`', '```<A>``<B>``<C>``<D>`', ], [ 'u `v <A> ` w `x <B> y` z', 'u `v <A> ` w `x <B> y` z', ], [ 'u \` `v <A> ` w `x <B> y` z', 'u \` `v <A> ` w `x <B> y` z', ], [ 'u `v \` <A> ` w `x <B> y` z', 'u `v \` <A> ` w `x <B> y` z', ], [ 'u `v <A> \` ` w `x <B> y` z', 'u `v <A> \` ` w `x <B> y` z', ], [ 'u `v <A> ` \` w `x <B> y` z', 'u `v <A> ` \` w `x <B> y` z', ], [ 'u `v <A> ` w \` `x <B> y` z', 'u `v <A> ` w \` `x <B> y` z', ], [ 'u `v <A> ` w `x \` <B> y` z', 'u `v <A> ` w `x \` <B> y` z', ], [ 'u `v <A> ` w `x <B> \` y` z', 'u `v <A> ` w `x <B> \` y` z', ], [ 'u `v <A> ` w `x <B> y` \` z', 'u `v <A> ` w `x <B> y` \` z', ], [ 'is `my <string>` that `also <this> one` too', 'is `my <string>` that `also <this> one` too', ], [ 'is \\\\` `my \\\\` <string>` that `also <this> \\\\` one` \\\\` t +oo', 'is \\\\` `my \\\\` <string>` that `also <this> \\\\` one` \ +\\\` too', ], [ 'x \\\\` `y \\\\` <A>` z `v <B> \\\\` w` \\\\` x', 'x \\\\` `y \\\\` <A>` z `v <B> \\\\` w` \\\\` x', ], [ 'x y \\\\` <A>` z `v <B> \\\\` w` \\\\` x', 'x y \\\\` <A>` z `v <B> \\\\` w` \\\\` x', ], [ 'is \`my <NO> that `but <this> one` yes', 'is \`my <NO> that `but <this> one` yes', ], '(per Corion pm#1222697)', [ 'is \\\\`my <this> that `but <this> one` no', 'is \\\\`my <this> that `but <this> one` no', ], 'is this acceptable for < that are unbalanced?', [ '`<\<\\\\<``\\\\<\<<`', '`<\<\\\\<``\\\\<\<<`', ], [ 'is `this <UN <BALANCED> `ok?', 'is `this <UN <BALANCED> `ok?', ], ); FUNT: for my $func_name (qw(replace_lt_5 replace_lt_6)) { note "\n=== testing $func_name() ===\n\n"; *replace_lt = do { no strict 'refs'; *$func_name; }; VECTOR: for my $ar_vector (@tests) { if (not ref $ar_vector) { note $ar_vector; next VECTOR; } my ($string, $expected) = @$ar_vector; is replace_lt($string), $expected, qq{'$string' -> '$expected'}; } # end for VECTOR } # end for FUNT note "\n=== done testing functions ===\n\n"; done_testing; exit; # Functions UNder Test ############################################# sub replace_lt_5 { # needs 5.10+ regex extensions # handles multiple < in bt group and escaped < my ($string, ) = @_; # replace ALL < patterns in all backtick (bt) groups in a string. # there may be multiple < patterns in a bt group. # a backtick group begins and ends with a ` (backtick) # that is NOT escaped. if the escape is itself escaped, # it does NOT affect the bt. so: # ` can begin or end a bt group; # \` can NOT begin or end a bt group (ordinary char); # \\` escape is escaped: bt can begin/end a bt group. # similarly, < (lt) patterns may be escaped and the escape may # be escaped: # < subject to replacement; # \< escaped: < NOT subject to replacement; # \\< escape is escaped: < subject to replacement. $string =~ s{ # match to just before valid lt in a valid bt group. (?: \G (?! \A) # start just after some valid lt already found (?: (?&TO_NEXT_LT_IN_THIS_BT_GROUP) | (?&TO_FIRST_LT_IN_NEXT_BT_GROUP) ) | \A (?&TO_FIRST_LT_IN_FIRST_BT_GROUP) ) # grab and replace lt. \K # ignore everything matched so far (?&TRU_LT) # replace this (?(DEFINE) # an un-unescaped backtick: may start a bt group. (?<TRU_BT> (?<! (?<! \\) \\) `) # an un-unescaped less-than: may be replaced. (?<TRU_LT> (?<! (?<! \\) \\) <) # any character NOT a true backtick. (?<NOT_BT> (?! (?&TRU_BT)) .) # any character NOT a true backtick and also NOT an lt. (?<NOT_BT_LT> (?! (?&TRU_LT)) (?&NOT_BT)) # a bt-group NOT containing an lt. (?<EMPTY_BT> (?> (?&TRU_BT) (?&NOT_BT_LT)*+ (?&TRU_BT))) # non-bt-group-with-lt stuff to ignore. (?<IGNORE> (?: (?&NOT_BT)*+ (?&EMPTY_BT)*+)*+) # match to first valid lt in first valid bt group. (?<TO_FIRST_LT_IN_FIRST_BT_GROUP> (?&IGNORE) # ignore non-group stuff (?&TRU_BT) (?&NOT_BT_LT)*+ # positioned just before lt (?= (?&NOT_BT)*+ (?&TRU_BT)) # rest is valid bt ) # match to first valid lt in NEXT valid bt group. (?<TO_FIRST_LT_IN_NEXT_BT_GROUP> (?&NOT_BT)*+ (?&TRU_BT) # to end of current group (?&TO_FIRST_LT_IN_FIRST_BT_GROUP) # actually in NEXT group ) # match to next valid lt in current ASSUMED-valid bt group. (?<TO_NEXT_LT_IN_THIS_BT_GROUP> (?&NOT_BT_LT)*+) ) # end (DEFINE) } {<}xmsg; return $string; } # end sub replace_lt_5() sub replace_lt_6 { # needs 5.10+ regex extensions # handles multiple < in bt group and escaped < my ($string, ) = @_; # replace ALL < patterns in all backtick (bt) groups in a string. # there may be multiple < patterns in a bt group. # a backtick group begins and ends with a ` (backtick) # that is NOT escaped. if the escape is itself escaped, # it does NOT affect the bt. so: # ` can begin or end a bt group; # \` can NOT begin or end a bt group (ordinary char); # \\` escape is escaped: bt can begin/end a bt group. # similarly, < (lt) patterns may be escaped and the escape may # be escaped: # < subject to replacement; # \< escaped: < NOT subject to replacement; # \\< escape is escaped: < subject to replacement. # an un-unescaped backtick: may start a bt group. my $tru_bt = qr{ (?<! (?<! \\) \\) ` }xmso; # an un-unescaped less-than: may be replaced. my $tru_lt = qr{ (?<! (?<! \\) \\) < }xmso; # any character NOT a true backtick. my $not_bt = qr{ (?! $tru_bt) . }xmso; # any character NOT a true backtick and also NOT an lt. my $not_bt_lt = qr{ (?! $tru_lt) $not_bt }xmso; # a bt-group NOT containing an lt. my $empty_bt = qr{ (?> $tru_bt $not_bt_lt*+ $tru_bt) }xmso; # non-bt-group-with-lt stuff to ignore. my $ignore = qr{ (?: $not_bt*+ $empty_bt*+)*+ }xmso; # match to first valid lt in first valid bt group. my $to_first_lt_in_first_bt_group = qr{ $ignore # ignore non-group stuff $tru_bt $not_bt_lt*+ # positioned just before lt (?= $not_bt*+ $tru_bt) # rest is valid bt }xmso; # match to first valid lt in NEXT valid bt group. my $to_first_lt_in_next_bt_group = qr{ $not_bt*+ $tru_bt # to end of current group $to_first_lt_in_first_bt_group # actually in NEXT group }xmso; # match to next valid lt in current ASSUMED-valid bt group. my $to_next_lt_in_this_bt_group = qr{ $not_bt_lt*+ }xmso; $string =~ s{ # match to just before valid lt in a valid bt group. (?: \G (?! \A) # start just after some valid lt already found (?: $to_next_lt_in_this_bt_group | $to_first_lt_in_next_bt_group ) | \A $to_first_lt_in_first_bt_group ) # grab and replace lt. \K # ignore everything matched so far $tru_lt # replace this } {<}xmsgo; return $string; } # end sub replace_lt_6()
Update: Here's a slightly more svelte version of the regex logic: it gets rid of one level of alternation nesting. I will only post the ((DEFINE) ... ) version (the qr//-factored version should flow from it in a fairly straightforward way, and there are examples of this translation in the posted code), and I'll only post a drop-in cut/paste of the s/// expression, not a full, working example, so please let me know of any fat-finger errors.
$string =~ s{ # match to just before valid lt in a valid bt group. (?: # matching already started: just after some valid lt: # match to next lt. # (per perlop, \G must be first in regex.) \G (?! \A) (?&TO_NEXT_LT_IN_THIS_OR_NEXT_VALID_BT_GROUP) | # at match start: match to first lt in bt group \A (?&TO_FIRST_LT_IN_VALID_BT_GROUP) ) # grab and replace lt. \K # ignore everything matched so far (?&TRU_LT) # replace this (?(DEFINE) # un-unescaped assertion. (?<UNESCAPED> (?<! (?<! \\) \\) ) # an un-unescaped backtick: may start/end a bt group. (?<TRU_BT> (?&UNESCAPED) `) # an un-unescaped less-than: may be replaced. (?<TRU_LT> (?&UNESCAPED) <) # any character NOT a true backtick. (?<NOT_BT> (?! (?&TRU_BT)) .) # any character NOT a true backtick and also NOT an lt. (?<NOT_BT_LT> (?! (?&TRU_LT)) (?&NOT_BT)) # a bt-group NOT containing an lt. (?<EMPTY_BT> (?> (?&TRU_BT) (?&NOT_BT_LT)*+ (?&TRU_BT))) # non-bt-group-with-lt stuff to ignore. (?<IGNORE> (?: (?&NOT_BT)*+ (?&EMPTY_BT)*+)*+) # to first valid lt in first succeeding valid bt group. # assume matching starts OUTside any valid bt group. (?<TO_FIRST_LT_IN_VALID_BT_GROUP> (?&IGNORE) # ignore non-group stuff (?&TRU_BT) (?&NOT_BT_LT)*+ # positioned just before lt (?= (?&NOT_BT)++ (?&TRU_BT)) # rest is valid bt ) # to next or first lt in this or next valid bt group. # assume matching starts INside a valid bt group. (?<TO_NEXT_LT_IN_THIS_OR_NEXT_VALID_BT_GROUP> # match up to either an lt or a bt (or end of string). (?&NOT_BT_LT)*+ # if a bt, end bt group, match to lt in next valid bt group. (?: (?&TRU_BT) (?&TO_FIRST_LT_IN_VALID_BT_GROUP))?+ ) ) # end (DEFINE) } {$replace}xmsg;
Give a man a fish: <%-{-{-{-<
|
|---|