Re: Replacing left angle bracket with HTML entity when between two backtick characters (updated)

I posted a similar piece of code recently, but because of some shortcomings (inexplicable use of '&lgt;' instead of '<' as replacement string; inability to handle multiple '<' characters per backtick group; no handling of escaped '<' characters — the latter two features inspired by the postings of Corion and tybalt89), I've decided to post an update. This is still essentially a programming exercise; I wouldn't necessarily recommend my approach for production code, for which see the efforts of the aforementioned monks. However, even though it handles more features, it is IMHO slightly less hairy regex-wise.

File repl_lt_entity_4.pl:

use 5.010;  # needs perl version 5.10 regex extensions

use warnings;
use strict;

use Test::More 'no_plan';
use Test::NoWarnings;

my @tests = (
  'no changes to these',
  [ ('')                        x 2 ],  # unchanged pair
  [ ('`')                       x 2 ],
  [ ('``')                      x 2 ],
  [ ('```')                     x 2 ],
  [ ('<')                       x 2 ],
  [ ('<<')                      x 2 ],
  [ ('<<<')                     x 2 ],
  [ ('``<>``<>``')              x 2 ],
  [ ('is `not   <this>   one ') x 2 ],
  [ ('is  not   <this> ` one ') x 2 ],
  [ ('is `not ` <this>   one ') x 2 ],
  [ ('is  not   <this> ` one`') x 2 ],
  [ ('is  not ``<this> ` one`') x 2 ],
  [ ('is `not`  <this> ` one`') x 2 ],
  [ ('is \\\\`not       <this>       one      ') x 2 ],
  [ ('is      not       <this> \\\\` one      ') x 2 ],
  [ ('is \\\\`not     ` <this>       one      ') x 2 ],
  [ ('is     `not \\\\` <this>       one      ') x 2 ],
  [ ('is      not       <this> \\\\` one     `') x 2 ],
  [ ('is      not       <this>     ` one \\\\`') x 2 ],
  [ ('``<A>``<B>``<C>``<D>`')                    x 2 ],

  'all these should change',
  [ '`<A>``<B>``<C>``<D>`',
    '`&lt;A>``&lt;B>``&lt;C>``&lt;D>`',
    ],
  [ '`<A><B><C><D>`',               # new: multiple < in bt group
    '`&lt;A>&lt;B>&lt;C>&lt;D>`',
    ],
  [ '`<A>\<B>\\\\<C><D>`',          # new: < may be escaped
    '`&lt;A>\<B>\\\\&lt;C>&lt;D>`',
    ],
  [ '`<A>``<B>``<C>``<D>``',
    '`&lt;A>``&lt;B>``&lt;C>``&lt;D>``',
    ],
  [ '`<A> <a a>``<B> \<b b>``<C> \\\\<c c>``<D> <d d>``',
    '`&lt;A> &lt;a a>``&lt;B> \<b b>``&lt;C> \\\\&lt;c c>``&lt;D> &lt;
+d d>``',
    ],
  [ '```<A>``<B>``<C>``<D>`',
    '```&lt;A>``&lt;B>``&lt;C>``&lt;D>`',
    ],
  [ 'u    `v    <A>    `    w    `x    <B>    y`    z',
    'u    `v    &lt;A>    `    w    `x    &lt;B>    y`    z',
    ],
  [ 'u \` `v    <A>    `    w    `x    <B>    y`    z',
    'u \` `v    &lt;A>    `    w    `x    &lt;B>    y`    z',
    ],
  [ 'u    `v \` <A>    `    w    `x    <B>    y`    z',
    'u    `v \` &lt;A>    `    w    `x    &lt;B>    y`    z',
    ],
  [ 'u    `v    <A> \` `    w    `x    <B>    y`    z',
    'u    `v    &lt;A> \` `    w    `x    &lt;B>    y`    z',
    ],
  [ 'u    `v    <A>    ` \` w    `x    <B>    y`    z',
    'u    `v    &lt;A>    ` \` w    `x    &lt;B>    y`    z',
    ],
  [ 'u    `v    <A>    `    w \` `x    <B>    y`    z',
    'u    `v    &lt;A>    `    w \` `x    &lt;B>    y`    z',
    ],
  [ 'u    `v    <A>    `    w    `x \` <B>    y`    z',
    'u    `v    &lt;A>    `    w    `x \` &lt;B>    y`    z',
    ],
  [ 'u    `v    <A>    `    w    `x    <B> \` y`    z',
    'u    `v    &lt;A>    `    w    `x    &lt;B> \` y`    z',
    ],
  [ 'u    `v    <A>    `    w    `x    <B>    y` \` z',
    'u    `v    &lt;A>    `    w    `x    &lt;B>    y` \` z',
    ],

  [ 'is `my <string>` that `also <this> one` too',
    'is `my &lt;string>` that `also &lt;this> one` too',
    ],
  [ 'is \\\\` `my \\\\` <string>` that `also <this> \\\\` one` \\\\` t
+oo',
    'is \\\\` `my \\\\` &lt;string>` that `also &lt;this> \\\\` one` \
+\\\` too',
    ],
  [ 'x \\\\` `y \\\\` <A>` z `v <B> \\\\` w` \\\\` x',
    'x \\\\` `y \\\\` &lt;A>` z `v &lt;B> \\\\` w` \\\\` x',
    ],
  [ 'x        y \\\\` <A>` z `v <B> \\\\` w` \\\\` x',
    'x        y \\\\` &lt;A>` z `v &lt;B> \\\\` w` \\\\` x',
    ],
  [ 'is \`my <NO> that `but <this> one` yes',
    'is \`my <NO> that `but &lt;this> one` yes',
    ],
  '(per Corion pm#1222697)',
  [ 'is \\\\`my <this> that `but <this> one` no',
    'is \\\\`my &lt;this> that `but <this> one` no',
    ],

  'is this acceptable for < that are unbalanced?',
  [ '`<\<\\\\<``\\\\<\<<`', '`&lt;\<\\\\&lt;``\\\\&lt;\<&lt;`', ],
  [ 'is `this <UN <BALANCED> `ok?',
    'is `this &lt;UN &lt;BALANCED> `ok?',
    ],

  );

FUNT:
for my $func_name (qw(replace_lt_5  replace_lt_6)) {

  note "\n=== testing $func_name() ===\n\n";
  *replace_lt = do { no strict 'refs';  *$func_name; };

  VECTOR:
  for my $ar_vector (@tests) {

    if (not ref $ar_vector) {
      note $ar_vector;
      next VECTOR;
      }

    my ($string, $expected) = @$ar_vector;

    is replace_lt($string), $expected, qq{'$string' -> '$expected'};

    }  # end for VECTOR

  }  # end for FUNT

note "\n=== done testing functions ===\n\n";

done_testing;

exit;

# Functions UNder Test #############################################

sub replace_lt_5 {  # needs 5.10+ regex extensions

    # handles multiple < in bt group and escaped <

    my ($string,
        ) = @_;

    # replace ALL < patterns in all backtick (bt) groups in a string.
    # there may be multiple < patterns in a bt group.

    # a backtick group begins and ends with a ` (backtick)
    # that is NOT escaped.  if the escape is itself escaped,
    # it does NOT affect the bt.  so:
    #   `     can begin or end a bt group;
    #   \`    can NOT begin or end a bt group (ordinary char);
    #   \\`   escape is escaped: bt can begin/end a bt group.

    # similarly, < (lt) patterns may be escaped and the escape may
    # be escaped:
    #   <     subject to replacement;
    #   \<    escaped: < NOT subject to replacement;
    #   \\<   escape is escaped: < subject to replacement.

    $string =~ s{

      # match to just before valid lt in a valid bt group.
      (?: \G (?! \A)  # start just after some valid lt already found
        (?:
          (?&TO_NEXT_LT_IN_THIS_BT_GROUP)
          |
          (?&TO_FIRST_LT_IN_NEXT_BT_GROUP)
        )
        |
        \A (?&TO_FIRST_LT_IN_FIRST_BT_GROUP)
      )

      # grab and replace lt.
      \K          # ignore everything matched so far
      (?&TRU_LT)  # replace this

      (?(DEFINE)

        # an un-unescaped backtick: may start a bt group.
        (?<TRU_BT>    (?<! (?<! \\) \\) `)
        # an un-unescaped less-than: may be replaced.
        (?<TRU_LT>    (?<! (?<! \\) \\) <)
        # any character NOT a true backtick.
        (?<NOT_BT>    (?! (?&TRU_BT)) .)
        # any character NOT a true backtick and also NOT an lt.
        (?<NOT_BT_LT> (?! (?&TRU_LT)) (?&NOT_BT))
        # a bt-group NOT containing an lt.
        (?<EMPTY_BT>  (?> (?&TRU_BT) (?&NOT_BT_LT)*+ (?&TRU_BT)))
        # non-bt-group-with-lt stuff to ignore.
        (?<IGNORE>    (?: (?&NOT_BT)*+ (?&EMPTY_BT)*+)*+)

        # match to first valid lt in first valid bt group.
        (?<TO_FIRST_LT_IN_FIRST_BT_GROUP>
          (?&IGNORE)                    # ignore non-group stuff
          (?&TRU_BT) (?&NOT_BT_LT)*+    # positioned just before lt
          (?= (?&NOT_BT)*+ (?&TRU_BT))  # rest is valid bt
          )

        # match to first valid lt in NEXT valid bt group.
        (?<TO_FIRST_LT_IN_NEXT_BT_GROUP>
          (?&NOT_BT)*+ (?&TRU_BT)            # to end of current group
          (?&TO_FIRST_LT_IN_FIRST_BT_GROUP)  # actually in NEXT group
          )

        # match to next valid lt in current ASSUMED-valid bt group.
        (?<TO_NEXT_LT_IN_THIS_BT_GROUP> (?&NOT_BT_LT)*+)

        )  # end (DEFINE)

      }
      {&lt;}xmsg;

    return $string;

    }  # end sub replace_lt_5()

sub replace_lt_6 {  # needs 5.10+ regex extensions

    # handles multiple < in bt group and escaped <

    my ($string,
        ) = @_;

    # replace ALL < patterns in all backtick (bt) groups in a string.
    # there may be multiple < patterns in a bt group.

    # a backtick group begins and ends with a ` (backtick)
    # that is NOT escaped.  if the escape is itself escaped,
    # it does NOT affect the bt.  so:
    #   `     can begin or end a bt group;
    #   \`    can NOT begin or end a bt group (ordinary char);
    #   \\`   escape is escaped: bt can begin/end a bt group.

    # similarly, < (lt) patterns may be escaped and the escape may
    # be escaped:
    #   <     subject to replacement;
    #   \<    escaped: < NOT subject to replacement;
    #   \\<   escape is escaped: < subject to replacement.

    # an un-unescaped backtick: may start a bt group.
    my $tru_bt    = qr{ (?<! (?<! \\) \\) `               }xmso;
    # an un-unescaped less-than: may be replaced.
    my $tru_lt    = qr{ (?<! (?<! \\) \\) <               }xmso;
    # any character NOT a true backtick.
    my $not_bt    = qr{ (?! $tru_bt) .                    }xmso;
    # any character NOT a true backtick and also NOT an lt.
    my $not_bt_lt = qr{ (?! $tru_lt) $not_bt              }xmso;
    # a bt-group NOT containing an lt.
    my $empty_bt  = qr{ (?> $tru_bt $not_bt_lt*+ $tru_bt) }xmso;
    # non-bt-group-with-lt stuff to ignore.
    my $ignore    = qr{ (?: $not_bt*+ $empty_bt*+)*+      }xmso;

    # match to first valid lt in first valid bt group.
    my $to_first_lt_in_first_bt_group = qr{
        $ignore                 # ignore non-group stuff
        $tru_bt $not_bt_lt*+    # positioned just before lt
        (?= $not_bt*+ $tru_bt)  # rest is valid bt
        }xmso;

    # match to first valid lt in NEXT valid bt group.
    my $to_first_lt_in_next_bt_group = qr{
        $not_bt*+ $tru_bt               # to end of current group
        $to_first_lt_in_first_bt_group  # actually in NEXT group
        }xmso;

    # match to next valid lt in current ASSUMED-valid bt group.
    my $to_next_lt_in_this_bt_group = qr{ $not_bt_lt*+ }xmso;

    $string =~ s{

      # match to just before valid lt in a valid bt group.
      (?: \G (?! \A)  # start just after some valid lt already found
        (?:
          $to_next_lt_in_this_bt_group
          |
          $to_first_lt_in_next_bt_group
        )
        |
        \A $to_first_lt_in_first_bt_group
      )

      # grab and replace lt.
      \K       # ignore everything matched so far
      $tru_lt  # replace this

      }
      {&lt;}xmsgo;

    return $string;

    }  # end sub replace_lt_6()
[download]

I won't post the output. And more test cases never hurt.

Update: Here's a slightly more svelte version of the regex logic: it gets rid of one level of alternation nesting. I will only post the ((DEFINE) ... ) version (the qr//-factored version should flow from it in a fairly straightforward way, and there are examples of this translation in the posted code), and I'll only post a drop-in cut/paste of the s/// expression, not a full, working example, so please let me know of any fat-finger errors.

$string =~ s{

    # match to just before valid lt in a valid bt group.
    (?:
      # matching already started: just after some valid lt:
      # match to next lt.
      # (per perlop, \G must be first in regex.)
      \G (?! \A)  (?&TO_NEXT_LT_IN_THIS_OR_NEXT_VALID_BT_GROUP)
      |
      # at match start: match to first lt in bt group
      \A (?&TO_FIRST_LT_IN_VALID_BT_GROUP)
    )

    # grab and replace lt.
    \K          # ignore everything matched so far
    (?&TRU_LT)  # replace this

    (?(DEFINE)

      # un-unescaped assertion.
      (?<UNESCAPED> (?<! (?<! \\) \\) )
      # an un-unescaped backtick: may start/end a bt group.
      (?<TRU_BT>    (?&UNESCAPED) `)
      # an un-unescaped less-than: may be replaced.
      (?<TRU_LT>    (?&UNESCAPED) <)
      # any character NOT a true backtick.
      (?<NOT_BT>    (?! (?&TRU_BT)) .)
      # any character NOT a true backtick and also NOT an lt.
      (?<NOT_BT_LT> (?! (?&TRU_LT)) (?&NOT_BT))
      # a bt-group NOT containing an lt.
      (?<EMPTY_BT>  (?> (?&TRU_BT) (?&NOT_BT_LT)*+ (?&TRU_BT)))
      # non-bt-group-with-lt stuff to ignore.
      (?<IGNORE>    (?: (?&NOT_BT)*+ (?&EMPTY_BT)*+)*+)

      # to first valid lt in first succeeding valid bt group.
      # assume matching starts OUTside any valid bt group.
      (?<TO_FIRST_LT_IN_VALID_BT_GROUP>
        (?&IGNORE)                    # ignore non-group stuff
        (?&TRU_BT) (?&NOT_BT_LT)*+    # positioned just before lt
        (?= (?&NOT_BT)++ (?&TRU_BT))  # rest is valid bt
        )

      # to next or first lt in this or next valid bt group.
      # assume matching starts INside a valid bt group.
      (?<TO_NEXT_LT_IN_THIS_OR_NEXT_VALID_BT_GROUP>
        # match up to either an lt or a bt (or end of string).
        (?&NOT_BT_LT)*+
        # if a bt, end bt group, match to lt in next valid bt group.
        (?: (?&TRU_BT) (?&TO_FIRST_LT_IN_VALID_BT_GROUP))?+
        )

      )  # end (DEFINE)

    }
    {$replace}xmsg;
[download]

(The $replace at the end is for my current development testing. Set it to the replacement string, or replace it with a string literal, e.g., '<' as before.)

Give a man a fish: <%-{-{-{-<

Comment on Re: Replacing left angle bracket with HTML entity when between two backtick characters (updated) Select or Download Code