comment on

OK, this took me a lot longer than I expected it to, and the algorithm ended up a little convoluted, but I think this better handles a few edge cases...

use v5.14;
use strictures;

# Parser - turns a compact, POD-like inline markup into HTML.
#
# Markup has the form  TAG<content|attr="value"|...>  where TAG is one of
# the (upper-case) names accepted by `allowed_tags`.  Tags may nest, plain
# text is entity-encoded, and HTML comments (<!-- ... -->) are dropped.
package Parser {
    use Moo 1.006000;
    use Types::Standard qw( RegexpRef ArrayRef );
    use Text::Balanced qw( extract_bracketed );
    use HTML::Entities qw( encode_entities );
    use namespace::autoclean;

    # Type for `allowed_tags`: a compiled regexp.  An array ref of tag names
    # is coerced into an alternation in which every name is matched literally.
    my $Allowance = RegexpRef->plus_coercions(
        ArrayRef, sub { qr/${\( join "|", map quotemeta, @$_ )}/ },
    );

    # Regexp matching the tag names that may be turned into HTML elements.
    # Accepts a regexp or an array ref of names; defaults to a list of
    # inline elements.
    has allowed_tags => (
        is       => 'ro',
        isa      => $Allowance,
        coerce   => 1,
        builder  => sub {
            [qw(A ABBR ACRONYM B BIG CITE CODE DFN EM I KBD Q SAMP
            SMALL SPAN STRONG SUB SUP TT VAR)]
        },
    );

    # print(@texts)
    #
    # Parses each argument and writes everything parse() returns to the
    # currently selected output handle.  Callable on an instance or directly
    # on the class, in which case a default instance is created.
    sub print {
        my $self = shift;
        $self = $self->new unless ref $self;
        print $self->parse($_) for @_;
    }

    # parse($text)
    #
    # Returns a list: the generated HTML first, followed by any "|"-separated
    # attribute strings that trail the content at this nesting level (the
    # caller that opened the enclosing tag puts those into the start tag).
    # The first element is always a defined string, '' for empty input.
    sub parse
    {
        my $self = shift;
        my ($text) = @_;
        $text //= '';

        # One recursive call is made per sibling tag, so a long run of tags
        # nests deeply; that is expected, so silence the recursion warning.
        no warnings 'recursion';

        my $tags = $self->allowed_tags;

        my ($before, $match) = $text =~ m{
            \A             # start of string
            (.*?)          # leading plain text
            (              # either...
                \<\!--     #    the start of a comment
                |          # or...
                $tags \<   #    an allowed tag name and its opening bracket
            )
        }xms;

        # Leaf case: nothing left to expand.  The first "|"-separated field
        # is text, the remaining fields are raw attribute strings.
        if (!defined $match) {
            my ($content, @attrs) = split /\|/, $text;
            return (encode_entities($content // ''), @attrs);
        }

        # Strip the leading text from $text; it is plain text, so it gets
        # the same entity encoding as leaf text.
        substr($text, 0, length $before) = '';
        my $lead = encode_entities($before);

        # If the first thing that needed to be handled was a comment
        if ($match eq '<!--') {
            # Drop every complete comment.  /s lets a comment span lines and
            # .*? also accepts an empty one.
            $text =~ s/<!--.*?-->//gs;

            # Still starting with an opener means the comment is never
            # closed: emit the opener as literal text so the remaining input
            # shrinks and the recursion terminates.
            if (index($text, '<!--') == 0) {
                substr($text, 0, 4) = '';
                $lead .= encode_entities('<!--');
            }

            # Handle the rest via recursion, keeping attributes separate
            # from the markup so the enclosing tag still receives them.
            my ($markup, @attrs) = $self->parse($text);
            return (join('', $lead, $markup), @attrs);
        }

        # $match is the tag name plus "<"; the HTML element is its
        # lower-cased name.
        chop(my $found_tag = lc $match);
        substr($text, 0, length $found_tag) = '';

        # $text now starts at "<".  Brackets inside quoted strings are
        # ignored so attribute values may contain "<" and ">".
        my ($got, $remainder) = extract_bracketed($text, q/<"'>/);

        # Unbalanced brackets: keep the opener as literal text and carry on
        # with whatever follows the "<".
        if (!defined $got || !length $got) {
            my ($markup, @attrs) = $self->parse(substr $text, 1);
            return (join('', $lead, encode_entities($match), $markup), @attrs);
        }

        # Remove the enclosing "<" and ">" before parsing the content.
        my $inner = substr($got, 1, length($got) - 2);
        my ($markup, @attrs) = $self->parse($inner);
        my ($more_markup, @more_attrs) = $self->parse($remainder);

        my $start_tag = @attrs ? "<$found_tag @attrs>" : "<$found_tag>";

        return (
            join('', $lead, $start_tag, $markup, "</$found_tag>", $more_markup),
            @more_attrs,
        );
    }
}

# Demo: render a sample paragraph that exercises nested tags, attributes,
# a character needing entity encoding ("&") and an HTML comment.
# Each markup line is kept on a single source line: a line break (or a "+"
# continuation marker) inside a tag would become part of the output.
Parser->print(<<'TEXT');
Anyone who watches the Syfy channel knows that on
Monday nights they aired three television series
I<A<EurSUP<e>ka|href="Movies_by_series.pl?series=EWA#EUReKA">|class="title">,
I<A<Warehouse & 13|href="Movies_by_series.pl?series=EWA#Warehouse_13">>,
and I<A<Alphas|href="Movies_by_series.pl?series=EWA#Alphas">>.
Some might not be aware that these three series have formed a crossover
cosmology which I call A<EWA|href="Movies_by_series.pl?series=EWA">
<!-- This is a long string. -->
TEXT
[download]

toby döt ink

In reply to Re: RFC: Is there a better way to use Text::Balanced? by tobyink
in thread RFC: Is there a better way to use Text::Balanced? by Lady_Aleena

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&amp;`
	<		`&lt;`
	>		`&gt;`
	[		`&#91;`
	]		`&#93;`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.