comment on

Bah. Here's a version that kind of does what you want. You're still much
better off with HTML::Parser - hopefully you'll go far with that.


#!/usr/bin/perl

use strict;
use warnings;

# Driver: run every enabled sample row under __DATA__ through tagparity()
# and print the result, followed by a blank line.
ROW:
while ( <DATA> )
{
  # Rows that start with '#' are switched-off samples.
  next ROW if m/^#/;

  chomp;

  # Each "-- " separated chunk goes on a line of its own; tagparity() then
  # balances the tags line by line.  The & call form sidesteps any
  # prototype declared on tagparity, which is defined further down.
  $_ = &tagparity( join "\n", split /-- / );
  print "$_\n\n";

  # Disabled experiment with tagstrip(), which works on $_ directly:
#  my $no_tags_stripped;
#  do { $no_tags_stripped = &tagstrip($_) } unless $no_tags_stripped;
#  print $_,"\n";

}


exit;


# tagparity($string) -> $string
#
# Gives every line of a newline-separated string its own balanced set of
# markup tags.  A tag opened on one line and closed on a later one is closed
# at the end of each line it spans and re-opened (attributes included) at
# the start of the next, e.g.
#
#   "<one>Here's \nlotsa</one> tagged words"
# becomes
#   "<one>Here's </one>\n<one>lotsa</one> tagged words"
#
# Details:
#   * Open tags are tracked on a stack, so nested tags and tags carrying
#     attributes (<A HREF="...">) are paired by element name, ignoring case.
#   * A closing tag that was never opened gets an opener prepended to its
#     line.
#   * Comments, <!DOCTYPE>, <?...?>, self-closing <x/> tags and HTML void
#     elements (<br>, <hr>, ...) are left alone; they have no mate.
#   * Text is never used as a regex, so metacharacters in it are harmless.
#   * undef, the empty string and tag-free text are returned unchanged.
#
# Mis-nested input (<b><i>x</b></i>) keeps its tag counts balanced per line
# but stays mis-nested; use HTML::Parser if real HTML repair is needed.
#
# Declared without a prototype: the sub takes one argument, and the old
# empty prototype only worked because callers used the & call form.
sub tagparity
{
  my $string = shift;

  # Nothing to balance.
  return $string unless defined $string and $string =~ /</;

  # Elements that never take a closing tag.
  my %is_void = map { $_ => 1 }
    qw( area base br col embed hr img input link meta param source track wbr );

  my @open;    # [ name, full tag text ] of tags still open, outermost first
  my @lines;

  # Limit -1 keeps trailing empty lines so trailing newlines survive.
  for my $line ( split /\n/, $string, -1 )
  {
    # Tags left open by earlier lines are re-opened in front of this one.
    my @reopened = map { "<$_->[1]>" } @open;
    my @strays;    # openers owed to closing tags that were never opened

    while ( $line =~ m{ < (/?) ([^<>]*) > }gx )
    {
      my ( $closing, $text ) = ( $1, $2 );

      # Comments, doctypes, processing instructions, self-closing tags.
      next if $text =~ m{ \A [!?] | / \s* \z }x;

      my ($name) = $text =~ m{ \A \s* ([^\s/]+) }x;
      next unless defined $name;
      next if $is_void{ lc $name };

      if ( !$closing )
      {
        push @open, [ $name, $text ];
        next;
      }

      # Close the innermost open tag of that name, if there is one.
      my ($slot) = grep { lc $open[$_][0] eq lc $name } reverse 0 .. $#open;
      if ( defined $slot )
      {
        splice @open, $slot, 1;
      }
      else
      {
        unshift @strays, "<$name>";
      }
    }

    # Whatever is still open gets closed here, innermost first.
    my @closers = map { "</$_->[0]>" } reverse @open;

    push @lines, join '', @strays, @reopened, $line, @closers;
  }

  return join "\n", @lines;

}



# tagstrip() -> ( $_, $changed )
#
# Strips matched tag pairs out of $_ in place.  Takes no arguments; any that
# are passed are ignored.
#
# Two strategies, tried in this order:
#   1. If $_ holds an exact pair (<x>text</x>, no tags in between), every
#      such pair is stripped, innermost first, until none is left.  Stripping
#      a pair removes all occurrences of that opening and closing tag.
#   2. Otherwise each closing tag </x> is matched against an opener that has
#      the same element name plus optional attributes (<x ...>text</x>), and
#      matched pairs are stripped until a full pass strips nothing.
#
# Returns ( $_, $changed ) in list context and just $changed in scalar
# context.  $changed is 1 if anything was stripped and undef otherwise.
#
# Closing tags that have no strippable opener are skipped rather than
# retried, so mis-nested input is returned as is instead of looping forever.
# Captured tag text is always quoted before being used in a pattern.
#
# Declared without a prototype so it can be called as tagstrip() or, as
# before, &tagstrip($_).
sub tagstrip
{
  my $changed;

  if ( m{ <([^>]*?)> [^<]*? </\1> }x )
  {
    # Each round removes at least the opener it matched, so this ends.
    while ( m{ <([^>]*?)> [^<]*? </\1> }x )
    {
      my $token = $1;
      s{<\Q$token\E>}{}g;
      s{</\Q$token\E>}{}g;
    }
    $changed = 1;
  }

  else
  {
    my $progress = 1;
    while ($progress)
    {
      $progress = 0;

      # The list of closing tags is taken once per pass, so a tag with no
      # usable opener is looked at once and then left alone.
      for my $close ( m{ </([^>]*?)> }gx )
      {
        # Opener: the element name alone, or the name, whitespace and
        # attributes - not merely a longer name with the same prefix.
        next unless m{ <(\Q$close\E(?:\s[^>]*)?)> [^<]*? </\Q$close\E> }x;

        my $open = $1;
        s{<\Q$open\E>}{}g;
        s{</\Q$close\E>}{}g;
        $progress = $changed = 1;
      }
    }
  }

  return $_, $changed;
}


__DATA__
#0This is a -- string of -- words
<b>1This is a -- string of -- words</b>
2This <b>is a -- string</b> of -- words
#3This <i>is</i> a -- <b>nested tagged string </b> of -- words
#4This <i>is a -- <b>nested set</b> of</i> -- tokens
#5This is -- an awesome -- <A HREF="http://google.com">search engine</A>
#6Truly an -- ugly -- <A HREF="http://perl.com"><FONT COLOR="RED">nested</FONT> string</A>
7<one>Here's -- lotsa</one> tagged words <two>that-- need</two> tag parity.
#8This string -- <b>causes -- <A HREF="http://perl.com"><FONT COLOR="RED">my box</b></FONT> to hang</A>
[download]

blyman
setenv EXINIT 'set noai ts=2'

In reply to Re: Re: Re: A nice text processing question by belden
in thread A nice text processing question by moseley

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&amp;`
	<		`&lt;`
	>		`&gt;`
	[		`&#91;`
	]		`&#93;`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.