# Tokenize $html in a single left-to-right pass.
#
# Every pattern is anchored with \G and uses the /gc flags: \G makes the
# match start exactly at pos($html), /g advances pos($html) past a
# successful match, and /c keeps pos($html) unchanged when a match fails so
# the next branch can try from the same position.
#
# The loop ends once pos($html) has reached the true end of the string.
# \z is used instead of $ because $ also matches just before a final
# newline, which would end the loop with that newline left untokenized.
while( $html !~ m#\G\z#gc ) {
    if( $html =~ m#\G([^&<]+)#gc ) {
        # $1 is plain text
    } elsif( $html =~ m#\G<!--(.*?)-->#gcs ) {
        # $1 is a comment
        # I think HTML comments are defined by the standard
        # to actually be more complex than that, but the
        # practical definition appears to match the above.
        # /s lets the comment body span multiple lines.  This branch must
        # stay ahead of the tag branch, which would otherwise accept
        # "<!-- ... -->" as a tag whenever the comment contains no ">".
    } elsif( $html =~ m#\G<((?:(?>[^"'>]+)|"[^"]*"|'[^']*')*)>#gc ) {
        # $1 is the inside of a tag
        # Quoted attribute values may contain ">" without ending the tag.
        # The atomic group (?>...) stops the engine from re-splitting a run
        # of unquoted characters in every possible way when the tag is
        # malformed; it does not change which strings match.
    } elsif( $html =~ m#\G&(\w+);#gc ) {
        # $1 is the name of an entity
    } elsif( $html =~ m!\G&\#(\w+);!gc ) {
        # $1 is the number of an entity
        # \w+ is deliberately loose: it accepts decimal ("65") and hex
        # ("x41") forms, but also non-numeric junk, so validate $1 before
        # converting it to a character.
    } else {
        # We have hit invalid HTML.
        # You can try to be lenient here if you like:
        if( $html =~ m#\G([&<])#gc ) {
            # Treat like a & or < if you like
        } else {
            # Not at end of string and the plain-text branch failed, so the
            # next character has to be "&" or "<"; this should never run.
            die "Impossible??";
        }
    }
}