# Matches a single HTML tag: a '<', then any number of pieces that are
# either a run of characters other than quotes and '>', or a complete
# double- or single-quoted string, then the closing '>'.  Quoted strings
# are consumed whole, so a '>' inside an attribute value does not end the
# tag.  The group is capturing and repeated, so after a match $1 holds
# only the last piece matched (and is undefined for an empty "<>").
m#<([^"'>]+|"[^"]*"|'[^']*')*>#
####
# Same tag matcher, but the repeated alternation is non-capturing and the
# outer parentheses capture the entire tag, angle brackets included, in $1.
m#(<(?:[^"'>]+|"[^"]*"|'[^']*')*>)#
####
# Skeleton HTML tokenizer: walks $html from left to right and classifies
# each piece as plain text, a comment, a tag, a named entity or a numeric
# entity.  Every pattern is anchored with \G and uses /gc, so pos($html)
# advances only when a branch matches and is left alone when one fails.
#
# The loop ends at the true end of the string.  \z is used rather than $
# because $ also matches just before a trailing newline, which would stop
# the loop with that newline still unprocessed.
while( $html !~ m#\G\z#gc ) {
    if( $html =~ m#\G([^&<]+)#gc ) {
        # $1 is plain text
    } elsif( $html =~ m#\G<!--(.*?)-->#gcs ) {
        # $1 is a comment
        # This branch must come before the tag branch, since "<!-- ... -->"
        # would otherwise be consumed as a tag.  /s lets the comment span
        # several lines.
        # I think HTML comments are defined by the standard
        # to actually be more complex than that, but the
        # practical definition appears to match the above.
    } elsif( $html =~ m#\G<((?:[^"'>]+|"[^"]*"|'[^']*')*)>#gc ) {
        # $1 is the inside of a tag
        # Quoted attribute values are consumed whole, so a '>' inside
        # quotes does not terminate the tag.
    } elsif( $html =~ m#\G&(\w+);#gc ) {
        # $1 is the name of an entity
    } elsif( $html =~ m!\G&\#(\d+|[xX][0-9A-Fa-f]+);!gc ) {
        # $1 is the number of an entity: decimal digits, or hex digits
        # with a leading x/X.  ('!' delimiters because the pattern
        # contains '#'.)
    } else {
        # We have hit invalid HTML.
        # You can try to be lenient here if you like:
        if( $html =~ m#\G([&<])#gc ) {
            # Treat like a & or < if you like
        } else {
            # Anything that is not & or < is consumed by the plain-text
            # branch, so this point should never be reached.
            die "Impossible??";
        }
    }
}