Text to XHTML

Enclose paragraphs separated by blank lines in <p> elements; place <br /> before single newlines. Be smart about block elements like <blockquote>, e.g.:

I quote Lord Pilaf:

<blockquote>
Blah blah blah.

Blah!
</blockquote>

becomes

<p>I quote Lord Pilaf:</p>

<blockquote>
<p>Blah blah blah.</p>

<p>Blah!</p>
</blockquote>

# Lookup tables used by render_text().  Each maps a lower-case element
# name to 1 so membership can be tested with a plain hash lookup.

# Elements that are passed through as markup; render_text() escapes any
# tag whose name is not listed here.
# NOTE(review): the posted list contained "i" twice and no "li", although
# "ol" and "ul" are approved; the second "i" is taken to be a truncated
# "li" -- confirm against the original source.
my %is_approved_element = map { $_ => 1 } qw(
    a b big blockquote br center cite
    code dd div dl dt em font hr i img
    li ol p pre samp small span strong
    sub sup table td th tr tt u ul
);

# Elements that must not be wrapped in a generated <p>; a closing tag of
# one of these also ends the currently open paragraph.
my %is_block_element = map { $_ => 1 } qw(
    blockquote center div dl ol pre table ul
);

# Elements inside which no <p> is generated and single newlines are
# copied through instead of becoming <br />.
my %is_raw_element = map { $_ => 1 } qw(dl ol pre table ul);

# Elements that never have a closing tag and are therefore not pushed
# onto the open-tag stack.
my %is_empty_element = map { $_ => 1 } qw(br img hr);

# render_text($str, $no_margin)
#
# Convert lightly marked-up plain text to an XHTML fragment.
#
#   $str        Input text.  Blank lines separate paragraphs, single
#               newlines become <br />, tags listed in
#               %is_approved_element are kept and all other "<", ">" and
#               bare "&" characters are escaped.  <person>name</person>
#               is turned into a link built from $script_name and
#               user_id(name); both must be provided by the surrounding
#               program.
#   $no_margin  When true, a style attribute removing the outer margins
#               is added to the first and last generated <p>.
#
# Returns the generated markup ('' for empty or undefined input).
#
# NOTE(review): attributes of approved tags are emitted verbatim, so
# event-handler attributes or javascript: URLs in user input pass straight
# through.  Filter attributes before using this on untrusted input.
sub render_text {
    my ($str, $no_margin) = @_;
    $str = '' unless defined $str;

    my $buf = '';
    my @tag_stack;      # names of open elements, innermost first
    my $raw  = 0;       # nesting depth of open raw elements
    my $nl   = 0;       # true while a single newline awaits output
    my $in_p = 0;       # true while a generated <p> is open

    # Escape ampersands that do not already start an entity.
    $str =~ s/&(?!\#?[a-zA-Z0-9]+;)/&amp;/g;

    $str =~ s{<person>([^<>]+)</person>}
             {sprintf(qq|<a href="%s?%s">%s</a>|, $script_name, user_id($1), $1)}gei;

    # Consume the input one token at a time.  Test the length rather than
    # the truth of $str so that a remaining "0" is not silently dropped.
    while (length $str) {
        my $out    = '';
        my $need_p = !$in_p && !$raw;

        if ($str =~ s/^([^<>\n]+)//) {
            # A run of plain text.
            $out = $1;

            # Auto-link bare URLs unless we are already inside an <a>.
            # The URL may not contain a double quote, otherwise the text
            # could break out of the generated href attribute.
            unless (grep { $_ eq 'a' } @tag_stack) {
                $out =~ s{((?:http|https|ftp|mailto):[^\s"]+[a-zA-Z0-9/])}
                         {<a href="$1">$1</a>}g;
            }
        } elsif ($str =~ s/^<//) {
            if ($str =~ s/^(\/?)([a-zA-Z][a-zA-Z0-9\.-]*)(.*?)>//) {
                my ($close, $name, $attributes) = ($1, lc $2, $3);

                if ($is_approved_element{$name}) {
                    if ($is_block_element{$name}) {
                        # A <p> tag produced by \n\n should go after a
                        # <blockquote> tag, not before it.
                        $need_p = 0;

                        if ($close && $in_p) {
                            $buf .= '</p>';
                            $in_p = 0;
                        }
                    }

                    if ($close) {
                        # Catch a pending \n so it doesn't get translated
                        # to <br />.
                        if ($nl) {
                            $out .= "\n";
                            $nl = 0;
                        }

                        # Only a tag that closes the innermost open
                        # element is honoured; stray closing tags are
                        # dropped and must not disturb the raw depth.
                        if (@tag_stack && $tag_stack[0] eq $name) {
                            $out .= "</$name>";
                            shift @tag_stack;
                            $raw-- if $is_raw_element{$name};
                        }
                    } else {
                        $out .= "<$name$attributes>";

                        unless ($is_empty_element{$name}) {
                            unshift @tag_stack, $name;
                            $raw++ if $is_raw_element{$name};

                            # Keep newlines that directly follow the
                            # opening tag as they are.
                            if ($str =~ s/^(\n+)//) {
                                $out .= $1;
                            }
                        }
                    }
                } else {
                    # Unknown element: show it as text.
                    $out = "&lt;$close$name$attributes&gt;";
                }
            } else {
                # A "<" that does not start a tag.
                $out = '&lt;';
            }
        } elsif ($str =~ s/^>//) {
            $out = '&gt;';
        } elsif ($str =~ s/^(\n{2,})//) {
            # Blank line(s): end the current paragraph.
            if ($in_p) {
                $buf .= '</p>';
                $in_p = 0;
            }
            $buf .= $1;
        } elsif ($str =~ s/^\n//) {
            if ($raw) {
                $buf .= "\n";
            } else {
                # Defer: becomes <br /> only if more output follows.
                $nl = 1;
            }
        }

        if (length $out) {
            if ($nl) {
                $buf .= "<br />\n";
                $nl = 0;
            }

            if ($need_p) {
                $buf .= '<p>';
                $in_p = 1;
            }

            $buf .= $out;
        }
    }

    # Close whatever is still open.  A paragraph opened inside a block
    # element has to be closed before that block element, mirroring what
    # an explicit closing block tag does in the loop above.
    for my $name (@tag_stack) {
        if ($in_p && $is_block_element{$name}) {
            $buf .= '</p>';
            $in_p = 0;
        }
        $buf .= "</$name>";
    }

    $buf .= '</p>' if $in_p;

    if ($no_margin) {
        my $first_p = index($buf, '<p>');
        my $last_p  = rindex($buf, '<p>');

        if ($first_p < 0) {
            # No generated paragraph: nothing to style.
        } elsif ($first_p == $last_p) {
            substr($buf, $first_p + 2, 0, q| style="margin: 0"|);
        } else {
            # Patch the later offset first so the earlier one stays valid.
            substr($buf, $last_p + 2, 0, q| style="margin-bottom: 0"|);
            substr($buf, $first_p + 2, 0, q| style="margin-top: 0"|);
        }
    }

    return $buf;
}
[download]

Comment on Text to XHTML Download Code

Replies are listed 'Best First'.
Re: Text to XHTML by Aristotle (Chancellor) on Sep 01, 2002 at 20:39 UTC
You may want to have a look at HTML::FromText. Makeshifts last the longest.	[reply]
Re: Text to XHTML by widget (Initiate) on Sep 13, 2002 at 14:20 UTC
Also look at HTML::EasyTags	[reply]