# Build the plain text for a list of tokens produced by the tokeniser below.
#   - "<NNN>" (decimal digits only) becomes the character with code NNN.
#   - Any other token that starts with "<" (a tag, or a lone "<") is dropped.
#   - Every other token is copied through unchanged.
# Takes the tokens as a flat list; returns the concatenated string
# ('' when the list is empty).
sub _text_from_tokens {
    my @tokens = @_;

    my $text = '';
    for my $token (@tokens) {
        if ($token =~ /^<([0-9]+)>/) {
            $text .= chr($1);
        }
        elsif ($token =~ /^[^<]/) {
            # This pattern has no capture group, so $1 would be undef here;
            # append the token itself.
            $text .= $token;
        }
    }

    return $text;
}

# Tokenise: each token is either a complete <...> tag or one single
# character (/s lets "." match newlines as well).
my @tokens = $doc =~ /<[^>]*>|./sg;

# Extract text
my $text = _text_from_tokens(@tokens);

# Do stuff with $text
# ...

# Print resulting text
print $text;