# Tokenise: split $doc into a flat list in which every element is either one
# complete "<...>" span or one single character. The /s flag lets "." match
# newlines, so line breaks become tokens too; /g in list context returns
# every match in document order.
my @tokens = $doc =~ /<[^>]*>|./sg;

# Extract text: keep every token that is NOT markup.
#   * A tag token is a complete "<...>" span, so anything else (ordinary
#     characters, including a stray "<" that never gets closed) is text.
#   * A digits-only span such as "<42>" is kept as literal text rather than
#     being discarded as a tag.
# The previous filter, /^<|<[0-9]+>/, selected exactly the tokens beginning
# with "<" -- i.e. it kept the tags and threw the text away -- and its second
# alternative could never match anything the first had not already matched.
# NOTE(review): intent inferred from the "Extract text" / @text_tokens naming;
# confirm that "<digits>" tokens are meant to be treated as text.
my @text_tokens = grep { !/\A<[^>]*>\z/ || /\A<[0-9]+>\z/ } @tokens;

# Do stuff with @text_tokens
# ...

# Print resulting text
print join('', @text_tokens);