# Extract the words from the HTML fragment held in $text.
# $text is modified in place; @words receives the extracted words in order.

# Strip HTML tags. Each tag is replaced with a space rather than nothing so
# that words separated only by markup (e.g. "one</p><p>two") are not fused
# into a single word. This is a simple pattern, not an HTML parser: a ">"
# inside an attribute value or a comment ends the match early.
$text =~ s/<[^>]*>/ /g;

# Strip character entity references (named, decimal, or hexadecimal), again
# leaving a space. Only well-formed entities are matched, so a bare "&" in
# running text (e.g. "Tom & Jerry; friends") does not swallow everything up
# to the next semicolon.
$text =~ s/&(?:\#[0-9]+|\#[xX][0-9A-Fa-f]+|[A-Za-z][A-Za-z0-9]*);/ /g;

# Collect the words: a run of word characters, optionally followed by
# apostrophe-joined runs ("don't"). A single word character is enough, so
# one-letter words such as "a" and "I" are kept.
my @words = $text =~ /(\w+(?:'\w+)*)/g;