# Word-frequency counter: tallies how often each word token occurs in the
# input.  %freq maps a lower-cased token to its number of occurrences.
my %freq;

# Read the input one line at a time via the diamond operator (files named
# on the command line, or STDIN when there are none).  A lexical $line is
# used instead of the implicit $_ so the loop does not clobber the global.
while ( my $line = <> ) {

    # Replace tags with spaces.
    # NOTE(review): '.' does not match a newline here, so a tag that spans
    # more than one input line is not removed -- confirm this is acceptable.
    $line =~ s/<.+?>/ /g;

    # Convert upper- to lower-case, and also convert digits and punctuation
    # to a space.  The replacement list is shorter than the search list, so
    # every character after 'Z' maps to the final replacement character
    # (space); the /s modifier squeezes runs of identical translated
    # characters into one.
    # NOTE: check your output to see whether any other punctuation or
    # non-word characters are getting through, and add those to the tr///
    # as needed.  Also: hyphens might need to be treated differently from
    # other punctuation (keep as-is, or delete, instead of converting to
    # a space?).
    $line =~ tr/A-Z0-9?!.,:;()*"`'-/a-z /s;

    # Remove any/all stop words.
    # NOTE(review): $stopregex is not defined in this block; it is assumed
    # to be set up earlier in the file -- verify it is defined and non-empty.
    $line =~ s/$stopregex//g;

    # At this point the line should contain only word tokens, but filter
    # with grep just in case: only tokens containing a letter are counted.
    for my $token ( grep { /[a-z]/ } split ' ', $line ) {
        $freq{$token}++;
    }
}