# Record the document's title, then reduce the document to a plain
# list of words.
#
# Reads:  $whole_file (the document text) and $file (the key used in
#         %file_index).
# Writes: $file_index{$file}{TITLE}, $whole_file (tags removed in
#         place) and @words.

# Capture the text between <title> and </title>.
#   /i  - tag names match in any letter case.
#   /s  - "." also matches newlines, so a title that spans several
#         lines is still found.
#   .*? - non-greedy, so the capture stops at the first closing tag.
# A match in list context returns its captures, or an empty list when
# it fails, so $title is undef if there is no title element.  This
# avoids reading $1 after a failed match, where it would still hold
# the capture left behind by some earlier, unrelated regex.
my ($title) = $whole_file =~ m{<title\b[^>]*>(.*?)</title\s*>}is;
$file_index{ $file }{TITLE} = $title;

# Strip every tag.  No /s is needed here: the negated class [^>]
# already matches newlines, so tags broken across lines are removed too.
$whole_file =~ s/<[^>]*>//g;

# split ' ' (a single-space string, not the regex / /) splits on any
# run of whitespace -- spaces, tabs, newlines -- and discards leading
# whitespace, so @words contains no empty entries.
@words = split ' ', $whole_file;