# Record the document title, strip the markup, and split the text into words.
   # Reads $whole_file (the complete file contents) and $file (the key used
   # in %file_index); writes $file_index{$file}{TITLE}, modifies $whole_file
   # in place, and fills @words.

   # Capture the contents of <title>...</title>:
   #   /i   matches the tag in any letter case;
   #   /s   lets "." match newlines, so a title spanning lines is captured;
   #   .*?  is non-greedy, so the capture stops at the first closing tag;
   #   \b[^>]* tolerates attributes on the opening tag.
   if ( $whole_file =~ m{<title\b[^>]*>(.*?)</title\s*>}is ) {
      $file_index{ $file }{TITLE} = $1;
   }
   else {
      # Never read $1 after a failed match: it keeps the capture from the
      # last *successful* match, which could be a different file's title.
      $file_index{ $file }{TITLE} = undef;
   }

   # Remove every tag.  No /s is needed here: /s only changes what "."
   # matches, and the negated class [^>] already matches newlines, so tags
   # that span several lines are removed as well.
   # NOTE(review): tags are replaced with nothing, so words separated only
   # by a tag (e.g. "one<br>two") are joined -- confirm that is intended.
   $whole_file =~ s/<[^>]*>//g;

   # split ' ' (a literal single-space string, not a regex) splits on any
   # run of whitespace -- spaces, tabs, newlines -- and discards leading
   # whitespace, so @words never contains empty strings.
   @words = split ' ', $whole_file;