#!/usr/bin/perl
use strict;
use warnings;

# Index every *.html file in the current directory.
#
# Builds two hashes:
#   %file_index  - file name => { TITLE => ..., DESCRIPTION => ... }
#   %words_index - word => { file name => number of occurrences }
# and prints one "file - (title) - description" line per file.
my (%file_index, %words_index);

my $default_description = "No description";
my $default_title       = "No title";

opendir(my $dir, ".")
    or die "Could not read from current directory - $!\n";
my @files = grep { -f $_ and /\.html$/ } readdir $dir;
closedir $dir;

for my $file (@files) {
    my $whole_file = slurp_file($file);

    # defined-test rather than ||, so a page titled "0" keeps its title.
    my $title = extract_title($whole_file);
    $file_index{$file}{'TITLE'} = defined $title ? $title : $default_title;
    ## Similar stuff (using META tags perhaps?) goes here
    $file_index{$file}{'DESCRIPTION'} = $default_description;

    for my $word (extract_words($whole_file)) {
        $words_index{$word}{$file}++;
    }

    print "$file - ($file_index{$file}{'TITLE'}) - " .
          "$file_index{$file}{'DESCRIPTION'}\n";
}

# Return the entire contents of $path as one string; dies if it cannot
# be opened. Uses three-argument open with a lexical handle so the file
# name is never interpreted as an open mode.
sub slurp_file {
    my ($path) = @_;
    open(my $fh, '<', $path) or die "Can't open file $path - $!\n";
    local $/;    # slurp mode, limited to this sub
    my $content = <$fh>;
    close($fh);
    return defined $content ? $content : '';
}

# Return the text of the first <TITLE> element in $html with surrounding
# whitespace removed and internal whitespace runs collapsed to one space.
# Returns undef (empty list in list context) when there is no title.
sub extract_title {
    my ($html) = @_;
    # Non-greedy capture: stop at the first closing tag, not the last one
    # found anywhere in the document.
    return unless $html =~ m{<TITLE>\s*(.*?)\s*</TITLE>}is;
    my $title = $1;
    $title =~ s/\s+/ /g;
    return $title;
}

# Return the list of words in $html after replacing each tag with a
# space. Words are maximal runs of \w characters.
sub extract_words {
    my ($html) = @_;
    (my $text = $html) =~ s/<[^>]*>/ /g;
    # split yields an empty leading field when the text starts with a
    # separator (i.e. with a tag); drop it so '' is never indexed.
    return grep { length } split /\W+/, $text;
}
Code with notes:
#!/usr/bin/perl
## The interpreter line needs the "!" after "#"; without it the line is
## only a comment and any switches on it are ignored.
use strict;
use warnings;

## Indexes built by this script:
##   %file_index  - file name => { TITLE => ..., DESCRIPTION => ... }
##   %words_index - word => { file name => number of occurrences }
my (%file_index, %words_index);

my $default_description = "No description";
## Might as well add a default title to go with the other default:
my $default_title = "No title";

## Always check the result of opendir:
opendir(my $dir, ".") or
    die "Could not read from current directory - $!\n";
my @files = grep { -f $_ and /\.html$/ } readdir $dir;
closedir $dir;

for my $file (@files) {
    my $whole_file = slurp_file($file);

    ## "$foo = $bar || $baz;" looks cooler, but doesn't account
    ## for people who title their page "0" - hence the ternary test :)
    my $title = extract_title($whole_file);
    $file_index{$file}{'TITLE'} = defined $title ? $title : $default_title;
    ## Similar stuff (using META tags perhaps?) goes here
    $file_index{$file}{'DESCRIPTION'} = $default_description;

    for my $word (extract_words($whole_file)) {
        $words_index{$word}{$file}++;
    }

    print "$file - ($file_index{$file}{'TITLE'}) - " .
          "$file_index{$file}{'DESCRIPTION'}\n";
}

## Read a whole file into one string; dies if the file cannot be opened.
sub slurp_file {
    my ($path) = @_;
    ## Three-argument open with a lexical handle: the file name can never
    ## be mistaken for an open mode.
    open(my $fh, '<', $path) or die "Can't open file $path - $!\n";
    ## It is better to localize global variables than to just undefine
    ## them. Localizing inside this sub also means slurp mode ends when
    ## the sub returns instead of lasting for the rest of the script.
    local $/;
    my $content = <$fh>;
    ## Might as well close the file as soon as we are done with it:
    close($fh);
    return defined $content ? $content : '';
}

## Return the cleaned-up text of the first <TITLE> element, or undef
## (empty list in list context) when the page has none.
sub extract_title {
    my ($html) = @_;
    ## Need the explicit match test because $1 might hang around
    ## from a previous match and mess us up. The capture is non-greedy so
    ## it stops at the first closing tag rather than the last one in the
    ## document, and the \s* on either side trims the ends.
    return unless $html =~ m{<TITLE>\s*(.*?)\s*</TITLE>}is;
    my $title = $1;
    ## Change every run of whitespace (e.g. tabs and newlines) into a
    ## single space.
    ## (This is also a good argument to consider using an already written
    ## HTML parser from CPAN)
    $title =~ s/\s+/ /g;
    return $title;
}

## Return the words of a page: tags are replaced by a space (a space, not
## an empty string, so words on either side of a tag stay separate), then
## the text is split on runs of non-word characters.
sub extract_words {
    my ($html) = @_;
    (my $text = $html) =~ s/<[^>]*>/ /g;
    ## split produces an empty leading field when the text starts with a
    ## separator, which it does whenever the page starts with a tag;
    ## filter it out so the empty string is not indexed as a word.
    return grep { length } split /\W+/, $text;
}
|