in reply to HTML pages indexer

Here is what I came up with. Notes follow below it.

#!/usr/bin/perl

# Index the *.html files in the current directory.
#   %file_index  : file name => { TITLE => ..., DESCRIPTION => ... }
#   %words_index : word => { file name => number of occurrences }
# One summary line per file is printed to STDOUT.

use strict;
use warnings;

my (%file_index, %words_index);

my $default_description = "No description";
my $default_title       = "No title";

opendir(my $dir, ".") or die "Could not read from current directory - $!\n";
my @files = grep { -f and /\.html\z/ } readdir $dir;
closedir $dir;

for my $file (@files) {

    # Three-argument open: a name such as ">x.html" is read, never truncated.
    open(my $fh, '<', $file) or die "Can't open file $file - $!\n";

    # Slurp mode is confined to this one read.
    my $whole_file = do { local $/; <$fh> };
    close($fh);
    $whole_file = '' unless defined $whole_file;

    my $title = _extract_title($whole_file);

    # Ternary rather than "||" so that a page titled "0" keeps its title.
    $file_index{$file}{'TITLE'} = defined $title ? $title : $default_title;

    ## Similar stuff (using META tags perhaps?) goes here
    $file_index{$file}{'DESCRIPTION'} = $default_description;

    for my $word (_extract_words($whole_file)) {
        $words_index{$word}{$file}++;
    }

    print "$file - ($file_index{$file}{'TITLE'}) - " .
          "$file_index{$file}{'DESCRIPTION'}\n";
}

# Return the text of the first <title> element with all runs of whitespace
# collapsed to single spaces, or nothing (undef in scalar context) when the
# page has no title or the title is empty.
sub _extract_title {
    my ($html) = @_;
    return unless defined $html;

    # Non-greedy capture stops at the FIRST closing tag; the surrounding
    # \s* trim leading and trailing whitespace.
    return unless $html =~ /<title\b[^>]*>\s*(.*?)\s*<\/title\s*>/is;

    my $title = $1;
    $title =~ s/\s+/ /g;

    return unless length $title;
    return $title;
}

# Return the list of words (runs of \w characters) found outside HTML tags.
# Matching the words directly never produces empty strings.
sub _extract_words {
    my ($html) = @_;
    return unless defined $html;

    # Replace each tag by a space so words on either side stay separate.
    (my $text = $html) =~ s/<[^>]*>/ /g;

    my @words = $text =~ /\w+/g;
    return @words;
}

Code with notes:

#!/usr/bin/perl

## The shebang needs the "!" - without it the line is only a comment,
## so warnings are never switched on and the script cannot be run directly.

use strict;
use warnings;

## Index the *.html files in the current directory:
##   %file_index  : file name => { TITLE => ..., DESCRIPTION => ... }
##   %words_index : word => { file name => number of occurrences }

my (%file_index, %words_index);

my $default_description = "No description";

## Might as well add a default title to go with the other default:

my $default_title = "No title";

## Always check the result of opendir:

opendir(my $dir, ".") or die "Could not read from current directory - $!\n";
my @files = grep { -f and /\.html\z/ } readdir $dir;
closedir $dir;

for my $file (@files) {

    ## Use the three-argument form of open with an explicit read mode.
    ## With the two-argument form a file named ">foo.html" would be
    ## opened for writing (and emptied) instead of being read.

    open(my $fh, '<', $file) or die "Can't open file $file - $!\n";

    ## It is better to localize global variables than to just undefine
    ## them - and better still to localize them in the smallest scope
    ## possible, so slurp mode is only active for this one read.
    ## Might as well close the file as soon as we are done with it:

    my $whole_file = do { local $/; <$fh> };
    close($fh);
    $whole_file = '' unless defined $whole_file;

    my $title = _extract_title($whole_file);

    ## "$foo = $bar || $baz;" looks cooler, but doesn't account
    ## for people who title their page "0" - hence the ternary test :)

    $file_index{$file}{'TITLE'} = defined $title ? $title : $default_title;

    ## Similar stuff (using META tags perhaps?) goes here
    $file_index{$file}{'DESCRIPTION'} = $default_description;

    for my $word (_extract_words($whole_file)) {
        $words_index{$word}{$file}++;
    }

    print "$file - ($file_index{$file}{'TITLE'}) - " .
          "$file_index{$file}{'DESCRIPTION'}\n";

}

## Return the text of the first <title> element with every run of
## whitespace (tabs, newlines, ...) collapsed to a single space.
## Returns nothing (undef in scalar context) when there is no title or
## the title is empty, so the caller can fall back to its default.

sub _extract_title {
    my ($html) = @_;
    return unless defined $html;

    ## The capture is non-greedy so it stops at the FIRST closing tag;
    ## a greedy ".*" would run on to the last one in the file and would
    ## also swallow the trailing whitespace. Returning early when there
    ## is no match means a stale $1 is never used.
    ## (This is also a good argument to consider using an already
    ## written HTML parser from CPAN)

    return unless $html =~ /<title\b[^>]*>\s*(.*?)\s*<\/title\s*>/is;

    my $title = $1;
    $title =~ s/\s+/ /g;

    return unless length $title;
    return $title;
}

## Return the list of words (runs of \w characters) that appear outside
## HTML tags.

sub _extract_words {
    my ($html) = @_;
    return unless defined $html;

    ## Each tag is replaced by a space, not by an empty string, so the
    ## words on either side of a tag do not run together.

    (my $text = $html) =~ s/<[^>]*>/ /g;

    ## Matching the words directly, instead of splitting on /\W+/,
    ## avoids the empty leading field that split returns whenever the
    ## text starts with a non-word character.

    my @words = $text =~ /\w+/g;
    return @words;
}

Replies are listed 'Best First'.
RE: Re: HTML pages indexer
by larsen (Parson) on Aug 08, 2000 at 01:51 UTC
    Thank you very much.
    As soon as possible, I will vote for your post :)
    I naively burned through my votes this morning :)
    Larsen