Finally, let's end this initially wanted recreational thread
with a more serious like post... here is the complete code
for this 'longest words' thingie.
Please note that this is a revisited version, using nice and clean (a.k.a. 'serious') coding style :))
#!/usr/bin/perl -Twl
######################################################################
# Which language usually uses the longest words?
####
# coming from a recreational conversation, this issue was transformed
# by the PerlMonks fellows in a real coding debate, so here I come
# back with a complete and hopefully quite clean code, in order to
# offer some useful maybe to the profane from this recreational thread
#
####
# AUTHOR:
# Marius FERARU, aka AltBlue
# DATE:
# 2002.11.13
# LICENSE:
# Beerware, as the source of all this debate :P~
######################################################################
use strict;
use LWP::Simple;
use HTML::Parser 3.00;
use POSIX qw(setlocale LC_CTYPE);
use locale;
use Data::Dumper;
{ ### Locale, URL pairs that are to be parsed
my $ToParse = [
[ 'ca', 'file:/var/www/html/index.html.ca' ],
[ 'cz', 'file:/var/www/html/index.html.cz' ],
[ 'de', 'file:/var/www/html/index.html.de' ],
[ 'dk', 'file:/var/www/html/index.html.dk' ],
[ 'ee', 'file:/var/www/html/index.html.ee' ],
[ 'el', 'file:/var/www/html/index.html.el' ],
[ 'en', 'file:/var/www/html/index.html.en' ],
[ 'es', 'file:/var/www/html/index.html.es' ],
[ 'fr', 'file:/var/www/html/index.html.fr' ],
[ 'it', 'file:/var/www/html/index.html.it' ],
[ 'nl', 'file:/var/www/html/index.html.nl' ],
[ 'nn', 'file:/var/www/html/index.html.nn' ],
[ 'no', 'file:/var/www/html/index.html.no' ],
[ 'pt', 'file:/var/www/html/index.html.pt' ],
[ 'ru', 'file:/var/www/html/index.html.ru.koi8-r' ],
[ 'se', 'file:/var/www/html/index.html.se' ],
[ 'zh', 'file:/var/www/html/index.html.zh' ],
];
run_tests($ToParse);
}
sub run_tests {
local $" = ', ';
foreach (@{$_[0]}) {
setlocale(LC_CTYPE, $_->[0]);
print $_->[1];
my @lw = longest_words(strip_html(get($_->[1])));
print length($lw[0]), ' letters: ', "@lw";
}
}
######################################################################
# Grabs a chunk of data, computes a list of the longest unique words
# around and returns it sorted alphabetically
sub longest_words {
my $data = shift || return ();
my $max = 0;
my %longest;
while ($data =~ /\b(\w+)\b/sg) {
my $word = $1;
my $length = length $word;
next if $length < $max;
if( $max < $length ) {
$max = $length;
%longest = ();
$longest{$word} = 1;
}
elsif( $max == $length ) {
$longest{$word} = 1;
}
}
sort { lc($a) cmp lc($b) } keys %longest;
}
######################################################################
# HTML stripping routine
sub strip_html {
my $buffer = '';
my $p = new HTML::Parser (
api_version => 3,
marked_sections => 1,
text_h => [ sub { $buffer .= $/ . $_[0] }, 'dtext' ],
);
$p->ignore_elements(qw(script style));
$p->parse("@_");
$p->eof;
$buffer;
}
--
AltBlue.