#!/usr/bin/perl -Twl
######################################################################
# Which language usually uses the longest words?
####
# coming from a recreational conversation, this issue was transformed
# by the PerlMonks fellows in a real coding debate, so here I come
# back with a complete and hopefully quite clean code, in order to
# offer some useful maybe to the profane from this recreational thread
#   
####
# AUTHOR:
#     Marius FERARU, aka AltBlue
# DATE:
#     2002.11.13
# LICENSE:
#     Beerware, as the source of all this debate :P~
######################################################################
use strict;
use LWP::Simple;
use HTML::Parser 3.00;
use POSIX qw(setlocale LC_CTYPE);
use locale;
use Data::Dumper;

{ ### Locale, URL pairs that are to be parsed
  my $ToParse = [
    [ 'ca', 'file:/var/www/html/index.html.ca' ],
    [ 'cz', 'file:/var/www/html/index.html.cz' ],
    [ 'de', 'file:/var/www/html/index.html.de' ],
    [ 'dk', 'file:/var/www/html/index.html.dk' ],
    [ 'ee', 'file:/var/www/html/index.html.ee' ],
    [ 'el', 'file:/var/www/html/index.html.el' ],
    [ 'en', 'file:/var/www/html/index.html.en' ],
    [ 'es', 'file:/var/www/html/index.html.es' ],
    [ 'fr', 'file:/var/www/html/index.html.fr' ],
    [ 'it', 'file:/var/www/html/index.html.it' ],
    [ 'nl', 'file:/var/www/html/index.html.nl' ],
    [ 'nn', 'file:/var/www/html/index.html.nn' ],
    [ 'no', 'file:/var/www/html/index.html.no' ],
    [ 'pt', 'file:/var/www/html/index.html.pt' ],
    [ 'ru', 'file:/var/www/html/index.html.ru.koi8-r' ],
    [ 'se', 'file:/var/www/html/index.html.se' ],
    [ 'zh', 'file:/var/www/html/index.html.zh' ],
  ];
  run_tests($ToParse);
}

sub run_tests {
    local $" = ', ';
    foreach (@{$_[0]}) {
        setlocale(LC_CTYPE, $_->[0]);
        print $_->[1];
        my @lw = longest_words(strip_html(get($_->[1])));
        print length($lw[0]), ' letters: ', "@lw";
    }
}
 
######################################################################
# Grabs a chunk of data, computes a list of the longest unique words  
# around and returns it sorted alphabetically
sub longest_words {
    my $data = shift || return ();
    my $max = 0;
    my %longest;
    while ($data =~ /\b(\w+)\b/sg) {
        my $word = $1;
        my $length = length $word;
        next if $length < $max;   

        if( $max < $length ) {
            $max = $length;   
            %longest = ();    
            $longest{$word} = 1;
        }
        elsif( $max == $length ) {
            $longest{$word} = 1;  
        }
    }
    sort { lc($a) cmp lc($b) } keys %longest;
}

######################################################################
# HTML stripping routine
sub strip_html {
    my $buffer = '';
    my $p = new HTML::Parser (
             api_version     => 3,
             marked_sections => 1,
             text_h  => [ sub { $buffer .= $/ . $_[0] }, 'dtext' ],
            );
    $p->ignore_elements(qw(script style));
    $p->parse("@_");
    $p->eof;
    $buffer;
}