#!/usr/bin/perl -Twl ###################################################################### # Which language usually uses the longest words? #### # coming from a recreational conversation, this issue was transformed # by the PerlMonks fellows in a real coding debate, so here I come # back with a complete and hopefully quite clean code, in order to # offer some useful maybe to the profane from this recreational thread # #### # AUTHOR: # Marius FERARU, aka AltBlue # DATE: # 2002.11.13 # LICENSE: # Beerware, as the source of all this debate :P~ ###################################################################### use strict; use LWP::Simple; use HTML::Parser 3.00; use POSIX qw(setlocale LC_CTYPE); use locale; use Data::Dumper; { ### Locale, URL pairs that are to be parsed my $ToParse = [ [ 'ca', 'file:/var/www/html/index.html.ca' ], [ 'cz', 'file:/var/www/html/index.html.cz' ], [ 'de', 'file:/var/www/html/index.html.de' ], [ 'dk', 'file:/var/www/html/index.html.dk' ], [ 'ee', 'file:/var/www/html/index.html.ee' ], [ 'el', 'file:/var/www/html/index.html.el' ], [ 'en', 'file:/var/www/html/index.html.en' ], [ 'es', 'file:/var/www/html/index.html.es' ], [ 'fr', 'file:/var/www/html/index.html.fr' ], [ 'it', 'file:/var/www/html/index.html.it' ], [ 'nl', 'file:/var/www/html/index.html.nl' ], [ 'nn', 'file:/var/www/html/index.html.nn' ], [ 'no', 'file:/var/www/html/index.html.no' ], [ 'pt', 'file:/var/www/html/index.html.pt' ], [ 'ru', 'file:/var/www/html/index.html.ru.koi8-r' ], [ 'se', 'file:/var/www/html/index.html.se' ], [ 'zh', 'file:/var/www/html/index.html.zh' ], ]; run_tests($ToParse); } sub run_tests { local $" = ', '; foreach (@{$_[0]}) { setlocale(LC_CTYPE, $_->[0]); print $_->[1]; my @lw = longest_words(strip_html(get($_->[1]))); print length($lw[0]), ' letters: ', "@lw"; } } ###################################################################### # Grabs a chunk of data, computes a list of the longest unique words # around and returns it sorted alphabetically sub longest_words { my $data = shift || return (); my $max = 0; my %longest; while ($data =~ /\b(\w+)\b/sg) { my $word = $1; my $length = length $word; next if $length < $max; if( $max < $length ) { $max = $length; %longest = (); $longest{$word} = 1; } elsif( $max == $length ) { $longest{$word} = 1; } } sort { lc($a) cmp lc($b) } keys %longest; } ###################################################################### # HTML stripping routine sub strip_html { my $buffer = ''; my $p = new HTML::Parser ( api_version => 3, marked_sections => 1, text_h => [ sub { $buffer .= $/ . $_[0] }, 'dtext' ], ); $p->ignore_elements(qw(script style)); $p->parse("@_"); $p->eof; $buffer; }