#!/usr/bin/perl
use strict;
use Encode;
# Read Latin-1 text line by line from the files named in @ARGV (or from
# STDIN when none are given) and emit every word on its own output line.
while ( my $raw_line = <> ) {

    # Turn the raw ISO-8859-1 bytes into a Perl character string so the
    # regex below operates on characters rather than on undecoded bytes.
    my $text = decode( 'iso8859-1', $raw_line );

    # A global match in list context returns every captured word.
    my @tokens = $text =~ /\b(\w+)\b/g;

    # Convert each token back to ISO-8859-1 bytes before writing it out.
    my @encoded = map { encode( 'iso8859-1', $_ ) } @tokens;

    # One token per line; an input line with no words still yields a
    # single empty output line, because the newline is always printed.
    print join( "\n", @encoded ), "\n";
}
####
latin1-tokenizer < latin1.txt > latin1.tkns
####
#!/usr/bin/perl
use strict;
# Tokenize the ISO-8859-1 encoded file named as the first command-line
# argument, writing one word per line to STDOUT (also as ISO-8859-1).
# The PerlIO encoding layers do the decoding and encoding, so the regex
# works on character strings without explicit Encode calls.
@ARGV or die "usage: $0 latin1-file\n";

# Lexical filehandle: scoped to this script body instead of a package-global
# bareword, and closed explicitly once reading is finished.
open( my $in, "<:encoding(iso8859-1)", $ARGV[0] ) or die "couldn't read $ARGV[0]: $!";
binmode STDOUT, ":encoding(iso8859-1)";

# The loop must read from the input handle. An empty `while ()` condition
# is always true in Perl, so without the readline the loop would never
# terminate and never look at the file.
while ( my $line = <$in> ) {

    # A global match in list context returns every captured word.
    my @words = ( $line =~ /\b(\w+)\b/g );

    # One word per line; a line without any words yields one empty line.
    print join "\n", @words;
    print "\n";
}

close $in;
# run it like this: tokenizer latin1.txt > latin1.tkns