#!/usr/bin/perl # Program reads in any and all files named "Vortaro_.*" from the current directory. # It assumes each file to contain one or more jumbled dictionaries in Esperanto from # some EPub at Eldonejo Mistera Sturno. For each file read in, a new file "new_Vortaro_.*\.txt" # is written, no longer jumbled but fully alphabetized. It may, however, have some duplicates. # This is because the word was defined plurally in the jumble. A list of these duplicates will # be listed in a file "keys_plural.txt" as a guide to manual sorting out. use utf8; use open qw(:std :utf8); use Cwd; ################### # BEGIN USER DEFS # ################### # Where things are. my $dir_input = './'; my $dir_output = './'; # Regular Expression variables my $regex_0 = qr(Vortaro_.*); # Input files to parse my $regex_1 = qr(
); # Left boundary of definition word.
my $regex_2 = qr(<.*); # Right boundary of definition word.
#################
# END USER DEFS #
#################
my @file_list = ();
# The array of all characters in Esperanto which are to factor in alphabetic sorting.
my @zam = qw( / A B C Ĉ D E F G Ĝ H Ĥ I J Ĵ K L M N O P R S Ŝ T U Ŭ V Z a á b c ĉ d e é f g ĝ h ĥ i ï j ĵ k l m n o ó p r s ŝ t u ŭ ú v z );
# An equal-or-larger array corresponding to the above but in ASCII sorting order.
# Duplicates exist to make the sorting not case sensitive.
my @abc = qw( / 0 2 3 4 5 6 8 9 A B C D F G H I J K L N O P Q R S U V W 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z );
my %key_ct = {};
# Find that char in @abc holding same position as a given Unicode char present in @zam.
sub zam_abc {
my ($a) = @_;
my $i = 0;
for my $z ( @zam ) {
last if $a eq $z;
++$i;
}
return $abc[$i];
}
# Translate an Eo word into sortable gibberish that works with Perl.
sub sortable_key {
my $a = shift;
my $z = '';
for my $x (split //, $a) {
$z .= zam_abc($x);
}
if ( exists $key_ct{$a} ) {
$key_ct{$a} += 1; # Inc tally.
} else {
$key_ct{$a} = 0; # Init new tally.
};
return $z . sprintf("_%02d", $key_ct{$a});
}
# Print to file a list of overlapping, plural keys.
sub list_plural_keys {
open my $fh_out_2, '>', "$dir_output/keys_plural.txt" or die $!;
for my $key (@keys) {
if ($key_ct{$key} > 0) {
print $fh_out_2 $key . " = $key_ct{$key}\n";
}
}
}
# Parse out input vortaro for definitions.
sub parse_file {
my ($fh_in, $fh_out_1) = @_;
my %def_links;
my @def_keys;
while (<$fh_in>)
{
next unless $_ =~ m/b/;
my $def_line = my $def_word = $_;
my $def_word = get_word($def_line);
my $def_key = sortable_key($def_word);
$def_links{$def_key} = $def_line;
push @def_keys, $def_key;
}
@def_keys = sort @def_keys;
for (@def_keys) {
print $fh_out_1 "\n" . $def_links{$_};
}
}
# Extract the defined word for use as a key.
sub get_word {
my $str = shift;
# $str =~ s/ //;
$str =~ s/$regex_1//;
$str =~ s/$regex_2//;
return $str;
}
chdir cwd();
# Find and process all vortaro files.
opendir $dh, $dir_input or die "Oops! Cannot open $dir_input directory.\n";
my @dir_list = readdir $dh;
closedir $dh;
for (@dir_list) {
next unless $_ =~ /$regex_0\.txt$/;
next if $_ =~ /^new_/;
push @file_list, $_;
}
# Merge plural vortaroj into single, interleaved vortaro.
# Merged vortaro may contain duplicate entries.
# Gives second output file listing those duplicate entries.
for my $file_in (@file_list) {
my ($fh_in, $fh_out_1);
if (open $fh_in, '<:encoding(UTF-8)', $file_in) {
my $file_out = 'new_' . $file_in;
if (open $fh_out_1, '>', $file_out) {
} else {
print "Oops! Can't write to '$file_out'.\n";
}
print "Busy parsing '$file_in' ... \n";
parse_file($fh_in, $fh_out_1);
print "Output = '$file_out' \n\n";
close $fh_in;
close $fh_out_1;
} else {
print "Oops! Can't read from '$file_in'.\n";
}
list_plural_keys($fh_out_2);
print "All done.\n";
}
__END__
RegEx Puzzle Area
Mazirien la Magiisto
skarlat/o Brilega sangoruĝa koloro.