#!/usr/bin/perl # Program reads in any and all files named "Vortaro_.*" from the current directory. # It assumes each file to contain one or more jumbled dictionaries in Esperanto from # some EPub at Eldonejo Mistera Sturno. For each file read in, a new file "new_Vortaro_.*\.txt" # is written, no longer jumbled but fully alphabetized. It may, however, have some duplicates. # This is because the word was defined plurally in the jumble. A list of these duplicates will # be listed in a file "keys_plural.txt" as a guide to manual sorting out. use utf8; use open qw(:std :utf8); use Cwd; ################### # BEGIN USER DEFS # ################### # Where things are. my $dir_input = './'; my $dir_output = './'; # Regular Expression variables my $regex_0 = qr(Vortaro_.*); # Input files to parse my $regex_1 = qr(

); # Left boundary of definition word. my $regex_2 = qr(<.*); # Right boundary of definition word. ################# # END USER DEFS # ################# my @file_list = (); # The array of all characters in Esperanto which are to factor in alphabetic sorting. my @zam = qw( / A B C Ĉ D E F G Ĝ H Ĥ I J Ĵ K L M N O P R S Ŝ T U Ŭ V Z a á b c ĉ d e é f g ĝ h ĥ i ï j ĵ k l m n o ó p r s ŝ t u ŭ ú v z ); # An equal-or-larger array corresponding to the above but in ASCII sorting order. # Duplicates exist to make the sorting not case sensitive. my @abc = qw( / 0 2 3 4 5 6 8 9 A B C D F G H I J K L N O P Q R S U V W 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z ); my %key_ct = {}; # Find that char in @abc holding same position as a given Unicode char present in @zam. sub zam_abc { my ($a) = @_; my $i = 0; for my $z ( @zam ) { last if $a eq $z; ++$i; } return $abc[$i]; } # Translate an Eo word into sortable gibberish that works with Perl. sub sortable_key { my $a = shift; my $z = ''; for my $x (split //, $a) { $z .= zam_abc($x); } if ( exists $key_ct{$a} ) { $key_ct{$a} += 1; # Inc tally. } else { $key_ct{$a} = 0; # Init new tally. }; return $z . sprintf("_%02d", $key_ct{$a}); } # Print to file a list of overlapping, plural keys. sub list_plural_keys { open my $fh_out_2, '>', "$dir_output/keys_plural.txt" or die $!; for my $key (@keys) { if ($key_ct{$key} > 0) { print $fh_out_2 $key . " = $key_ct{$key}\n"; } } } # Parse out input vortaro for definitions. sub parse_file { my ($fh_in, $fh_out_1) = @_; my %def_links; my @def_keys; while (<$fh_in>) { next unless $_ =~ m/b/; my $def_line = my $def_word = $_; my $def_word = get_word($def_line); my $def_key = sortable_key($def_word); $def_links{$def_key} = $def_line; push @def_keys, $def_key; } @def_keys = sort @def_keys; for (@def_keys) { print $fh_out_1 "\n" . $def_links{$_}; } } # Extract the defined word for use as a key. sub get_word { my $str = shift; # $str =~ s/

//; $str =~ s/$regex_1//; $str =~ s/$regex_2//; return $str; } chdir cwd(); # Find and process all vortaro files. opendir $dh, $dir_input or die "Oops! Cannot open $dir_input directory.\n"; my @dir_list = readdir $dh; closedir $dh; for (@dir_list) { next unless $_ =~ /$regex_0\.txt$/; next if $_ =~ /^new_/; push @file_list, $_; } # Merge plural vortaroj into single, interleaved vortaro. # Merged vortaro may contain duplicate entries. # Gives second output file listing those duplicate entries. for my $file_in (@file_list) { my ($fh_in, $fh_out_1); if (open $fh_in, '<:encoding(UTF-8)', $file_in) { my $file_out = 'new_' . $file_in; if (open $fh_out_1, '>', $file_out) { } else { print "Oops! Can't write to '$file_out'.\n"; } print "Busy parsing '$file_in' ... \n"; parse_file($fh_in, $fh_out_1); print "Output = '$file_out' \n\n"; close $fh_in; close $fh_out_1; } else { print "Oops! Can't read from '$file_in'.\n"; } list_plural_keys($fh_out_2); print "All done.\n"; } __END__ RegEx Puzzle Area Mazirien la Magiisto

skarlat/o Brilega sangoruĝa koloro.

Domo de'l Se
delekt/i (tr) Tre plezurigi.