my %scripts;
foreach (@lines) {
my $script =
m/^\p{Script=Latin}*$/ ? "Latin" :
m/^\p{Script=Cyrillic}*$/ ? "Cyrillic" :
m/^\p{Script=Han}*$/ ? "Han" :
# ...
"(unknown)";
$scripts{$script}++;
}
####
my %scripts;
LINE: foreach my $line (@lines) {
foreach my $script (@known_scripts) {
next unless $line =~ m/^\p{Script=$script}*$/;
$scripts{$script}++;
next LINE;
}
$scripts{'(unknown)'}++;
}
####
my @known_scripts = (
"Arabic", "Armenian", "Avestan",
"Balinese", "Bamum", "Batak", "Bengali", "Bopomofo", "Brahmi", "Braille",
"Buginese", "Buhid",
"Canadian_Aboriginal", "Carian", "Chakma", "Cham", "Cherokee",
"Coptic", "Cuneiform", "Cypriot", "Cyrillic",
# ...
);
####
Can't find Unicode property definition "Script=Chakma" at (...) line (...)
####
$ for i in 8 10 12 14 16 18 20; do perl scripts.pl scripts-5$i.txt >5$i.lst; done
$ for i in 8 10 12 14 16 18; do diff --unchanged-line-format= --new-line-format=%L 5$i.lst 5$((i+2)).lst >5$((i+2)).new; done
$
####
#!/usr/bin/perl
use strict;
use warnings;
use feature qw/say/;
my %scripts;
while(<>) {
next unless m/; ([A-Za-z_]*) #/;
$scripts{$1}++;
}
$, = "\n";
say sort { $a cmp $b } map { $_ = ucfirst lc; $_ =~ s/(?<=_)(.)/uc $1/ge; qq/"$_"/ } keys %scripts;
####
# New Unicode scripts added in Perl 5.xx
my %uniscripts = (
'8' => [
"Arabic", "Armenian", "Bengali", "Bopomofo", "Buhid",
"Canadian_Aboriginal", "Cherokee", "Cyrillic", "Deseret",
"Devanagari", "Ethiopic", "Georgian", "Gothic", "Greek", "Gujarati",
"Gurmukhi", "Han", "Hangul", "Hanunoo", "Hebrew", "Hiragana",
"Inherited", "Kannada", "Katakana", "Khmer", "Lao", "Latin",
"Malayalam", "Mongolian", "Myanmar", "Ogham", "Old_Italic", "Oriya",
"Runic", "Sinhala", "Syriac", "Tagalog", "Tagbanwa", "Tamil",
"Telugu", "Thaana", "Thai", "Tibetan", "Yi"
],
'10' => [
"Balinese", "Braille", "Buginese", "Common", "Coptic", "Cuneiform",
"Cypriot", "Glagolitic", "Kharoshthi", "Limbu", "Linear_B",
"New_Tai_Lue", "Nko", "Old_Persian", "Osmanya", "Phags_Pa",
"Phoenician", "Shavian", "Syloti_Nagri", "Tai_Le", "Tifinagh",
"Ugaritic"
],
'12' => [
"Avestan", "Bamum", "Carian", "Cham", "Egyptian_Hieroglyphs",
"Imperial_Aramaic", "Inscriptional_Pahlavi",
"Inscriptional_Parthian", "Javanese", "Kaithi", "Kayah_Li",
"Lepcha", "Lisu", "Lycian", "Lydian", "Meetei_Mayek", "Ol_Chiki",
"Old_South_Arabian", "Old_Turkic", "Rejang", "Samaritan",
"Saurashtra", "Sundanese", "Tai_Tham", "Tai_Viet", "Vai"
],
'14' => [
"Batak", "Brahmi", "Mandaic"
],
'16' => [
"Chakma", "Meroitic_Cursive", "Meroitic_Hieroglyphs", "Miao",
"Sharada", "Sora_Sompeng", "Takri"
],
'18' => [
],
'20' => [
],
);
(my $ver = $^V) =~ s/^v5\.(\d+)\.\d+$/$1/;
my @known_scripts;
foreach (keys %uniscripts) {
next if $ver < $_;
push @known_scripts, @{ $uniscripts{$_} };
}
print STDERR "Running on Perl $^V, ", scalar @known_scripts, " scripts known.\n";
####
# tentative!
'22' => [
"Bassa_Vah", "Caucasian_Albanian", "Duployan", "Elbasan", "Grantha",
"Khojki", "Khudawadi", "Linear_A", "Mahajani", "Manichaean",
"Mende_Kikakui", "Modi", "Mro", "Nabataean", "Old_North_Arabian",
"Old_Permic", "Pahawh_Hmong", "Palmyrene", "Pau_Cin_Hau",
"Psalter_Pahlavi", "Siddham", "Tirhuta", "Warang_Citi"
],
####
print "Found " . scalar keys(%scripts) . " scripts:\n";
print "\t$_: " , $scripts{$_}, " line(s)\n" foreach(sort { $a cmp $b } keys %scripts);
####
Running on Perl v5.14.2, 95 scripts known.
Found 18 scripts:
Arabic: 21 line(s)
Bengali: 2 line(s)
Cyrillic: 12 line(s)
Devanagari: 3 line(s)
Georgian: 1 line(s)
Greek: 1 line(s)
Gujarati: 1 line(s)
Gurmukhi: 1 line(s)
Han: 29 line(s)
Hangul: 3 line(s)
Hebrew: 1 line(s)
Hiragana: 1 line(s)
Katakana: 1 line(s)
Latin: 647 line(s)
Sinhala: 1 line(s)
Tamil: 4 line(s)
Telugu: 1 line(s)
Thai: 1 line(s)