This is still a paired-down version of my actual script, but it more accurately represents what I'm really doing: counting characters.
use strict;
use warnings;
my %CONTROL_CODE = (
'\t' => 0x09,
'\n' => 0x0a,
'\f' => 0x0c,
'\r' => 0x0d,
);
my %character_count_by;
while (<>) {
chomp;
pos = 0;
TOKEN:
while (1) {
# Literal character
if (m/\G ([^\\]) /gcx) {
$character_count_by{ord $1}++;
next TOKEN;
}
# Universal Character Name
if (m/\G \\u([0-9a-f]{4}) /gcx) {
$character_count_by{hex $1}++;
next TOKEN;
}
# Literal character escape sequence
if (m/\G \\(["^\\]) /gcx) {
$character_count_by{ord $1}++;
next TOKEN;
}
# Control code escape sequence
if (m/\G (\\[tnfr]) /gcx) {
$character_count_by{$CONTROL_CODE{$1}}++;
next TOKEN;
}
# End of string
if (m/\G \z /gcx) {
last TOKEN;
}
# Invalid character
die "Invalid character on line $. of file $ARGV\n";
}
}
for my $code (sort { $a <=> $b } keys %character_count_by) {
printf "U+%04x\t%d\n", $code, $character_count_by{$code};
}
UPDATE: Changed \Z to \z and updated error message of event that can never happen. |