use strict; use warnings; my %CONTROL_CODE = ( '\t' => 0x09, '\n' => 0x0a, '\f' => 0x0c, '\r' => 0x0d, ); my %character_count_by; while (<>) { chomp; pos = 0; TOKEN: while (1) { # Literal character if (m/\G ([^\\]) /gcx) { $character_count_by{ord $1}++; next TOKEN; } # Universal Character Name if (m/\G \\u([0-9a-f]{4}) /gcx) { $character_count_by{hex $1}++; next TOKEN; } # Literal character escape sequence if (m/\G \\(["^\\]) /gcx) { $character_count_by{ord $1}++; next TOKEN; } # Control code escape sequence if (m/\G (\\[tnfr]) /gcx) { $character_count_by{$CONTROL_CODE{$1}}++; next TOKEN; } # End of string if (m/\G \z /gcx) { last TOKEN; } # Invalid character die "Invalid character on line $. of file $ARGV\n"; } } for my $code (sort { $a <=> $b } keys %character_count_by) { printf "U+%04x\t%d\n", $code, $character_count_by{$code}; }