in reply to Re^3: Regex Parsing Style
in thread Regex Parsing Style
This is still a paired-down version of my actual script, but it more accurately represents what I'm really doing: counting characters.
use strict; use warnings; my %CONTROL_CODE = ( '\t' => 0x09, '\n' => 0x0a, '\f' => 0x0c, '\r' => 0x0d, ); my %character_count_by; while (<>) { chomp; pos = 0; TOKEN: while (1) { # Literal character if (m/\G ([^\\]) /gcx) { $character_count_by{ord $1}++; next TOKEN; } # Universal Character Name if (m/\G \\u([0-9a-f]{4}) /gcx) { $character_count_by{hex $1}++; next TOKEN; } # Literal character escape sequence if (m/\G \\(["^\\]) /gcx) { $character_count_by{ord $1}++; next TOKEN; } # Control code escape sequence if (m/\G (\\[tnfr]) /gcx) { $character_count_by{$CONTROL_CODE{$1}}++; next TOKEN; } # End of string if (m/\G \z /gcx) { last TOKEN; } # Invalid character die "Invalid character on line $. of file $ARGV\n"; } } for my $code (sort { $a <=> $b } keys %character_count_by) { printf "U+%04x\t%d\n", $code, $character_count_by{$code}; }
UPDATE: Changed \Z to \z and updated error message of event that can never happen.
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^5: Regex Parsing Style
by ikegami (Patriarch) on Nov 26, 2010 at 06:50 UTC | |
by Jim (Curate) on Nov 26, 2010 at 15:57 UTC | |
by ikegami (Patriarch) on Nov 26, 2010 at 17:15 UTC | |
by Jim (Curate) on Nov 26, 2010 at 17:53 UTC |