use strict;
use warnings;
use English qw( -no_match_vars );

# Reads lines from the files named on the command line (or standard input)
# and prints one "U+xxxx" line per decoded token.  A token is either a plain
# character or one of the backslash escape sequences described by
# $TOKEN_PATTERN below.
#
# NOTE(review): plain characters are converted with ord(), so the value
# printed is whatever unit the input handle delivers.  No I/O layer is set
# here, so non-ASCII input is presumably seen byte by byte -- confirm the
# intended input encoding.

# One alternative per token kind.  Exactly one capture group is defined
# after each successful match, which is what the decoder dispatches on.
# Alternative 5 guarantees that a backslash which does not start a valid
# escape is still consumed as a token of its own instead of being skipped
# silently by the regex engine.
my $TOKEN_PATTERN = qr{
      ([^\\])                 # 1 Literal character (g)
    | \\u([0-9a-fA-F]{4})     # 2 Universal Character Name, hex in either case
    | \\(["^\\])              # 3 Literal character escape sequence (\")
    | \\([tnfr])              # 4 Control code escape sequence (\n)
    | (\\)                    # 5 Stray backslash, not a valid escape
}x;

# Code points for the single-letter control escapes (alternative 4).
my %CONTROL_CODE = (
    t => 0x09,
    n => 0x0a,
    f => 0x0c,
    r => 0x0d,
);

# _decode_line($line)
#
# Tokenizes $line with $TOKEN_PATTERN and returns the list of numeric code
# points, in input order.  A backslash that does not begin a recognized
# escape produces a warning and contributes no code point; the characters
# that follow it are then decoded as ordinary tokens.
sub _decode_line {
    my ($line) = @_;

    my @codes;
    while ($line =~ m/$TOKEN_PATTERN/g) {
        # $LAST_PAREN_MATCH ($+) is the text of the one group that matched.
        my $token = $LAST_PAREN_MATCH;

        if (defined $5) {
            # pos() is just past the backslash, i.e. its 1-based column.
            # No trailing newline: Perl appends the script location and
            # the current input line number to the message.
            warn sprintf 'Invalid escape sequence at column %d',
                pos $line;
            next;
        }

        push @codes,
              defined $1 ? ord $token
            : defined $2 ? hex $token
            : defined $3 ? ord $token
            :              $CONTROL_CODE{$token};
    }

    return @codes;
}

while (my $line = <>) {
    chomp $line;
    printf "U+%04x\n", $_ for _decode_line($line);
}