I started this work based on that regexp; it is completely untested. Like that regexp, my code handles C (not C++), so some changes would be required to support C++. Hopefully I covered all the different types of escape sequences, but I didn't check any documentation.
# User callback invoked by the lexer for every token.
#
#   $type  - token category as a string (e.g. 'comment', 'string', 'char')
#   $value - token text
#   $pos   - byte offset at which the token starts
#
# Comment tokens are skipped; any other token whose text contains "byFoo"
# is reported on STDOUT.
sub handle_token {
    my ($type, $value, $pos) = @_;

    # Same short-circuit order as two separate guards: type first, then text.
    my $wanted = $type ne 'comment' && $value =~ /byFoo/;
    return unless $wanted;

    return print("Found byFoo in $type starting at byte $pos\n");
}
# ^ User code
# v Lexer code
# Single-letter C escape sequences mapped to the character they denote.
# Escapes that stand for themselves (\\, \", \', \?) are not listed; they
# are handled by the fall-through case in _decode_escape.
my %str_escapes = (
    a => chr(0x07),
    b => chr(0x08),
    t => chr(0x09),
    n => chr(0x0A),
    v => chr(0x0B),
    f => chr(0x0C),
    r => chr(0x0D),
);

# Decode one escape sequence whose leading backslash has already been
# consumed. Exactly one of $oct, $hex, $char is defined.
#
#   $oct  - one to three octal digits
#   $hex  - one or more hex digits (the "x" already stripped)
#   $char - any other single character following the backslash
#   $pos  - position of the enclosing literal, used in error messages
#
# Returns the decoded character. Dies if a numeric escape exceeds 255.
sub _decode_escape {
    my ($oct, $hex, $char, $pos) = @_;

    if (defined $oct) {
        my $code = oct($oct);
        die("Bad input: Octal escape sequence too big in string at pos $pos\n")
            if $code > 255;
        return chr($code);
    }

    if (defined $hex) {
        # Judge the size by value, not by digit count: leading zeros are
        # harmless, and stripping them first also avoids numeric overflow
        # in hex() on absurdly long digit runs.
        (my $digits = $hex) =~ s/\A0+(?=[0-9a-fA-F])//;
        die("Bad input: Hex escape sequence too big in string at pos $pos\n")
            if length($digits) > 2;
        return chr(hex($digits));
    }

    return $str_escapes{$char} if exists $str_escapes{$char};

    warn("Unrecognized escape sequence \"\\$char\"\n")
        if $char =~ /\A[a-z]\z/;

    # Unknown letters and self-escaping punctuation yield the character itself.
    return $char;
}

# Replace every backslash escape in the body of a C string or character
# literal with the character it denotes.
#
#   $s   - literal body, without the surrounding quotes
#   $pos - position of the literal, used only in error messages
#
# Returns the decoded text. Dies on numeric escapes above 255 and warns on
# unrecognized lowercase-letter escapes.
#
# The text is decoded in one left-to-right pass so that an escaped backslash
# followed by a letter (\\n) is not mistaken for a letter escape, which a
# sequence of independent substitutions would get wrong.
sub handle_char_escapes {
    my ($s, $pos) = @_;

    $s =~ s{
        \\
        (?: ([0-7]{1,3})        # octal: at most three digits, as in C
          | x ([0-9a-fA-F]+)    # hex: consumes every following hex digit
          | (.)                 # anything else (/s lets this match a newline)
        )
    }{
        my ($oct, $hex, $char) = ($1, $2, $3);
        _decode_escape($oct, $hex, $char, $pos)
    }xsge;

    return $s;
}
# Forward a block comment to handle_token as a 'comment' token.
#
#   $raw_comment - comment text including its two-character opening and
#                  closing delimiters
#   $pos         - offset at which the comment starts
sub handle_comment {
    my ($raw_comment, $pos) = @_;

    # Drop the first two and last two characters (the delimiters).
    my $body = substr($raw_comment, 2, -2);

    return handle_token('comment', $body, $pos);
}
# Forward a string literal to handle_token as a 'string' token.
#
#   $raw_string - literal text including the surrounding quote characters
#   $pos        - offset at which the literal starts
#
# The quotes are removed and the remainder is passed through
# handle_char_escapes before being handed on.
sub handle_string {
    my ($raw_string, $pos) = @_;

    my $inner   = substr($raw_string, 1, -1);
    my $decoded = handle_char_escapes($inner, $pos);

    return handle_token('string', $decoded, $pos);
}
# Forward a character literal to handle_token as a 'char' token.
#
#   $raw_char - literal text including the surrounding quote characters
#   $pos      - offset at which the literal starts
#
# The quotes are removed and the remainder is passed through
# handle_char_escapes before being handed on.
sub handle_char {
    my ($raw_char, $pos) = @_;

    my $inner   = substr($raw_char, 1, -1);
    my $decoded = handle_char_escapes($inner, $pos);

    return handle_token('char', $decoded, $pos);
}
# Forward a run of ordinary source text (anything that is not a comment,
# string literal or character literal) to handle_token as a 'code' token.
#
#   $raw_code - the source text of the run
#   $pos      - offset at which the run starts
#
# The token type must be 'code', not 'comment': handle_token discards
# comment tokens, so mislabelling code would hide it from the user callback.
sub handle_code {
    my ($raw_code, $pos) = @_;
    return handle_token('code', $raw_code, $pos);
}
# Split C source text into comments, string literals, character literals and
# runs of other code, calling the matching handle_* routine for each piece
# with the raw text and its starting offset.
#
#   $_[0] - the source text; it is scanned in place using its pos()
#
# Only /* ... */ comments are recognised. Dies with "Bad input" when the
# remaining text matches no token, e.g. an unterminated literal.
sub lex {
    for ($_[0]) {
        # Patterns use m{} because a "/" inside a character class would
        # otherwise end a /.../ pattern early. /c keeps pos() on failure so
        # the next alternative is tried from the same offset.

        # Block comment.
        if (m{ \G ( /\*[^*]*\*+(?:[^/*][^*]*\*+)*/ ) }xgc) {
            my ($text, $start) = ($1, $-[0]);
            handle_comment($text, $start);
            redo;
        }

        # String literal: any number of escapes or ordinary characters.
        # /s lets "\\." cover a backslash followed by a newline.
        if (m{ \G ( "(?:\\.|[^"\\])*" ) }xsgc) {
            my ($text, $start) = ($1, $-[0]);
            handle_string($text, $start);
            redo;
        }

        # Character literal.
        if (m{ \G ( '(?:\\.|[^'\\])*' ) }xsgc) {
            my ($text, $start) = ($1, $-[0]);
            handle_char($text, $start);
            redo;
        }

        # Other code: one character of any kind (so a lone "/" or "\" makes
        # progress), then everything up to the next character that could
        # start a comment, a literal or an escape. /s lets the first
        # character be a newline.
        if (m{ \G ( .[^/"'\\]* ) }xsgc) {
            my ($text, $start) = ($1, $-[0]);
            handle_code($text, $start);
            redo;
        }

        # End of input. /x is required so the spaces in the pattern are
        # not matched literally.
        last if m{ \G \z }x;

        die("Bad input\n");
    }
}
# ^ Lexer code
# v User code
# Run the lexer over the C source text; every token it finds is passed on
# to handle_token via the handle_* routines.
# NOTE(review): $c_source is not assigned anywhere in the visible code --
# it must be populated with the source text before this call.
lex($c_source);
|