comment on

So folks,

today I need to make some sense of C++ files. I will need to parse out function signatures, and I have tried this with regexes before; it gets messy, especially around templates. Now a similar requirement has reared its head, so step one: lex the code. Without further ado, here is my attempt at lexing C++. The lexer is called with an open file handle to a C++ source file. This is lexed into an array of tokens that is then handed on to the parser.

What do you think: is this going to give me a nice labeled stream and make parsing a dream, or am I stumbling into known gotchas? Does it qualify as cool?

# Lex($input)
#
# Tokenise a C++ source file into a flat list of [ $label, $text ] pairs and
# hand the finished list to Parser() once the whole file has been read.
#
# Parameters:
#   $input - an open, readable file handle positioned at the start of the
#            C++ source.
#
# Returns:
#   A reference to the token array (the same reference given to Parser()).
#
# Token labels produced: 'Comment', 'Directive', 'reserved', 'Number',
# 'Identifier', 'dblColon', plus the names in %Character for single
# punctuation characters.
#
# Relies on two names that are not defined in this sub: the $debug flag
# (enables trace output on STDOUT) and the Parser() routine.
# NOTE(review): both are assumed to be declared elsewhere in the file.
#
# Known limitations (unchanged from the original design):
#   * Lexing is line based; /* ... */ block comments are not recognised.
#   * String and character literals are not recognised; their contents are
#     lexed like ordinary code, so a '//' inside a string is taken as the
#     start of a comment.
#   * A Comment/Directive token is pushed before the other tokens of the
#     same line, even when the comment trails the code.
sub Lex {
    my $input = shift; # Get file handle to a C++ file
    my @tokens;        # This will contain the tokenised file ready for our parser

    # Patterns that are cut out of the raw line before it is split into
    # tokens. Tried in order; each one removes at most one match per line.
    my @longPatterns = (
        ['Comment'   => qr|//.*|           ],
        ['Directive' => qr|^\s*#define.*|  ],
        ['Directive' => qr|^\s*#elif.*|    ],
        ['Directive' => qr|^\s*#else.*|    ],
        ['Directive' => qr|^\s*#error.*|   ],
        ['Directive' => qr|^\s*#endif.*|   ],
        ['Directive' => qr|^\s*#if.*|      ],
        ['Directive' => qr|^\s*#ifdef.*|   ],
        ['Directive' => qr|^\s*#ifndef.*|  ],
        ['Directive' => qr|^\s*#include.*| ],
        ['Directive' => qr|^\s*#line.*|    ],
        ['Directive' => qr|^\s*#undef.*|   ],
        ['Directive' => qr|^\s*#pragma.*|  ],
    );

    # Reserved words as a lookup table, so each token costs one hash probe
    # instead of a scan over the whole list.
    my %reserved = map { $_ => 1 } qw(
        alignas alignof and and_eq asm atomic_cancel atomic_commit
        atomic_noexcept auto bitand bitor
        bool break case catch char char16_t char32_t class compl
        concept const constexpr const_cast
        continue co_await co_return co_yield decltype default delete
        do double dynamic_cast else
        enum explicit export extern false float for friend goto if
        import inline int long module
        mutable namespace new noexcept not not_eq nullptr operator or
        or_eq private protected public
        register reinterpret_cast requires return short signed sizeof
        static static_assert
        static_cast struct switch synchronized template this
        thread_local throw true try typedef
        typeid typename union unsigned using virtual void volatile
        wchar_t while xor xor_eq
    );

    # Whole-token patterns. Both are anchored so that only a token that is
    # entirely a word can match. Anything beginning with a digit is a
    # numeric literal (C++ identifiers cannot start with a digit), which
    # also covers forms such as 0x1F or 10u.
    my @patterns = (
        ['Number'     => qr/\A\d[.\w]*\z/ ],
        ['Identifier' => qr/\A\w+\z/      ],
    );

    my %Character = ( # Single characters by name
        '(' => 'LeftParen',
        ')' => 'RightParen',
        '[' => 'LeftSquare',
        ']' => 'RightSquare',
        '{' => 'LeftCurly',
        '}' => 'RightCurly',
        '<' => 'LessThan',
        '>' => 'GreaterThan',
        '=' => 'Equal',
        '+' => 'Plus',
        '-' => 'Minus',
        '*' => 'Asterisk',
        '/' => 'Slash',
        '#' => 'Hash',
        '.' => 'Dot',
        ',' => 'Comma',
        ':' => 'Colon',
        ';' => 'Semicolon',
        "'" => 'SingleQuote',
        '"' => 'DoubleQuote',
        '|' => 'Pipe',
        '&' => 'Ampersand',
        '~' => 'Tilde',
        '!' => 'Exclamation',
        '%' => 'Percent',
        '^' => 'Caret',
        '?' => 'Question',
    );

    while (my $line = <$input>) {
        chomp $line;
        $line =~ s/\r\z//; # tolerate CRLF line endings

        # Some patterns are evaluated on (and removed from) the entire line.
        for my $patt (@longPatterns) {
            if ($line =~ s|($patt->[1])||) {
                my $token = $1;
                print "$patt->[0]\t$token\n" if $debug;
                push @tokens, [$patt->[0], $token];
            }
        }
        print "got> $line\n" if $debug and $line =~ /\S/;

        # Splitting on \b yields alternating word runs and non-word runs
        # (punctuation mixed with whitespace).
        TOKEN: for my $token (split /\b/, $line) {
            $token =~ s/^\s+|\s+$//g;        # Strip whitespace
            next TOKEN unless length $token; # length, so a literal "0" survives
            print "Lexing $token\n" if $debug;

            if ($reserved{$token}) { # A C++ reserved word
                print "reserved\t$token\n" if $debug;
                push @tokens, ['reserved', $token];
                next TOKEN;
            }

            for my $pat (@patterns) { # Whole-word patterns next
                if ($token =~ $pat->[1]) {
                    print "$pat->[0]\t$token\n" if $debug;
                    push @tokens, [$pat->[0], $token];
                    next TOKEN;
                }
            }

            # What is left is a run of punctuation, possibly with embedded
            # whitespace (e.g. ") {" or ">::"). Walk it left to right:
            # an isolated "::" becomes one dblColon token, whitespace is
            # skipped, and everything else is looked up character by
            # character.
            pos($token) = 0;
            while ($token =~ /\G(?:((?<!:)::(?!:))|(\s+)|(.))/gs) {
                my ($scope, $space, $char) = ($1, $2, $3);
                next if defined $space;
                if (defined $scope) {
                    print "dblColon\t$scope\n" if $debug;
                    push @tokens, ['dblColon', $scope];
                    next;
                }
                print "Lexing by character $char\n" if $debug;
                if (exists $Character{$char}) {
                    print "$Character{$char}\t$char\n" if $debug;
                    push @tokens, [$Character{$char}, $char];
                }
                else {
                    print "Failed to match $char\n";
                }
            }
        }
    }

    # Hand the complete token stream to the parser exactly once, after the
    # whole file has been lexed.
    Parser(\@tokens);
    return \@tokens;
}
[download]

Cheers,
R.

Pereant, qui ante nos nostra dixerunt!

Update

More compiler directives added

In reply to Lexing C++ by Random_Walk

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&amp;`
	<		`&lt;`
	>		`&gt;`
	[		`&#91;`
	]		`&#93;`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.