Re: Parsing SExpressions (lisp)

You could use Data::SExpressions and learning Lisp lingo is definitely worth the effort if you plan on doing a lot of parsing. However, I think your first intuitions a while back about using classes was probably right. Here's a fairly simple parser, (in under 300 lines), that should do the basic extraction of commands and binding them to classes that can execute the command, tell you what line number it was found on, how deeply nested it is. And of course you could walk the nested commands do further analysis

I've done a "fill in the blank here" example below. The code works but does pointlss things when it executes the script (basically just reports on the command and its arguments). Please feel free to ask questions. The point is to understand it and be able to adapt it do your needs, but using recursion and classes is easier to show than talk about so here it is:

use strict;
use warnings;

package ScriptParser;  #don't pollute global namespace
use Carp;

# atom token is any run that doesn't include whitespace
# opening or closing parenthesis - YMMV depending on syntax
# details
my $RE_TOKEN_ATOM = qr/[^\(\)\s]+/o;

#==================================================================
# Base class for all commands
#==================================================================

{
  package AnyCmd;

  my $CLASS = __PACKAGE__;

  sub new {
    my ($sClass, $sCmd, $iLine, $iNested) = @_;
    #Note: depth and line is for convenience in formatting and
    #printing error messages
    my $self = {cmd => $sCmd
                , depth => $iNested
                , line => $iLine
                , args => []
               };
    return bless($self, $sClass);
  }

  sub getArgs {
    my $self = shift;  #my $self=$_[0] also OK here
    return $self->{args};
  }

  sub getCmd {
    my $self = shift;  #my $self=$_[0] also OK here
    return $self->{cmd};
  }

  sub getDepth {
    my $self = shift;  #my $self=$_[0] also OK here
    return $self->{depth};
  }

  sub getLine {
    my $self = shift;  #my $self=$_[0] also OK here
    return $self->{line};
  }

  sub pushArg {
    my ($self, $xArg) = @_;
    push @{$self->getArgs()}, $xArg;
  }

  sub resolveArgs {
    my $self = shift;
    my $aArgs = $self->getArgs();

    #an argument can be either a simple value or a variable whose
    #value needs to be looked up or another command. What we want
    #is the value after all of that is done.

    my $aResolved = [];
    foreach my $sArg (@$aArgs) {
      # assume anything that is a subclass of this class is a
      # command to be executed or a variable to look up.
      if (ref($sArg) && $sArg->isa($CLASS)) {
        $sArg = $sArg->execute();
      }
      push @$aResolved, $sArg;
    }
    return $aResolved;
  }

  sub execute {
    my $self = shift;  #my $self=$_[0] also OK here
    my $sCmd = $self->getCmd();
    my $aResolved = $self->resolveArgs();
    return $self->_p_doExecute($sCmd, $aResolved);
  }

  #-----------------------------------------------------------
  # this method is meant to be overridden by each subclass
  # _p_ is just my funky convention for marking protected
  # member that aren't meant to be accessed by any sort of
  # public API, just subclasses

  sub _p_doExecute {
    my ($self, $sCmd, $aResolved) = @_;

    #implementation goes here. each subclass has a different
    #implementation, but here is a dummy one:

    #delimiting everything in case one of the args begins or ends
    #with whitespace.  I'm assuming that '(' is not part of any
    #value, but if there is some other delimiter use that instead
    #mainly just wanted to avoid messing escapes or quoting inside
    #each arg.
    my $sArgList = scalar(@$aResolved)
      ? '(' . join(') (', map { defined($_)?$_:'undef' } @$aResolved) 
+. ')'
      : '';
    print STDERR "Executing command: $sCmd($sArgList)\n";
    return "$sCmd executed";
  }

} #end AnyCmd package

#==================================================================
# Actual Cmd definitions here
#==================================================================

# HERE you would define subclasses for predefined commands
# e.g. global, sleep, camera control

{
  package GlobalCmd;
  our @ISA=qw(AnyCmd);

  sub _p_doExecute {
    my ($self, $sCmd, $aResolved) = @_;
    my ($sName, $sType, $sValue) = @$aResolved;
    printf STDERR "Declaring variable: name=<%s>, type=<%s>, value=<%s
+>\n"
      , map {defined($_) ? $_ : 'undef' } ($sName, $sType, $sValue);
  }
}

{
  package SleepCmd;
  our @ISA=qw(AnyCmd);

  sub _p_doExecute {
    my ($self, $sCmd, $aResolved) = @_;
    printf STDERR "Sleeping for %d seconds\n", $aResolved->[0];
  }
}

{
  package UserDefinedCmd;
  our @ISA=qw(AnyCmd);
}


#----------------------------------------------------------

# HERE you would fill up the hash with defined commands.
# the hash makes it easy to check which commands are
# recognized and which are not. Also it maps the command
# to the class with the proper implementation of ->execute()

my %DEFINED_CMDS = ( global => 'GlobalCmd'
                     , sleep => 'SleepCmd'
                   );
my $USER_DEFINED_CMD_CLASS='UserDefinedCmd';

#==================================================================
# And now for the parsing and execution....
#==================================================================

#NOTE: this is written to read from <DATA> but in real code
# you would probably want to pass a file handle to 
# parseScript() and even to runScript() as well.

sub runScript {
  my $aCmds = parseScript();
  if (! defined($aCmds)) {
    carp("Couldn't execute script: too many errors!!!");
    return 0;
  }

  foreach my $oCmd (@$aCmds) {
    $oCmd->execute();
  }
}


#----------------------------------------------------------

sub parseScript {
  my $aTopLevelCmds=[];
  my $aNested=[];
  my $iLine=0;
  my $iErrors=0;

 LINE:
  while (my $line = <DATA>) {
    chomp $line;
    $iLine++;

    # the o flag at the end tells Perl that $RE_TOKEN_ATOM will
    # not change so once its parsed and compiled it, it can
    # just reuse the result.  DO NOT use if you have a variable
    # that changes value in your regexp

    # NOTE: this loop assumes each atom must be on a single line
    # i.e.  nothing like  "sle\nep" in the script.

    while ($line =~ /\s*(;)           # $1 = start of comment
                     |\s*(\()?\s*     # $2 = start of nested cmd
                     ($RE_TOKEN_ATOM) # $3 = capture the token
                     |(\)\s*)         # $4 = end of nested cmd
                     /gox) {

      # if $1 (comment prefix) is defined, ditch this token and the
      # rest of the line: its just comments;
      next LINE if (defined($1));

      # not a comment - process $2=begin, $3=atom, $4=close
      $iErrors += processTokens($iLine, $2,$3,$4,$aTopLevelCmds, $aNes
+ted);

    } #end parsing of line
  } #end reading input


  my $iIncomplete = scalar(@$aNested);
  if ($iIncomplete) {
    $iErrors += $iIncomplete;
    # this is why we stored the line number with each command: it help
+s with
    # debugging problems with the script

    carp(
         "Unmatched opening parenthesis!!!! for commands starting at l
+ines:"
         . join(',', (map { $_->getLine() } @$aNested))
         . "\n"
        );
  }
  return $iErrors > 0 ? undef : $aTopLevelCmds;
}

#----------------------------------------------------------
# handle one token retrieved by $line

sub processTokens {
  my ($iLine, $bStart, $sAtom, $bEnd, $aTopLevelCmds, $aNested) = @_;
  my $iNested = scalar(@$aNested);
  my $iError = 0;

  #if ($sAtom) {  #whoops this will miss "0"
  if (defined($sAtom)) {
    if ($bStart) {
      # start a new command
      my $sCmdClass = $DEFINED_CMDS{$sAtom};
      $sCmdClass = $USER_DEFINED_CMD_CLASS unless defined($sCmdClass);

      my $oCmd = $sCmdClass->new($sAtom, $iLine, $iNested);
      push @$aNested, $oCmd;

    } elsif (scalar(@$aNested)) {
      # add an arg to the current command
      my $oCurrentCmd = $aNested->[-1];
        $oCurrentCmd->pushArg($sAtom);
    } else {
      $iError=1;
      carp("We've got a problem: atom <$sAtom> outside "
           ."parenthesis!!! in line $iLine\n");
    }

  } elsif ($bEnd) {
    # complete a command
    if ($iNested) {
      #we're done with the current command
      my $oCompletedCmd = pop @$aNested;
      if ($iNested == 1) {
        #top level command, add it to the list
        push @$aTopLevelCmds, $oCompletedCmd;
      } else {
        #add it to the argument list of the command
        #surrounding it
        my $oCurrentCmd = $aNested->[-1];
        $oCurrentCmd->pushArg($oCompletedCmd);
      }
    } else {
      $iError = 1;
      carp("We've got a problem: unmatched closing parenthesis "
           ."in line $iLine\n");
    }
  }
  return $iError;
}

#----------------------------------------------------------

runScript();

# Data embedded in a file - a nice trick for quick and dirty testing
# replace stuff below __DATA__ with the script you want to parse.

__DATA__
;;GAME GLOBALS------------------------------------------------

(global short rounds_completed 0)
(global short lives 999)
(global trigger_volume battlefield_volume deathbystupidness_1)
(global short restart_battlefield 0)
(global boolean life_titles true)
(global short destroyall 0)
[download]

Update: added content to __DATA__ section and fixed line reading if ($sAtom) to if (defined($sAtom))

Comment on Re: Parsing SExpressions (lisp) Select or Download Code