in reply to Parsing Emacs Lisp sexpr?
> I guess whipping up a new parser is not hard...
Not as easy as it seems. I probably started from the wrong end: the logic in the collapse and tuple actions should be handled by the grammar itself. But hey, it's slow, but it works for the input.
#! /usr/bin/perl
use warnings;
use strict;
use Marpa::R2;
use open ':std', ':encoding(UTF-8)';
# Marpa SLIF grammar source for the package-list input. It is handed to
# Marpa::R2::Scanless::G->new further down in this script.
#
# The heredoc is single-quoted, so Perl interpolates nothing inside it.
#
# Structure described by the rules:
#   * PackageList is the literal "(1", one or more Package, then ")".
#   * Package is "(" Name ". [" Date Deps Desc Format Detail "])" and is built
#     by the `assemble` action.
#   * Deps and Detail may each be the atom `nil` instead of a List.
#   * The actions named here (assemble, merge, dot, nil, concat, tuple,
#     collapse, unescape) are the Perl subs defined after the heredoc.
#
# NOTE(review): Quoteds has no empty alternative, so an empty string literal
# ("") presumably does not parse -- confirm against real input.
my $dsl = << '__DSL__';
:default ::= action => ::first
lexeme default = latm => 1
PackageList ::= ('(1') Packages (')')
Package ::= ('(') Name ('. [') Date Deps Desc Format Detail ('])')
action => assemble
Packages ::= Package Packages action => merge
| Package action => [values]
Date ::= List
Deps ::= List
| Nil
Desc ::= String
Format ::= Name
Detail ::= List
| Nil
Dot ::= dot action => dot
Nil ::= 'nil' action => nil
String ::= ('"') Quoteds ('"')
Quoteds ::= Quoted Quoteds action => concat
| Quoted
Keyed ::= (':') Name Elements action => tuple
List ::= ('(') Elements (')') action => collapse
Elements ::= Element Elements action => merge
| Element action => [values]
Element ::= number
|| Nil
| String
| Keyed
| List
| Dot
|| Name
Name ::= id_chars
Quoted ::= no_qq
| escaped_qq action => unescape
:discard ~ whitespace
whitespace ~ [\s]+
id_chars ~ [-+@\w]+
no_qq ~ [^"]
escaped_qq ~ '\"'
number ~ [0-9]+
dot ~ '.'
__DSL__
# Action for the Nil rule: the Lisp atom `nil` becomes Perl's undef.
sub nil {
    return undef;
}
# Action for the Dot rule: a dot is represented as a reference to the string
# '.', so later actions can recognise it with ref(...) eq 'SCALAR'.
sub dot {
    return \'.';
}
# Action for an escaped_qq lexeme: drops the first character of the matched
# text (the backslash) and returns the rest.
# The first argument is ignored; the lexeme text is the second.
sub unescape {
    my (undef, $lexeme) = @_;
    return substr($lexeme, 1);
}
# Action for Quoteds: joins one piece of string content with the remainder.
# The first argument is ignored; the two pieces are the second and third.
sub concat {
    my (undef, $left, $right) = @_;
    return $left . $right;
}
# Action for the right-recursive list rules (Packages, Elements): prepends the
# head item to the already-built array of remaining items and returns a new
# array reference.
# The first argument is ignored; head and tail are the second and third.
sub merge {
    my (undef, $head, $tail) = @_;
    my @items = ($head, @{$tail});
    return \@items;
}
# Action for the Package rule: turns the parsed pieces of one package entry
# into a hash reference.
#
# Arguments (the first one is ignored):
#   name, date, dependencies, description, format, detail
#
# `detail` is either undef (the grammar allows `nil` there) or an array
# reference whose items each hold a [ key => value ] pair in their first
# slot; those pairs are flattened into the `details` hash.
#
# Returns a hash ref with the keys name, date, dependencies, description,
# format and details.
sub assemble {
    my (undef, $name, $date, $deps, $desc, $format, $detail) = @_;

    # A `nil` Detail arrives as undef. Default to an empty list explicitly
    # instead of dereferencing undef, which only works through implicit
    # autovivification and dies outright on a read-only undef.
    my %details = map( @{ $_->[0] }, @{ $detail // [] } );

    return {
        name         => $name,
        date         => $date,
        dependencies => $deps,
        description  => $desc,
        format       => $format,
        details      => \%details,
    };
}
# Action for the List rule. A three-element list whose middle element is the
# dot marker (a SCALAR reference) is a dotted pair "(key . value)" and becomes
# a one-entry hash reference; an undefined key is stored as the empty string.
# Any other list is returned unchanged as its array reference.
# The first argument is ignored; the list's elements are the second.
sub collapse {
    my (undef, $elements) = @_;

    my $is_dotted_pair = @{$elements} == 3
                      && ref($elements->[1]) eq 'SCALAR';
    return $elements unless $is_dotted_pair;

    my ($key, undef, $value) = @{$elements};
    return { ($key // "") => $value };
}
# Action for the Keyed rule (":name element..."): returns a two-item array
# reference [ name => value ].
#
# Arguments (the first one is ignored):
#   name     - the key that followed the colon
#   elements - array ref of the elements that followed the key
#
# The value depends on where the dot marker (a SCALAR reference) appears:
#   :name . x      -> x
#   :name a . b    -> { a => b }   (an undefined `a` is stored as "")
#   anything else  -> the elements array ref itself
#
# The posted code had a forum line-wrap marker in the middle of `$_[2][2]`;
# the expression is rejoined here.
sub tuple {
    my (undef, $name, $elements) = @_;

    my $value
        = ref($elements->[0]) eq 'SCALAR' ? $elements->[1]
        : ref($elements->[1]) eq 'SCALAR' ? { ($elements->[0] // "") => $elements->[2] }
        :                                   $elements;

    return [ $name => $value ];
}
# Driver: compile the grammar, parse the whole input, dump every package.
# The posted code was broken across lines by forum line-wrap markers; the
# statements are rejoined here.
my $grammar = 'Marpa::R2::Scanless::G'->new({ source => \$dsl });
my $recce   = 'Marpa::R2::Scanless::R'->new({
    grammar           => $grammar,
    semantics_package => 'main',    # action names resolve to subs in main::
});

# Slurp everything from the files named on the command line, or from STDIN.
my $input = do { local $/; <> };
defined $input or die "No input to parse\n";

$recce->read(\$input) or die "Marpa read nothing from the input\n";

# value() yields undef when there is no parse; check before dereferencing so
# the failure is reported clearly instead of as a strict-refs error.
my $value_ref = $recce->value;
defined $value_ref or die "Input was read, but no parse was found\n";
my $value = ${$value_ref};

use Data::Dumper;
print Dumper($_) for @$value;
map{substr$_->[0],$_->[1]||0,1}[\*||{},3],[[]],[ref qr-1,-,-1],[{}],[sub{}^*ARGV,3]
Re^2: Parsing Emacs Lisp sexpr?
by perlancar (Hermit) on Apr 09, 2020 at 03:00 UTC
|
Nice work! I wonder why you opt to parse this format specifically instead of the generic lisp format though.
As for the speed, it's actually rather on-par with Data::SExpression, which uses Parse::Yapp. I commented out the dumping and then:
% time perl 11115197.pl archive-contents
real 0m7.449s
user 0m7.036s
sys 0m0.413s
% time perl -MFile::Slurper=read_text -MData::SExpression -E'$ds=Data::SExpression->new; ($sexp, $text) = $ds->read(read_text "archive-contents.2");'
real 0m5.411s
user 0m5.386s
sys 0m0.025s
archive-contents.2 is just the original file with [ ] replaced with ( ), and then the problematic @ atom replaced by "@".
Perl regex or Regexp::Grammars will probably be several times faster.
| [reply] |
|
> I wonder why you opt to parse this format specifically instead of the generic lisp format though.
As I said, I started from a wrong end. I'm kind of busy working from home and staying there with a wife and three children, so I didn't have time to fix it immediately. Here's a much simpler and faster version, which parses melpa's archive-contents in less than 5 seconds on my machine:
#! /usr/bin/perl
use warnings;
use strict;
use Marpa::R2;
# Marpa SLIF grammar source for a generic s-expression reader. It is handed
# to Marpa::R2::Scanless::G->new further down in this script.
#
# The heredoc is single-quoted, so Perl interpolates nothing inside it.
#
# Structure described by the rules:
#   * The top-level rule is a parenthesised List of Elements.
#   * An Element is a List, a bracketed Vector, an Atom, a String or a
#     dotted Pair ("a . b", built by the `pair` action).
#   * String content is assembled from Quoted pieces by the `concat` action.
#
# NOTE(review): `Element+` requires at least one element and Quoteds has no
# empty alternative, so "()" and "" presumably do not parse -- confirm.
# NOTE(review): no action strips the backslash from a `qq` match, so escaped
# quotes appear to stay escaped in the result -- confirm.
my $dsl = << '__DSL__';
:default ::= action => ::first
lexeme default = latm => 1
List ::= ('(') Elements (')')
Elements ::= Element+ action => [values]
Element ::= List
| Vector
| Atom
| String
| Pair
Vector ::= ('[') Elements (']')
Atom ::= identifier
String ::= ('"') Quoteds ('"')
Quoteds ::= Quoteds Quoted action => concat
| Quoted
Quoted ::= backslash
|| qq
|| plain
Pair ::= Element (dot) Element action => pair
:discard ~ whitespace
whitespace ~ [\s]+
dot ~ '.'
backslash ~ '\\'
qq ~ '\"'
identifier ~ [-\w@:+]+
plain ~ [^\\"]+
__DSL__
# Action for Quoteds: appends the next piece of string content to what has
# been collected so far.
# The first argument is ignored; the two pieces are the second and third.
sub concat {
    my (undef, $collected, $piece) = @_;
    return $collected . $piece;
}
# Action for Pair ("key . value"): returns a one-entry hash reference mapping
# the left element to the right element.
# The first argument is ignored; key and value are the second and third.
sub pair {
    my (undef, $key, $value) = @_;
    return { $key => $value };
}
# Driver: compile the grammar, parse the whole input in one call, and dump
# the result. The posted code had a forum line-wrap marker that left a stray
# "+;" on its own line; the statement is rejoined here.
my $grammar = 'Marpa::R2::Scanless::G'->new({ source => \$dsl });

# Slurp everything from the files named on the command line, or from STDIN.
my $lisp = do { local $/; <> };

# Action names resolve to subs in main::.
my $value_ref = $grammar->parse(\$lisp, { semantics_package => 'main' });

use Data::Dumper;
print Dumper $value_ref;
map{substr$_->[0],$_->[1]||0,1}[\*||{},3],[[]],[ref qr-1,-,-1],[{}],[sub{}^*ARGV,3]
| [reply] [d/l] [select] |
|
Thanks for this, choroba. It finishes in about 2 seconds on my computer, pretty impressive. I'll see what I can use to improve my SExpression::Decode::Marpa.
| [reply] |
|
% time perl -Ilib -MSExpression::Decode::Marpa=from_sexp -MFile::Slurper=read_text -E'from_sexp(read_text "archive-contents")'
real 0m4.023s
user 0m3.818s
sys 0m0.204s
| [reply] |
|
Anyhow, I tried hacking a regex-based parser here. It's "working", with some problems: 1) a segmentation fault for larger data, indicating a leak somewhere; 2) a parsing failure when e.g. the NUMBER rule fails to match and it matches ATOM instead, e.g. in this sexp: (1a), which fails, but (1) and (a) succeed.
| [reply] |
|
|