castaway has asked for the wisdom of the Perl Monks concerning the following question:
.. and into hashes..
I wonder if any of the regexp gurus has a better idea how to do this than I do. I've looked at Text::ParseWords (and String ::ParseWords), and I don't think they'll be very helpfull, which is why I've tried it 'by hand' with a recursive function.
Input text is something like this:
({ 1, 2, "three", 0, ({ "internal", "array", 0, }), "end", })
And I've cooked up:
sub lpcarray_to_array
{
# Convert an lpc_array to a perl array
# Parameter: Object, LPCArray, Pointer to answer
my ($me, $larray, $ansp) = @_;
my @result = ();
print "lpcarray_to_array: $larray\n";
if($larray =~ /\({\s*(.*)/)
{
print "lpcarray_to_array: $1\n";
my @items = split(/,\s*/, $1);
while (@items)
{
$i = shift(@items);
if($i =~ /^\({(.*)/)
{
# Another array
my $passthrough = '';
unshift(@items, $1);
my $parsearray = '({' . join(', ', @items);
my @inarray = $me->lpcarray_to_array($parsearray,
\$passthrough);
if(defined($passthrough))
{
print "Got: $passthrough\n";
@items = split(/, \s*/, $passthrough);
}
$result[scalar(@result)] = \@inarray;
}
elsif($i =~/^\(\[(.*)/)
{
# A mapping
my $passthrough = '';
unshift(@items, $1);
my $parsehash = '([' . join(', ', @items);
my %inmapping = $me->lpcmapping_to_hash($parsehash,
\$passthrough)
+;
if(defined($passthrough))
{
@items = split(/, \s*/, $passthrough);
}
$result[scalar(@result)] = \%inmapping;
}
elsif($i =~ /\"(.*)\"/)
{
# A string
$result[scalar(@result)] = $1;
}
elsif($i =~ /(\d+)/)
{
# A number
$result[scalar(@result)] = $1;
}
elsif($i =~ /\s*(.*)\s*}\)/)
{
# end of the array
$result[scalar(@result)] = $1;
${$ansp} = join(', ', @items);
return @result;
}
else
{
# Something we don't recognise
# Muss belong to the other parser
print "Don't know $i in lpcarray\n";
${$ansp} = $i . ', ' . join(', ', @items);
}
}
}
${$ansp} = undef;
return @result;
}
The code for lpcmapping_to_hash looks similar, mappings look like this:
([ "key1": "value", "key2": ({ "value", "array", }) ])
and can also be included as part of an array.
I could also do it with Parse::RecDescent, but I think thats overdoing it a bit ;)
Update: Example was missing a ',', changed handling of $passthrough and the array-end '})'
Thanks to bart for pointing that out.
C.
•Re: Parsing Text into Arrays..
by merlyn (Sage) on Jan 20, 2003 at 14:57 UTC
|
Here's my take on it, just for comparison:
#!/usr/bin/perl
use warnings;
use strict;
$|++;
use Data::Dumper;
print Dumper parse_string(join "", <DATA>);
sub parse_string {
local $_ = shift;
parse_expression();
}
sub parse_expression {
parse_list() || parse_hash() || parse_item();
}
sub parse_list {
my $pos = pos;
{
last unless parse_left_paren();
last unless my $list = parse_comma_items();
last unless parse_right_paren();
return $list; # already a list reference
};
pos($pos);
return undef;
}
sub parse_left_paren {
/\G\s*\(/gc ? 1 : undef;
}
sub parse_comma_items {
my @result;
{
last unless my $item = parse_expression();
$item = $$item if ref $item eq "SCALAR";
push @result, $item;
redo if parse_comma();
}
return \@result;
}
sub parse_comma {
/\G\s*,/gc ? 1 : undef;
}
sub parse_right_paren {
/\G\s*\)/gc ? 1 : undef;
}
sub parse_hash {
my $pos = pos;
{
last unless parse_left_curly();
last unless my $list = parse_comma_items();
last unless parse_right_curly();
## list to hash sanity checks
## even length?
last unless @$list % 2 == 0;
## no reference keys?
last if grep { ref $list->[$_] } map { $_ * 2 } 0 .. $#$list / 2;
## ok, let it go
return {@$list}; # convert listref to hashref
};
pos($pos);
return undef;
}
sub parse_left_curly {
/\G\s*\{/gc ? 1 : undef;
}
sub parse_right_curly {
/\G\s*\}/gc ? 1 : undef;
}
sub parse_item {
parse_number() || parse_quoted_string();
}
sub parse_number {
/\G\s*(\d+)/gc ? \"$1" : undef;
}
sub parse_quoted_string {
/\G\s*\"([^\"]*)\"/gc ? \"$1" : undef;
}
__DATA__
{ "fred", { 2, { "barney", { 4 , 5}, 6, (7, 8)}}}
-- Randal L. Schwartz, Perl hacker
Be sure to read my standard disclaimer if this is a reply. | [reply] [d/l] |
|
Hmm, not to complain, but, where did the double brackets go?
{ "fred", { 2, { "barney", { 4 , 5}, 6, (7, 8)}}}
should be
({ "fred", ({ 2, ({ "barney", ({ 4 , 5}), 6, ({7, 8}))})})})
or something like that..
C.
| [reply] [d/l] |
|
| [reply] |
|
Re: Parsing Text into Arrays..
by Gilimanjaro (Hermit) on Jan 20, 2003 at 13:36 UTC
|
This may be a slightly evil solution, especially if you don't have a lot of control over where your input comes from, but I like to let perl do the work:
sub lpcarray_to_array {
my $lpc = shift;
$lpc =~ s/\(\{/\[/g;
$lpc =~ s/\}\)/\]/g;
return eval $lpc;
}
I must admit I haven't taken the time to examine your code (boss skulking around), but the layout of your lpc-array looked eerily like a plain perl anonymous array, except for brace-style. Maybe my function combined with some input validation can give you an elegant solution... | [reply] [d/l] |
|
Unfortunately that won't work if any string holds "({" or "})". You need a slightly more sophisticated pattern. Below is one way of writing it.
my $quote = qr/"[^"]*"/; # Naive
s{\G((?>.*?(?:$quote|(?=(\(\{|\}\)))))*?)\2}
{$1 . ($2 eq '(\{' ? '[' : ']')}eg;
You didn't have it in your pattern, and my pattern is based on your, but you should replace "})" with "]," if it shall become a list.
ihb | [reply] [d/l] [select] |
|
Wow, nice idea.. It's certainly short and to the point :)
C.
| [reply] |
|
| [reply] |
|
Re: Parsing Text into Arrays..
by Gilimanjaro (Hermit) on Jan 20, 2003 at 13:46 UTC
|
After some thought, I came up with this not recursive but iterative version, which seems to work... It uses the m//gc and \G regex functionality to walk through the source string... I like this solution, and it should be easily adjusted to use any delimeter...
#!/usr/bin/perl -w
use strict;
($\,$,)=("\n","\t");
use Data::Dumper;
my $string =
'({ 1, 2, "three", 0, ({ "internal", "array", 0, }) "end", })';
my @array = parse($string);
print Dumper \@array;
sub parse {
my $source = shift;
my @result = ();
my @stack = (\@result);
{
if($source=~/\G[\s,]*/gc) # skip whitespace
{ redo; }
if($source=~/\G\(\{/gc) # start of a new part
{ push @stack,[]; redo; }
my $part;
if($source=~/\G\}\)/gc) # this part is done
{ $part = pop @stack; }
elsif ($source=~/\G"(.*?)"/gc) # quoted string
{ $part = $1 }
elsif ($source=~/\G(\d+)/gc) # unquoted decimal
{ $part=$1; }
elsif (pos($source)==length($source)) # done
{ last; }
else { # something alien
die "I don't get it at '".
substr($source,pos($source))."'"
}
my $target = $stack[$#stack]; # where to add part
push @{$target},$part; # add it!
redo;
}
return @result;
}
Update: JamesNC is da man... And I have thick fingers | [reply] [d/l] |
|
typo at line:37
is-> push @($target},$part; # add it!
needs a left curly-fry..mmm
push @{$target},$part; # add it!
| [reply] [d/l] [select] |
Re: Parsing Text into Arrays..
by gjb (Vicar) on Jan 20, 2003 at 09:40 UTC
|
| [reply] |
|
Hmm, thanks for the ideas. Both seem to look for and return complete delimited strings. I was looking for one where I can specify start, end, and between-delims, and get back an object/array with the result nested if necessary..
(I could use them to extract parts, but then thats nearly the same as doing it by hand..)
C.
| [reply] |
Re: Parsing Text into Arrays..
by I0 (Priest) on Jan 20, 2003 at 14:48 UTC
|
use strict;
use Data::Dumper;
sub lpcarray_to_array{
my ($larray)=@_;
my @result=();
my $re = $larray;
$re =~ s/((\([[{])|([]}]\))|.)/${[')','']}[!$3]\Q$1\E${['(','']}[!
+$2]/gs;
$re = join'|',map quotemeta,eval{$larray=~/$re/};
die $@ if $@;
while( $larray=~/(\(\{($re)\}\))|(\(\[($re)\]\))|("([^"]*)")|(\d+)
+/g ){
if( $1 ){
push @result,[lpcarray_to_array($2)];
}elsif( $3 ){
push @result,{lpcarray_to_array($4)};
}elsif( $5 ){
push @result,$6;
}else{
push @result,$7;
}
}
return @result;
}
print Dumper lpcarray_to_array qq(({ 1, 2, "three", 0, ({ "internal",
+"array", 0, }) "end", }));
print Dumper lpcarray_to_array qq(([ "key1": "value", "key2": ({ "valu
+e", "array", }) ]));
| [reply] [d/l] |
|
Is this using the extensions in 5.6 that allow recursion in regex's?
| [reply] |
|
| [reply] |
Re: Parsing Text into Arrays..
by PodMaster (Abbot) on Jan 20, 2003 at 12:47 UTC
|
Text::ParseWords - parse text into an array of tokens or array of arrays *cough*
Parse::Tokens is worth checking out (kinda like HTML::Parser).
String::Tokeniser might be interesting, but doesn't look it to me ;)
MJD says you
can't just make shit up and expect the computer to know what you mean, retardo!
** The Third rule of perl club is a statement of fact: pod is sexy.
|
| [reply] |
Re: Parsing Text into Arrays..
by BrowserUk (Patriarch) on Jan 20, 2003 at 16:39 UTC
|
Apart from that I reversed the sense of your delimiters so
([...]) was for arrays and ({...}) for hashes, which for some reason seemed more natural to me--it's a 1 char change to put it back I think--this appears to handle most stuff I gave it. This includes the structure delimiters embedded with quoted strings but not "'s embedded in "'s. I haven't worked out how to do that (easily) yet.
Sample output
========================================
From ;'([ 1, 2, "([", 0, ([ "internal", "})", 0,({ "key1": "value", "k
+ey2": ([ "value", "array", ]), }), }), "end", ])'
$VAR1 = [
'1',
'2',
'([',
'0',
[
'internal',
'})',
'0',
{
'key1' => 'value',
'key2' => [
'value',
'array'
]
}
],
'end'
];
The code
Interesting problem, almost as if it was designed to be complex:)
Examine what is said, not who speaks.
The 7th Rule of perl club is -- pearl clubs are easily damaged. Use a diamond club instead. | [reply] [d/l] [select] |
Re: Parsing Text into Arrays..
by demerphq (Chancellor) on Jan 20, 2003 at 22:29 UTC
|
Any time you think to yourself "thats too complicated for a regex, and its too complicated for Text::Balanced so I guess I need to write a parser" you should reach for the handy dandy TheDamian approved Parse::RecDescent.
Heres your problem solved using that module. And with an extra thrown in, the grammer supports a {{ hash_key=>({"value"}) }} syntax as well just to show how easy it is. (If you dont like the hash functionality you can remove it easily. :-)
Incidentally it seems like you are using perl like syntax so that ({,,,,1,,,,}) parses in the same way that [,,,,1,,,,] would parse in perl. I took this a step further and gave it the same behaviour as perl, at least as far as its handling of barewords and the phat comma => go. This slightly complicated matters but not too much.
Anyway, what you haven't really said (or I didnt notice :-) is what happens if you try to parse "". This code will fail (return undef) in that case.
UPDATE: Sigh. Like a muppet I didnt pay attention to the fact that you _have_ provided a hash notation specification. Well, heres a version that handles that. Ive kept my original perl version though. Sorry about that. Anyway, heres a version that parsers AFAICT what you want. Notice it isnt all that different from the other version. (Ive readmore'd my original stuff)
use strict;
use warnings;
use Parse::RecDescent;
use Data::Dumper;
$::RD_WARN = 0 || undef; # unless undefined, also report non-fa
+tal problems
$::RD_HINT = 0 || undef; # if defined, also suggestion remedies
$::RD_TRACE = 0 || undef; # if defined trace the parse
$::RD_AUTOACTION ='$item[1]';
my $parser = Parse::RecDescent->new(<<'ENDGRAMMAR');
# BEGIN GRAMMAR
# we want to parse things like...
# ({ 1, 2, "three", 0, ({ "internal", "array", 0, }), "end", })
# Base Rule (start-rule)
expr : value /\z/
# Recursion point
value : array
| hash
| string
| number
# list of items. returns an arrayref
# we grep out the undefs so ",,1,," is treated as one element and not
# three or five or any other number :-)
array : "({" <leftop: val_or_empty ',' val_or_empty > "})"
{
[ grep !UNIVERSAL::isa($_,"Value::Empty"),@{$item[2]} ]
}
hash : "([" <leftop: keyvalue ',' keyvalue > "])"
{
$return={};
!UNIVERSAL::isa($_,"Value::Empty") and
($return->{$_->[0]}=$_->[1]) foreach @{$item[2]};
}
keyvalue : string_or_number ":" value
{
[ @item[1,3] ]
}
| empty
string_or_number : string
| number
val_or_empty : value
| empty
empty : ""
{
bless \do{my $x},"Value::Empty"
}
# quoted escaped string. escaping reversed and quotes removed.
string : /"((?:[^"\\]+|\\"|\\)+)"/
{
my $ret=$1;
$ret=~s/\\"/"/g;
$ret;
}
# number. could be a better regex
number : /\d+/
ENDGRAMMAR
my $tests=<<'ENDTEST';
({ 1, 2, "three", 0, ({ "internal", "array", 0, }), "end", })
1
"This is \"quoted\" dude"
({})
({({}),({})})
({"wow",({1}),"bob",({1}),"cool"})
(["cool":1,"subhash":(["b":"c",1:2]),"a":({1,2,3,([]),}),])
ENDTEST
foreach my $test (split /\n/,$tests) {
print "------------------\n\n";
my $value=$parser->expr($test);
print "'$test'\n";
if ( defined $value ) {
print "\nproduced the following structure:\n\n";
print Data::Dumper->new([$value])
->Terse(1)
->Indent(1)
->Dump(),"\n";
} else {
print "Failed to parse!\n";
}
}
Incidentally the secret to easily parsing stuff like this using P::RD is to learn how to use <leftop:> properly.
Personally since I learned of and about Parse::RecDescent I would never write a parser in perl without it. (Well, maybe with another Parse:: module instead...)
HTH, and thanks for the post. I found this a fun use of P::RD. To be honest I rewrote it a couple of times as I played with different ways to do this (and extend it). Even still assuming you grok <leftop:> I think a beginner with P::RD could pull off your original requirements in pretty good time. I say this only to reduce the chance that you get intimdated by P::RD's rather long (for good reason) documentation. Its actually a lot easier than it seems at first glance most times.
NOTE TO THE EXPERIENCED P::RD USER How do you return undef from an action without causing the rule to fail? I dont see any way to do it, which means to me that some kind of unwrapping potentially becomes necessary after the parse. Any ideas?
Cheers,
--- demerphq
my friends call me, usually because I'm late....
| [reply] [d/l] [select] |
|
Hi demerphq,
Thanks for the P::RD grammar and the thoughts. If you read my original post to the end, you'd see I did indeed think about using P::RD, but came to the conclusion that thats a lot of overhead for something that can also be done in a couple of subs.
I needed a language parser recently, found P::RD, was impressed, and used it. The docu sure is long, but well-written and easy to understand (in my opinion), I haven't used <leftop> yet, but I managed my task without it, and I guess I'd get this one too if I tried, if not as terse as your soloution :)
Like I mentioned above, this is a real-protocol used to communicate between Muds and a communications server/router. Info here Intermud3 spec if anyones interested. So I'd hope it won't throw empty elements at me, but you never know. :)
As to returning undef without causing the rule to fail, wasn't that just a case of doing:
rule : production { $return = undef; 1; }
?
Or am I understanding you wrong?
I think I'll go benchmark these solutions.. Thanks everyone for the answers!
C. | [reply] [d/l] |
|
({ 1, 2, "three", 0, ({ "internal", "array", 0, }), "end", })
^ ^
| |
Here and Here
If you read my original post to the end, you'd see I did indeed think about using P::RD
I did read it. I just didn't comprehend it properly. I think its cause my laptop has a small screen and after a while of using it I stop focussing properly. I need to get a monitor at home. Anyway yesterday I seemed to overlook a lot of things. Sorry about that. :-)
rule : production { $return = undef; 1; }
? Or am I understanding you wrong?
Actually I don't think that works. (At least I tried it and it did the below, but considering my other oversights of yesterday I make no strong claims :-) It seems to be semantically equivelent to
rule : production { $return = 1; }
As for leftop it just means you dont have to do tortured things to avoid left recursion which as far as I can tell is involved in this grammar. (I could be wrong).
Anyway, it was fun. Cheers. Oh when you benchmark dont forget failure cases as well as success. :-)
--- demerphq
my friends call me, usually because I'm late....
| [reply] [d/l] [select] |
Re: Parsing Text into Arrays..
by castaway (Parson) on Jan 21, 2003 at 16:02 UTC
|
First test results:
Testdata:
({"startup-req-3",5,"PerlMud-i3",0,"*gcs",0,0,0,0,0,0,0,"PerlLib v1","
+PerlLib v1","Perl v5.6.1","Perl","restricted access","blah@blah.de",(
+["tell":1,"ucache":1,]),0,})
IO's code produces [], merlyns 'undef' <- (I tried to change the parens to ([ ]) etc. here), Gilimanjaros gives up on the hash, demerphqs parser is created before the benchmark runs.
So I guess the timings are a little skewed anyway ;)
Benchmark:
running
IO, castaway, demerhpq, merlyn, browserUK
, each for at least 5 CPU seconds
...
IO: 1 wallclock secs ( 1.09 usr + 0.01 sys = 1.10 CPU) @ 37.27/s (n
+=41)
browserUK: 1 wallclock secs ( 1.04 usr + 0.01 sys = 1.05 CPU) @ 113
+.33/s (n=119)
castaway: 1 wallclock secs ( 1.07 usr + 0.00 sys = 1.07 CPU) @ 103.
+74/s (n=111)
demerhpq: 1 wallclock secs ( 1.07 usr + 0.00 sys = 1.07 CPU) @ 3.7
+4/s (n=4)
merlyn: 1 wallclock secs ( 1.04 usr + 0.01 sys = 1.05 CPU) @ 187.62
+/s (n=197)
Rate demerhpq IO castaway browserUK merlyn
demerhpq 3.74/s -- -90% -96% -97% -98%
IO 37.3/s 897% -- -64% -67% -80%
castaway 104/s 2675% 178% -- -8% -45%
browserUK 113/s 2932% 204% 9% -- -40%
merlyn 188/s 4919% 403% 81% 66% --
Benchmark called like this:
cmpthese(-5, {
# Gilimanjaro => sub {my @arr = parse($testarray1)},
castaway => sub {my @arr = lpcarray_to_array( $test
+array1 ); print Dumper(@arr)},
IO => sub{my @arr = lpcarray_to_arrayIO(
+$testarray1 ); print Dumper(@arr)},
merlyn => sub{my @arr = parse_string( $testarra
+y1); print Dumper(@arr)},
demerhpq => sub{my $arr = $parser->expr($testarray1)
+; print Dumper($arr)},;
browserUK => sub{my $arr = build_structure($testarray
+1); print Dumper($arr)}});
More to come when I figure out how to get the rest running properly ;)
Update: Hmm, that seems to be junk, since cmpthese isn't reading $testarray1 (which is declared as 'my').. *sigh*
Update 2: Real results, I hope, thanks BrowserUK!
my script results (not a full time webserver)
C. | [reply] [d/l] [select] |
|
FWIW: Here's a cleaned up version of mine, which runs a bit quicker as a consequence. There were several stupidities in the previous version.
Updated Code updated to corrected a major flaw exposed by better testing. Slowed it down a bit, bit not so very much.
our $re_terminate = qr[ \s* (?: : | , | $ ) ]x;
our $re_content = qr[ \s* \( ( [\[\{] ) (.+) [\]\}]\) $re_terminate
+ ]x;
our $re_nested = qr[ \( .+ \) ]x;
our $re_serial = qr[ ( \( .+? \) ) $re_terminate ]x;
our $re_qq = qr' " ( .+? ) (?!<\\)" 'x;
our $re_csv = qr[ \G \s* (?: $re_qq | ( $re_nested ) | ( [^\s:,
+]+? ) ) $re_terminate ]x;
sub build_structure{
warn( "Data didn't match spec.", $_[0] )
, return undef
unless $_[0] =~ m[$re_content];
my ($reftype, $content, $chunk, @bits) = ($1, $2);
my $p = pos($content);
while( $content =~ m[$re_csv]gc) {
$chunk = $1||$3||$2||'0';
if ($chunk =~ m[^ \( [^\(]+? \) ]x) {
pos($content) = $p;
$content =~ m[$re_serial]gc;
$p = pos($content);
$chunk = $1;
}
push @bits, $chunk =~ m[^\(]
? build_structure( $chunk )
: $chunk;
}
return $reftype eq '[' ? {@bits} : \@bits;
}
Examine what is said, not who speaks.
The 7th Rule of perl club is -- pearl clubs are easily damaged. Use a diamond club instead. | [reply] [d/l] |
|
Hmm. While I am not suprised about the P::RD solution that I wrote being the slowest, I would like to point a couple of things out. Most of these solutions are buggy or not appropriate for the data set you are using in some way or another (I can feel the venom in BrowserUks eyes already ;-)
And by buggy I mean the following: I dont believe that you can consider a parser correct if it accepts something other than the specification it was intended to parse. Ie, I consider a parser to work correctly if and only if it parses a language specification correctly and nothing more.
Anyway heres my analysis of the different solutions. Additional stuff is welcome.
So first off I would say that when you test code you should test both positive and negative cases. Second off I can't feel bad about having the slowest solution as it quite frankly is the only one that correctly parses or rejects the data it recieves.
To quote from Code Complete (28.2 Code Tuning)
A fast program is just as important as a correct one--False! It's hardly ever true that programs need to be fast or small before they need to be correct. Gerald Weinberg tells the story of a programmer who was flown to Detroit to help debug a program that was in trouble. The programmer worked with the team of programmers who had developed the program, and after several days he concluded that the situation was hopeless.
On the flight home, he mulled over the last few days and realized the true problem. By the end of the flight, he had outlined the necessary code. He tested it for several days and was about to return to Detroit when he got a telegram saying the project had been cancelled because the program was impossible to write. He headed back to Detroit anyway and convinced the executives that the project could be completed.
He then had to convince the project's original programmers. They listened to his presentation, and when he'd finished, the creator of the old system asked,
"And how long does your program take?"
"That varies, but about ten seconds per input."
"Aha! But my program takes only one second per input." The veteran leaned back, satisfied that he'd stumped the upstart programmer. The other programmers seemed to agree, but the new programmer wasn't intimidated.
"Yes, but your program doesn't work. If mine doesn't have to work, I can make it run instantly and take up no memory. "
(emphasis added by me.)
However I have no doubt that with a bit of hacking all of the people involved here can fix their stuff and still have it faster than the P::RD approach, but this is a good example of why proper testing of both positive and negative cases is essential. Its also a good example of why P::RD is a cool tool. You are much more likely to produce a correct (but slow) result with it than without it, and to do so in much less time than with another approach.
A last point about all of this. You are reciving packets over a network connection. So I'm guessing that just the network part takes much longer than even the slowest solution. Thus the network time is going to swamp the parse time by a great deal, and to me would suggest theres no point in optimizing this.
I think having a read of the code complete article is a worthy use of time.
Cheers,
--- demerphq
my friends call me, usually because I'm late....
| [reply] [d/l] [select] |
|
Seems you are antisipating a response from me, so here it is--sans venom (Good ol' Optrex :^).
I dont believe that you can consider a parser correct if it accepts something other than the specification it was intended to parse.
I agree.
However (there had to be an however :), given the specification was 2 simple examples of sample input, it's difficult to derive a complete specification or a full set of testcases. My original version already exceeded the specification in as much as it attempted to handle the case of delimiters embedded in quotes.
Whilst you can sit down and derive a set of possible testcases that it should handle, in the absence of an accurate, full specification, all you can do is guess. I also think that given the nature of PM, it falls on the OP to test and modify where necessary any solutions provided. If I were writing for production purposes, and being paid to do it, I would do considerably more testing, but then I would also demand more information up front regarding the nature of the task.
That in no way contradicts your completeness argument, but it does add a little context to the equation.
In a much later post, castaway posted a real sample of the (predefined to him/her) protocol:
({"startup-req-3",5,"PerlMud-i3",0,"*gcs",0,0,0,0,0,0,0,"PerlLib v1","PerlLib v1","Perl v5.6.1","Perl","restricted access","blah@blah.de",(["tell":1,"ucache":1,]),0,})
which appears to indicate that no spaces are included (except within double quotes), which contrasts with the earlier simple examples. If the real specification states that there will be no inter-term white space, then it would simplify the problem and probably speed up the processing.
A last point about all of this. You are reciving packets over a network connection. So I'm guessing that just the network part takes much longer than even the slowest solution. Thus the network time is going to swamp the parse time by a great deal, and to me would suggest theres no point in optimizing this.
True, speed isn't everything (or indeed anything unless the code 'works'), but I eshew the idea that taking longer than necessary is ok if the process is IO-bound anyway.
There are very few cases these days where a computer is either single-tasking, or doing only one task.
In the case of a MUD, whilst some part of the process is reading and passing the input from the network, concurrent with this, there is almost certainly another part of the program that is processing the input received and updating state as a concequence of it; Another part that is getting, validating and actioning user (keyboard/mouse) input. If it is a graphical MUD, then maintaining the screen is likely to require every spare cycle it can get and then some.
Even if this is a single threaded/tasking MUD that cannot itself do any concurrent processing, the user is quite likely to be running a browser, compiler, chat progs etc. simulataneuosly, and extra cycles used by the MUD spends parsing network input is detracting from them and anything else running. If the parser were being used at the server end for receiving information back from multiple clients on seperate threads, the performance would become critical.
Back to the Completeness argument.
Maybe the antithisis of the article you cite is the adage (possibly by one Paul Schindler, but I haven't confirmed this, nor looked to check his credentials) of "'The perfect', is the enemy of 'the good'".
Or as I seen it written "'Good enough' is good enough" and "OP = OP" (with apologies to my dutch friends if the latter is mis-interpreted).
In this case, if the server sent me a string of ({({}) or ({foo}) or ({})}), I would consider the output routine at the server broken rather than the input parser. Of course, corruption can occur during transmission, but I would expect that to be detected and correct by the transmission protocol rather than needing to account for the possibility in the parser.
Of course, if the protocol/parser involved was responsible for running a nuclear powerstation, a heart monitor, the space shuttle (or commercial aeroplane) or the stock market, I would feel differently about this, but we are talking about a MUD.
Finally, I am not at all sure that this is an 'answer' to your post exactly, nor even contradictory to it. It's more that there's more than one point of view and that neither is necessarially wrong. Sometimes one is more applicable to a given situation than another.
Examine what is said, not who speaks.
The 7th Rule of perl club is -- pearl clubs are easily damaged. Use a diamond club instead.
| [reply] [d/l] [select] |
|
|
|
Fun..
Ok, lets answer a few points. You are of course correct that a finished solution should cater for incorrect input and act accordingly. It's also useful that the P::RD version does this without much extra effort.
And its missing from my code, though adding it wont be too much hassle, I think :) I'll go look at the errors in mine, and implement a speed improvement that occured to me while getting up (all the best ideas appear in the waking up phase, somehow..)
The second point is, that in order for incorrect data to be 'correctly' handled, one also has to define what should happen when the data is bad. So here's another challenge for you. Look at the data on castaway's scratchpad, if this sort of packet is incomplete, then it could still have complete information for some of the muds which I can use, so how about returning all the 'full' lists when possible? Though, thats now data-bound, and not just about complete structures any more.. If its not clear, the first seven entries are just a header, the first 'mud' information starts at (["Figment", and goes on until the end of the hash.
I never doubted that Parse::RecDescent was cool :) And I've got a 'Code Complete' around here somewhere, he's right too..
Oh, did you benchmark working versions of IOs and merlyns solutions?
/me goes to add some error-checking..
C.
| [reply] [d/l] |
|
| [reply] [d/l] |
|
| [reply] |
Re: Parsing Text into Arrays..
by castaway (Parson) on Jan 24, 2003 at 11:30 UTC
|
Hi all,
(If anyones still listening, as it were..)
Just out of interest, at the moment I am using demerphqs solution in my program. Thats not to say that any others are bad/worse/whatever, but I prefer one that I understand, and can change if necessary, most of those with 'cryptical' regexes I don't as much.
I tried correcting some errors in mine, to test for faulty input, and it wasn't that easy :) And since I wanted to see some results.. Maybe I'll fix it one of these days.
The difference in performance is noticable however, especially when I was importing a 500k file, using my functions imported it in about the time it took me to switch apps and back, using P::RD gives me enough time to make a cup of tea (or two) and probably a sandwich to go with it :) - Luckily it was a one-time import, parsing smaller inputs on the fly is fast enough not to notice the difference.
One small niggle: Some of the strings received contain escaped apostrophes "Blah \'s Mud" which the parser returns as: "Blah \\\'s Mud", seems to be adding \ instead of removing them :)
I had to add a cheat to mine which reconcatenates strings that had commas in them prior to me splitting them apart, so now Im looking for a better approach.. Maybe instead of splitting, attempting to chop off each type from the input string.. ?
Anyfish, I'll get back to you.. (And try out barts solution..)
C. | [reply] [d/l] [select] |
Re: Parsing Text into Arrays..
by bart (Canon) on Jan 23, 2003 at 00:57 UTC
|
I promised a Parse::RecDescent-free parser, and here it is. It does simple items, arrays, mappings, nested structures, even mappings with arrays as keys. (People curious about the syntax should Google for "LPC".) I haven't tested it thoroughly, but it seems to work pretty well. String-postprocessing is non-existent, so you get the original source back, even including the quotes.
I had a version using s/^PATTERN// as well, but for some reason, I thought that using /\GPATTERN/gc looked cuter.
BTW, this is a recursive descent parser.
use Data::Dumper;
print Dumper parse_lpc_expr(
q#({ 1, 2, "three", 0, ({ "internal", "array", 0, }),
([ ({"key1", "key2"}): "value", "key": "anothervalue" ]),
({ "arrays", ({ "even", ({ "nest"})})}), "end", })#);
# wrapper:
sub parse_lpc_expr {
local $_ = shift;
s/(?=[\015\012])\015?\012?/\n/g;
parse_lpc_item();
}
# Actual parser starts here:
sub parse_lpc_item {
my $result;
/\G\s*/gc;
pos() < length or return;
if(/\G\(\{/gc) {
$result = parse_lpc_array();
/\G\s*\}\)/gc or error("})");
} elsif (/\G\(\[/gc) {
$result = parse_lpc_mapping();
/\G\s*\]\)/gc or error("])");
} elsif (/\G(-?(?:\d+\.?\d*|\.\d+))/gc) {
$result = $1;
} elsif (/\G("(?:[^"\n\\]|\\.|"\s*\")*")/sgc) {
$result = $1; # lazy
} else {
error();
}
return $result;
}
sub parse_lpc_array {
my @result;
while(length) {
last if /\G(?=\}\))/gc;
my($item) = parse_lpc_item() or last;
push @result, $item;
/\G\s*,\s*/gc or last;
}
return \@result;
}
sub parse_lpc_mapping {
my %result;
while(length) {
last if /\G(?=\]\))/gc;
my($key) = parse_lpc_item() or last;
/\G\s*:\s*/gc or error(":");
my($value) = parse_lpc_item() or error();
if(ref $key eq 'ARRAY') {
foreach(@$key) {
die "A mapping should only have simple keys, or arrays
+ of simple keys\n" if ref;
$result{$_} = $value ;
}
} elsif(ref $key) {
die "A mapping can't have a mapping as a key\n";
} else {
$result{$key} = $value;
}
/\G\s*,\s*/gc or last;
}
return \%result;
}
sub error {
my($expect) = @_;
s/\G\s*(\S{0,5})/^$1/;
if(defined $expect) {
die "Expected \"$expect\", found \"$1\" at \"$_\"\n";
} else {
die "Can't parse \"$1\" at \"$_\"\n";
}
}
Result:
$VAR1 = [
'1',
'2',
'"three"',
'0',
[
'"internal"',
'"array"',
'0'
],
{
'"key1"' => '"value"',
'"key"' => '"anothervalue"',
'"key2"' => '"value"'
},
[
'"arrays"',
[
'"even"',
[
'"nest"'
]
]
],
'"end"'
];
| [reply] [d/l] [select] |
|
|