cyocum has asked for the wisdom of the Perl Monks concerning the following question:
I humbly ask the Perl Monks to look at my code.
First, a little background. I am unemployed at the moment so to keep my skills up I have started to write a home brew XML parser. Now there are many features still missing but I have enough to begin testing against a real XML file.
The problem that I am having is that when I parse this file. The while loop exits on <fifths>0</fifths>. I have been racking my feeble mind to untangle what is happening. I hope that the great Perl Monks may be able to help me.
Here is the code that runs this. Please be lienient since I am right now just trying to the process down before I OO it.
use warnings; use strict; use utf8; use FileHandle; use Data::Dumper; my $document = &parseXML("mut.xml"); my $log = new FileHandle "dump.txt", "w+"; print $log Dumper($document); sub parseXML { #args my $file = shift; #pre-declared vars my %doc; my $preSymbol; my $currSymbol; my $currElement; my $depth = 0; my $fh = new FileHandle $file, "r"; my $log = new FileHandle "log.txt", "w+"; CHAR: while($currSymbol = getc($fh)) { print $log "entering while. current: $currSymbol\n"; if($currSymbol eq "<") { my $nextSymbol = getc($fh); print $log "current: $currSymbol next: $nextSymbol\n"; my %element; my $elementName = ""; my $attributes = ""; #if this is a letter then this is a start tag #then read until you reach a space. #attribs should be following or the end of the tag if($nextSymbol =~ m/\p{IsAlpha}/) { $preSymbol = $currSymbol; $currSymbol = $nextSymbol; $depth++; print $log "next symbol is a word char. current: $currSymbol\n +"; until($currSymbol eq " " or $currSymbol eq ">") { print $log "Getting element name\n"; $elementName .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } print $log "Element name: $elementName\n"; if($currSymbol eq " ") { until($currSymbol eq ">") { print $log "getting attributes\n"; $attributes .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } } print $log "Attribs are: $attributes\n"; if($currSymbol eq ">") { print $log "done getting element info. Depth: $depth\n"; $element{name} = $elementName; $element{attributes} = $attributes; $element{children} = []; $element{depth} = $depth; if($depth == 1) { $doc{root} = \%element; $currElement = $doc{root}; }else{ #this is checking to see if this element is #an empty element. The : are the boundries #for the regular expressions. if($element{name} =~ m:/:) { print $log "this is an empty element.\n"; $element{name} =~ s:/::; $depth--; my $children = $currElement->{children}; push @{$children}, \%element; print $log "adding to parent node and going to next ch +ar.\n"; next CHAR; }else { my $children = $currElement->{children}; push @{$children}, \%element; $currElement = \%element; print $log "going to next char in the while loop.\n"; next CHAR; } } }else { print $log "Symbol: $currSymbol is not recognized\n"; } } #need to determine if this is an end element or #an empty element #if this is a / then this is an end tag #then read until the end > elsif($nextSymbol eq "/") { print $log "next symbol is a / ending element.\n"; $currElement->{status} = "closed"; $depth--; $preSymbol = $currSymbol; $currSymbol = $nextSymbol; until($currSymbol eq ">") { $preSymbol = $currSymbol; $currSymbol = getc($fh); } #went forward one too may symbols seek($fh, -1, 1); print $log "element finished\n"; } #if this is a ? then this is a processing instuction #read until the space for the application name #then compare that to make sure that it is not the xml decl #that happens at the beginning of a document #then read until the next ? elsif($nextSymbol eq "?") { print $log "Found processing instuction.\n"; my $appName = ""; my $appInfo = ""; $preSymbol = $currSymbol; $currSymbol = getc($fh); print $log "Reading app name for PI.\n"; until($currSymbol eq " ") { $appName .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } print $log "appname is $appName.\n"; if($appName eq "xml") { print $log "This is the standard decl for xml.\n"; until($currSymbol eq "?") { $appInfo .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } $doc{xmldecl} = $appInfo; next CHAR; }else { print $log "This is a true PI.\n"; until($currSymbol eq "?") { $appInfo .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } $currElement->{PI}->{$appName} = $appInfo; } } }elsif($currSymbol =~ m/\p{IsAlnum}|\p{IsSpace}/) { my $content = ""; print $log "adding content to current element.\n"; until($currSymbol eq "<") { $content .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } $currElement->{content} .= $content; #went forward one too many symbols seek($fh, -1, 1); next CHAR; } } print $log "Current symbol: $currSymbol. Presymbol: $preSymbol\nfi +nished with document.\n"; return \%doc; }
Edit: chipmunk 2002-01-22
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: A Question on a homebrew XML parser
by mirod (Canon) on Jan 23, 2002 at 12:32 UTC | |
|
Re: A Question on a homebrew XML parser
by wog (Curate) on Jan 23, 2002 at 06:22 UTC | |
|
Re: A Question on a homebrew XML parser
by Matts (Deacon) on Jan 23, 2002 at 12:26 UTC | |
|
Re: A Question on a homebrew XML parser
by Anonymous Monk on Jan 23, 2002 at 06:29 UTC | |
by Matts (Deacon) on Jan 24, 2002 at 02:09 UTC | |
|
Re: A Question on a homebrew XML parser
by cyocum (Curate) on Feb 25, 2002 at 11:11 UTC | |
by mirod (Canon) on Feb 25, 2002 at 12:21 UTC | |
by cyocum (Curate) on Feb 25, 2002 at 22:49 UTC |