in reply to A Question on a homebrew XML parser
Well, I just wanted to let everyone know that I have made some major improvements to my code. I have added many features including CDATA sections (please look at the HACK section. It it massively sub optimal), Safe perl execution of PI code, well formedness testing (although, I am not quite happy with the way I did it), DTD recognition and storage (I don't really parse it up to try to validate yet), an OO interface.
I am still having problems with end of file testing. I am at a loss to figure it out right now so I did other stuff pending. Also, I thought about the ">" ending tag recognition. If I remember correctly, attribute values have to be bounded by quote so that should not be a problem. If a ">" can be part of an attribute name then that causes problems. I probably should look into it.
If you have any ideas, it would be cool to comment on it. Anyway, here is the code. Have fun and thanks for all the comments from everyone before.
package CyParser; require 5.6.1; use warnings; use strict; use utf8; use FileHandle; use IO::String; use Safe; our $VERSION = "1.00"; sub new { my $class = shift; my $input = shift; my $self = {}; bless $self, $class; if(defined $input) { $self->setInput($input); } return $self; } sub setInput { my $self = shift; my $input = shift; if($input->isa("IO::Handle")) { $self->{input} = $input; }else { if($input =~ m/\.xml/) { $self->{input} = new FileHandle "$input", "r"; }else { $self->{input} = new IO::String $input; } } if(not exists $self->{input} and not defined $self->{input}) { die "Invalid input type\n"; } } sub getDocument { my $self = shift; my $doc; if(exists $self->{doc}) { $doc = $self->{doc}; }else{ die "A document has not been parsed yet.\n"; } if(not defined $doc) { die "There is no document\n"; } return $doc; } sub reset { my $self = shift; delete $self->{doc}; delete $self->{input}; } sub getVerson { return $VERSION; } sub parse { #args my $self = shift; my $fh = $self->{input}; #pre-declared vars my %doc; my $preSymbol; my $currSymbol; my $currElement; my $depth = 0; CHAR: while(defined($currSymbol = getc($fh))) { if($currSymbol eq "<") { my $nextSymbol = getc($fh); my %element; my $elementName = ""; my $attributes = ""; #if this is a letter then this is a start tag #then read until you reach a space. #attribs should be following or the end of the tag if($nextSymbol =~ m/\p{IsAlpha}/) { $preSymbol = $currSymbol; $currSymbol = $nextSymbol; $depth++; until($currSymbol eq " " or $currSymbol eq ">") { $elementName .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } if($currSymbol eq " ") { until($currSymbol eq ">") { if($currSymbol eq "/") { $depth--; next CHAR; } $attributes .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } } if($currSymbol eq ">") { $element{name} = $elementName; $element{children} = []; $element{depth} = $depth; my %attribHash; #unrolling attributes into a hash #the attribute name is the hash key and the #attribute value is the value in the hash my @attribs = split /\p{IsSpace}/, $attributes; foreach my $attrib (@attribs) { my ($key, $val) = split /\p{IsSpace}*=\p{IsSpace}*/, $att +rib; if(not defined $key) { next; } $val =~ s/\"//g; $attribHash{$key} = $val; $element{attributes} = \%attribHash; } if($depth == 1) { $doc{root} = \%element; push @{$self->{starttags}}, $element{name} . "\t" . $eleme +nt{depth}; $currElement = $doc{root}; }else{ #this is checking to see if this element is #an empty element. The : are the boundries #for the regular expressions. if($element{name} =~ m:/:) { $element{name} =~ s:/::; push @{$currElement->{children}}, \%element; push @{$self->{endtags}}, $element{name} . "\t$depth"; push @{$self->{starttags}}, $element{name} . "\t$depth +"; $depth--; next CHAR; }else { push @{$currElement->{children}}, \%element; push @{$self->{starttags}}, $element{name} . "\t" . $e +lement{depth}; $currElement = \%element; next CHAR; } } }else { die "Symbol: $currSymbol is not recognized\n"; } } #if this is a / then this is an end tag #then read until the end > elsif($nextSymbol eq "/") { my $name = ""; $preSymbol = $currSymbol; $currSymbol = $nextSymbol; until($currSymbol eq ">") { $name .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } $name =~ s:/::g; #save for well formedness testing push @{$self->{endtags}}, $name . "\t$depth"; #set the depth correctly $depth--; #went forward one too may symbols seek($fh, -1, 1); } #if this is a ? then this is a processing instuction #read until the space for the application name #then compare that to make sure that it is not the xml decl #that happens at the beginning of a document #then read until the next ? elsif($nextSymbol eq "?") { my $appName = ""; my $appInfo = ""; $preSymbol = $currSymbol; $currSymbol = getc($fh); until($currSymbol eq " ") { $appName .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } if($appName eq "xml" or $appName eq "XML") { until($currSymbol eq "?") { $appInfo .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } #break up pseudo-attribs for easier access #and analysis my %xmlAttribs; my @attribs = split /\p{IsSpace}/, $appInfo; foreach my $attrib (@attribs) { my ($key, $val) = split /\p{IsSpace}*=\p{IsSpace}*/, $att +rib; if(not defined $key) { next; } $val =~ s/\"//g; $xmlAttribs{$key} = $val; } $doc{xmldecl} = \%xmlAttribs; next CHAR; }else { until($currSymbol eq "?") { $appInfo .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } $currElement->{PI}->{$appName} = $appInfo; if(exists $currElement->{PI}->{CyParser}) { my $cpt = new Safe; $cpt->share_from("XML::CyParser", ['%doc', '$currSymbol', +'$preSymbol', '$nextSymbol']); $cpt->reval($currElement->{PI}->{CyParser}); if($@) { warn $@; } } } }elsif($nextSymbol eq "!") { $preSymbol = $currSymbol; $currSymbol = getc($fh); if($currSymbol eq "[") { my $cdata = ""; my $ending = ""; $preSymbol = $currSymbol; $currSymbol = getc($fh); until($currSymbol eq "[") { $cdata .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } if($cdata ne "CDATA") { die "This is not a CDATA section!\n"; } $preSymbol = $currSymbol; $currSymbol = getc($fh); until($currSymbol eq "]") { $currElement->{content} .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } #this steps through the ending for the #CDATA section $ending .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); $ending .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); $ending .= $currSymbol; my ($first, $second, $thrid) = split //, $ending; #Hack: this should be a regex like #m/\[\[>/ but perl would not accept it if($first ne "]" and $second ne "]" and $thrid ne ">") { die "CDATA section was not ended correctly\n"; } }elsif($currSymbol eq "-") { $preSymbol = $currSymbol; $currSymbol = getc($fh); die "This is not a comment\n" unless $currSymbol eq "-"; $preSymbol = $currSymbol; $currSymbol = getc($fh); until($currSymbol eq "-") { $preSymbol = $currSymbol; $currSymbol = getc($fh); } }elsif($currSymbol =~ m/(\p{IsAlpha})/ and $depth == 0) { my $dtddecl; $dtddecl .= $preSymbol; $dtddecl .= $nextSymbol; $dtddecl .= $1; $preSymbol = $currSymbol; $currSymbol = getc($fh); until($currSymbol eq "]") { $dtddecl .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } $doc{dtd} .= $dtddecl . $currSymbol . ">"; } } }elsif($currSymbol =~ m/\p{IsAlnum}|\p{IsSpace}/) { my $content = ""; my $entity = ""; #parsing and replacing general entities until($currSymbol eq "<") { if($currSymbol eq "&") { $entity .= $currSymbol; until($currSymbol eq ";") { $preSymbol = $currSymbol; $currSymbol = getc($fh); $entity .= $currSymbol; } #these need to deal with double entities like #& and < if($entity eq "&") { $content .= "&"; }elsif($entity eq "&") { until($currSymbol eq ";") { $preSymbol = $currSymbol; $currSymbol = getc($fh); $entity .= $currSymbol; } if($entity eq "&&") { $content .= "&"; }elsif($entity eq "&<") { $content .= "<"; }else{ die "Entity & not fully entered: $entity\n"; } }elsif($entity eq ">" or $entity eq ">") { $content .= ">"; }elsif($entity eq "<") { $content .= "<"; }elsif($entity eq "'" or $entity eq "'") { $content .= "\'"; }elsif($entity eq """ or $entity eq """) { $content .= "\""; } #go to the next char to get checked $preSymbol = $currSymbol; $currSymbol = getc($fh); }else { $content .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } } $currElement->{content} .= $content; #went forward one too many symbols seek($fh, -1, 1); next CHAR; } } $self->{doc} = \%doc; $self->checkWF(); } sub checkWF { my $self = shift; my ($package, $filename) = caller; #check to make sure that this is called ONLY from this parser unless($package eq __PACKAGE__ and $filename eq __FILE__) { die "CheckWF is an internal function and can only be called by the + XML::CyParser\n"; } eval { my @{$self->{endtags}} = sort @{$self->{endtags}}; my @{$self->{starttags}} = sort @{$self->{starttags}}; }; if($@) { die "This document is not well formed at the root element.\n"; } my %starttags = map {$_ => 1} @{$self->{starttags}}; my @diffs = grep {not $starttags{$_}} @{$self->{endtags}}; if(scalar @diffs != 0) { my $message = "Document not well formed at:\n"; foreach my $diff (@diffs) { $message .= "$diff\n"; } delete $self->{endtags}; delete $self->{starttags}; die $message; }else { delete $self->{endtags}; delete $self->{starttags}; } }
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: Re: A Question on a homebrew XML parser
by mirod (Canon) on Feb 25, 2002 at 12:21 UTC | |
by cyocum (Curate) on Feb 25, 2002 at 22:49 UTC |