package CyParser; require 5.6.1; use warnings; use strict; use utf8; use FileHandle; use IO::String; use Safe; our $VERSION = "1.00"; sub new { my $class = shift; my $input = shift; my $self = {}; bless $self, $class; if(defined $input) { $self->setInput($input); } return $self; } sub setInput { my $self = shift; my $input = shift; if($input->isa("IO::Handle")) { $self->{input} = $input; }else { if($input =~ m/\.xml/) { $self->{input} = new FileHandle "$input", "r"; }else { $self->{input} = new IO::String $input; } } if(not exists $self->{input} and not defined $self->{input}) { die "Invalid input type\n"; } } sub getDocument { my $self = shift; my $doc; if(exists $self->{doc}) { $doc = $self->{doc}; }else{ die "A document has not been parsed yet.\n"; } if(not defined $doc) { die "There is no document\n"; } return $doc; } sub reset { my $self = shift; delete $self->{doc}; delete $self->{input}; } sub getVerson { return $VERSION; } sub parse { #args my $self = shift; my $fh = $self->{input}; #pre-declared vars my %doc; my $preSymbol; my $currSymbol; my $currElement; my $depth = 0; CHAR: while(defined($currSymbol = getc($fh))) { if($currSymbol eq "<") { my $nextSymbol = getc($fh); my %element; my $elementName = ""; my $attributes = ""; #if this is a letter then this is a start tag #then read until you reach a space. #attribs should be following or the end of the tag if($nextSymbol =~ m/\p{IsAlpha}/) { $preSymbol = $currSymbol; $currSymbol = $nextSymbol; $depth++; until($currSymbol eq " " or $currSymbol eq ">") { $elementName .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } if($currSymbol eq " ") { until($currSymbol eq ">") { if($currSymbol eq "/") { $depth--; next CHAR; } $attributes .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } } if($currSymbol eq ">") { $element{name} = $elementName; $element{children} = []; $element{depth} = $depth; my %attribHash; #unrolling attributes into a hash #the attribute name is the hash key and the #attribute value is the value in the hash my @attribs = split /\p{IsSpace}/, $attributes; foreach my $attrib (@attribs) { my ($key, $val) = split /\p{IsSpace}*=\p{IsSpace}*/, $attrib; if(not defined $key) { next; } $val =~ s/\"//g; $attribHash{$key} = $val; $element{attributes} = \%attribHash; } if($depth == 1) { $doc{root} = \%element; push @{$self->{starttags}}, $element{name} . "\t" . $element{depth}; $currElement = $doc{root}; }else{ #this is checking to see if this element is #an empty element. The : are the boundries #for the regular expressions. if($element{name} =~ m:/:) { $element{name} =~ s:/::; push @{$currElement->{children}}, \%element; push @{$self->{endtags}}, $element{name} . "\t$depth"; push @{$self->{starttags}}, $element{name} . "\t$depth"; $depth--; next CHAR; }else { push @{$currElement->{children}}, \%element; push @{$self->{starttags}}, $element{name} . "\t" . $element{depth}; $currElement = \%element; next CHAR; } } }else { die "Symbol: $currSymbol is not recognized\n"; } } #if this is a / then this is an end tag #then read until the end > elsif($nextSymbol eq "/") { my $name = ""; $preSymbol = $currSymbol; $currSymbol = $nextSymbol; until($currSymbol eq ">") { $name .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } $name =~ s:/::g; #save for well formedness testing push @{$self->{endtags}}, $name . "\t$depth"; #set the depth correctly $depth--; #went forward one too may symbols seek($fh, -1, 1); } #if this is a ? then this is a processing instuction #read until the space for the application name #then compare that to make sure that it is not the xml decl #that happens at the beginning of a document #then read until the next ? elsif($nextSymbol eq "?") { my $appName = ""; my $appInfo = ""; $preSymbol = $currSymbol; $currSymbol = getc($fh); until($currSymbol eq " ") { $appName .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } if($appName eq "xml" or $appName eq "XML") { until($currSymbol eq "?") { $appInfo .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } #break up pseudo-attribs for easier access #and analysis my %xmlAttribs; my @attribs = split /\p{IsSpace}/, $appInfo; foreach my $attrib (@attribs) { my ($key, $val) = split /\p{IsSpace}*=\p{IsSpace}*/, $attrib; if(not defined $key) { next; } $val =~ s/\"//g; $xmlAttribs{$key} = $val; } $doc{xmldecl} = \%xmlAttribs; next CHAR; }else { until($currSymbol eq "?") { $appInfo .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } $currElement->{PI}->{$appName} = $appInfo; if(exists $currElement->{PI}->{CyParser}) { my $cpt = new Safe; $cpt->share_from("XML::CyParser", ['%doc', '$currSymbol', '$preSymbol', '$nextSymbol']); $cpt->reval($currElement->{PI}->{CyParser}); if($@) { warn $@; } } } }elsif($nextSymbol eq "!") { $preSymbol = $currSymbol; $currSymbol = getc($fh); if($currSymbol eq "[") { my $cdata = ""; my $ending = ""; $preSymbol = $currSymbol; $currSymbol = getc($fh); until($currSymbol eq "[") { $cdata .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } if($cdata ne "CDATA") { die "This is not a CDATA section!\n"; } $preSymbol = $currSymbol; $currSymbol = getc($fh); until($currSymbol eq "]") { $currElement->{content} .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } #this steps through the ending for the #CDATA section $ending .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); $ending .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); $ending .= $currSymbol; my ($first, $second, $thrid) = split //, $ending; #Hack: this should be a regex like #m/\[\[>/ but perl would not accept it if($first ne "]" and $second ne "]" and $thrid ne ">") { die "CDATA section was not ended correctly\n"; } }elsif($currSymbol eq "-") { $preSymbol = $currSymbol; $currSymbol = getc($fh); die "This is not a comment\n" unless $currSymbol eq "-"; $preSymbol = $currSymbol; $currSymbol = getc($fh); until($currSymbol eq "-") { $preSymbol = $currSymbol; $currSymbol = getc($fh); } }elsif($currSymbol =~ m/(\p{IsAlpha})/ and $depth == 0) { my $dtddecl; $dtddecl .= $preSymbol; $dtddecl .= $nextSymbol; $dtddecl .= $1; $preSymbol = $currSymbol; $currSymbol = getc($fh); until($currSymbol eq "]") { $dtddecl .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } $doc{dtd} .= $dtddecl . $currSymbol . ">"; } } }elsif($currSymbol =~ m/\p{IsAlnum}|\p{IsSpace}/) { my $content = ""; my $entity = ""; #parsing and replacing general entities until($currSymbol eq "<") { if($currSymbol eq "&") { $entity .= $currSymbol; until($currSymbol eq ";") { $preSymbol = $currSymbol; $currSymbol = getc($fh); $entity .= $currSymbol; } #these need to deal with double entities like #& and < if($entity eq "&") { $content .= "&"; }elsif($entity eq "&") { until($currSymbol eq ";") { $preSymbol = $currSymbol; $currSymbol = getc($fh); $entity .= $currSymbol; } if($entity eq "&&") { $content .= "&"; }elsif($entity eq "&<") { $content .= "<"; }else{ die "Entity & not fully entered: $entity\n"; } }elsif($entity eq ">" or $entity eq ">") { $content .= ">"; }elsif($entity eq "<") { $content .= "<"; }elsif($entity eq "'" or $entity eq "'") { $content .= "\'"; }elsif($entity eq """ or $entity eq """) { $content .= "\""; } #go to the next char to get checked $preSymbol = $currSymbol; $currSymbol = getc($fh); }else { $content .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } } $currElement->{content} .= $content; #went forward one too many symbols seek($fh, -1, 1); next CHAR; } } $self->{doc} = \%doc; $self->checkWF(); } sub checkWF { my $self = shift; my ($package, $filename) = caller; #check to make sure that this is called ONLY from this parser unless($package eq __PACKAGE__ and $filename eq __FILE__) { die "CheckWF is an internal function and can only be called by the XML::CyParser\n"; } eval { my @{$self->{endtags}} = sort @{$self->{endtags}}; my @{$self->{starttags}} = sort @{$self->{starttags}}; }; if($@) { die "This document is not well formed at the root element.\n"; } my %starttags = map {$_ => 1} @{$self->{starttags}}; my @diffs = grep {not $starttags{$_}} @{$self->{endtags}}; if(scalar @diffs != 0) { my $message = "Document not well formed at:\n"; foreach my $diff (@diffs) { $message .= "$diff\n"; } delete $self->{endtags}; delete $self->{starttags}; die $message; }else { delete $self->{endtags}; delete $self->{starttags}; } }