use warnings; use strict; use utf8; use FileHandle; use Data::Dumper; my $document = &parseXML("mut.xml"); my $log = new FileHandle "dump.txt", "w+"; print $log Dumper($document); sub parseXML { #args my $file = shift; #pre-declared vars my %doc; my $preSymbol; my $currSymbol; my $currElement; my $depth = 0; my $fh = new FileHandle $file, "r"; my $log = new FileHandle "log.txt", "w+"; CHAR: while($currSymbol = getc($fh)) { print $log "entering while. current: $currSymbol\n"; if($currSymbol eq "<") { my $nextSymbol = getc($fh); print $log "current: $currSymbol next: $nextSymbol\n"; my %element; my $elementName = ""; my $attributes = ""; #if this is a letter then this is a start tag #then read until you reach a space. #attribs should be following or the end of the tag if($nextSymbol =~ m/\p{IsAlpha}/) { $preSymbol = $currSymbol; $currSymbol = $nextSymbol; $depth++; print $log "next symbol is a word char. current: $currSymbol\n"; until($currSymbol eq " " or $currSymbol eq ">") { print $log "Getting element name\n"; $elementName .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } print $log "Element name: $elementName\n"; if($currSymbol eq " ") { until($currSymbol eq ">") { print $log "getting attributes\n"; $attributes .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } } print $log "Attribs are: $attributes\n"; if($currSymbol eq ">") { print $log "done getting element info. Depth: $depth\n"; $element{name} = $elementName; $element{attributes} = $attributes; $element{children} = []; $element{depth} = $depth; if($depth == 1) { $doc{root} = \%element; $currElement = $doc{root}; }else{ #this is checking to see if this element is #an empty element. The : are the boundries #for the regular expressions. if($element{name} =~ m:/:) { print $log "this is an empty element.\n"; $element{name} =~ s:/::; $depth--; my $children = $currElement->{children}; push @{$children}, \%element; print $log "adding to parent node and going to next char.\n"; next CHAR; }else { my $children = $currElement->{children}; push @{$children}, \%element; $currElement = \%element; print $log "going to next char in the while loop.\n"; next CHAR; } } }else { print $log "Symbol: $currSymbol is not recognized\n"; } } #need to determine if this is an end element or #an empty element #if this is a / then this is an end tag #then read until the end > elsif($nextSymbol eq "/") { print $log "next symbol is a / ending element.\n"; $currElement->{status} = "closed"; $depth--; $preSymbol = $currSymbol; $currSymbol = $nextSymbol; until($currSymbol eq ">") { $preSymbol = $currSymbol; $currSymbol = getc($fh); } #went forward one too may symbols seek($fh, -1, 1); print $log "element finished\n"; } #if this is a ? then this is a processing instuction #read until the space for the application name #then compare that to make sure that it is not the xml decl #that happens at the beginning of a document #then read until the next ? elsif($nextSymbol eq "?") { print $log "Found processing instuction.\n"; my $appName = ""; my $appInfo = ""; $preSymbol = $currSymbol; $currSymbol = getc($fh); print $log "Reading app name for PI.\n"; until($currSymbol eq " ") { $appName .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } print $log "appname is $appName.\n"; if($appName eq "xml") { print $log "This is the standard decl for xml.\n"; until($currSymbol eq "?") { $appInfo .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } $doc{xmldecl} = $appInfo; next CHAR; }else { print $log "This is a true PI.\n"; until($currSymbol eq "?") { $appInfo .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } $currElement->{PI}->{$appName} = $appInfo; } } }elsif($currSymbol =~ m/\p{IsAlnum}|\p{IsSpace}/) { my $content = ""; print $log "adding content to current element.\n"; until($currSymbol eq "<") { $content .= $currSymbol; $preSymbol = $currSymbol; $currSymbol = getc($fh); } $currElement->{content} .= $content; #went forward one too many symbols seek($fh, -1, 1); next CHAR; } } print $log "Current symbol: $currSymbol. Presymbol: $preSymbol\nfinished with document.\n"; return \%doc; }