| Category: | utilities for XML |
| Author/Contact Info | dsilvia@mchsi.com |
| Description: | If you're like me, generated code is generally difficult to parse visually. I've recently been working on a project that uses XSD code generators and the output code is _very_ confusing to a novice like me. And generally, when I go to generate an XML from the XSD, it asks me questions like "whats the root element", and that's not always readily apparent. So, I dusted off my Perl (haven't used it for about 10yrs) and put together a script that reorganizes the output into something a bit more humanly decipherable. I called the script ReorgXSD.pl, but obviously you can call it whatever you like.;) output from ReorgXSD.pl when no arguments: Usage: ReorgXSD.pl file(s) Reorganizes xsd files to be more human readable. Puts any header information first, then tries to determine a root entity and places it next. Should more than one entity be a possible root entity, all are listed, and if none are found, a comment to that effect is entered. Next, any remaining complex elements, then simple elements, then complex entities, and simple entities. Last whatever is left (if any!;) The only requirement is a well-formed file. Qualification for a root entity is that it not be referenced by any other entity. Results of the process are placed in a like named file with the prefix of 'ReorgXSD'. Shortcomings: Does not analyze DTD entries to determine proper settings for type/attribute/ref, specifically, prefixes. It merely assumes that any prefix ending in a colon (:) is valid an has been provided for. Cannot distinguish between relevance of entities if they are named alike. Comments are not preserved as they cannot be meaningfully interpolated. There is no way to tell if a comment goes with a specific entity or group of entities or the whole file!:-( The concept of non-reference indicating a root element can be naive in the case of very complicated xsd files with imports and other outside references. |
#!perl
use strict;
#use warnings;
use File::Spec::Functions ':ALL';
sub escapeSpecial($)
{
my $str=shift(@_);
return(quotemeta($str));
}
sub getEntityType($)
{
my $wholeStr=shift(@_);
if(!$wholeStr || $wholeStr=~m{^\s*$})
{
return('','');
}
my $comparator='(\s*?<(\b\w+?\b:?\b\w+?\b).*?(?:\bname\b.*?=.*?(?:
+\"|\')\b\w*?\b(?:\"|\'))?.*?>)';
if(!($wholeStr=~m{$comparator}s))
{
return('','');
}
return($1,$2);
}
sub getEntry($)
{
my $wholeStr=shift(@_);
my $entityType=getEntityType($wholeStr);
my $entry;
my $comparator;
my $tail;
if(!$entityType)
{
return($wholeStr,'','','','');
}
($entry,$entityType)=getEntityType($wholeStr);
$tail=substr($entry,length($entry)-2,2);
if($tail eq '/>')
{
return getSimpleEntry($wholeStr,$entityType,$entry);
}
else
{
return getComplexEntry($wholeStr,$entityType,$entry);
}
}
sub getComplexEntry($$$)
{
my $wholeStr=shift(@_);
my $entityType=shift(@_);
my $anyTag='.*?</?(?:[a-z_][a-z0-9.:_-]*?).*?>';
my $closeTag='</(?:xs:|.).*?>';
my $comparator;
my $entityName;
my $entityDefinition;
my $resultStr;
my $beginMatch;
my $matchLen;
my $chunk;
my $theHead;
my $theTail;
my $matches=0;
$comparator=$anyTag;
if($entityType =~ m{(?:xs.*?|.*?)?:?import})
{
$wholeStr=~m{($comparator)}sg;
$entityDefinition=$1;
$matchLen=pos($wholeStr);
$beginMatch=0;
if($wholeStr=~m{(?=$entityDefinition).}sg)
{
$beginMatch=pos($wholeStr)-1;
}
$resultStr=substr($wholeStr,0,$beginMatch).substr($wholeStr,$m
+atchLen);
$comparator='namespace.*?=.*?(?:\"|\')(.*?)(?:\"|\')';
$wholeStr=~m{$comparator}s;
$entityName=$1;
return($resultStr,$entityDefinition,$entityName,$entityType,'c
+omplex');
}
$chunk=$wholeStr;
$chunk=~m{$anyTag}s;
$beginMatch=$-[0];
$matchLen=$beginMatch;
$comparator='\bname\b.*?=.*?(?:\"|\')(\b\w*?\b)(?:\"|\')';
$chunk=~m{$comparator}s;
$entityName=$1;
while($chunk=~m{($anyTag)}sg)
{
my $endOfMatch=$+[0];
my $tmp=$1;
$tmp=~m{<}s;
my $tmpBegin=$-[0];
$theHead=substr($tmp,$tmpBegin,2);
$theTail=substr($tmp,length($tmp)-2);
if($theHead ne '</')
{
if($theHead=~m{<.} and $theTail ne '/>'){$matches++};
}
else{$matches--};
if(!$matches)
{
$closeTag='</'.$entityType.'.*?>';
$chunk=~m{$closeTag}s;
$matchLen+=$+[0];
last;
}
$matchLen+=$endOfMatch;
$chunk=substr($chunk,$endOfMatch);
}
$entityDefinition=substr($wholeStr,$beginMatch,$matchLen);
$resultStr=substr($wholeStr,0,$beginMatch).substr($wholeStr,$begin
+Match+$matchLen);
return($resultStr,$entityDefinition,$entityName,$entityType,'compl
+ex');
}
sub getSimpleEntry($$$)
{
my $wholeStr=shift(@_);
my $entityType=shift(@_);
my $entityDefinition;
my $comparator;
my $entityName;
my $resultStr;
my $beginMatch;
my $endMatch;
$comparator='\s*?<'.$entityType.'.*?(?:\bname\b.*?=.*?(?:\"|\')(\b
+\w*?\b)(?:\"|\'))?.*?/>';
if($wholeStr=~m{($comparator)}sg)
{
$entityDefinition=$1;
$beginMatch=$-[0];
$endMatch=$+[0];
$comparator='\bname\b.*?=.*?(?:\"|\')(\b\w*?\b)(?:\"|\')';
if($wholeStr=~m{$comparator}s)
{
$entityName=$1;
}
$resultStr=substr($wholeStr,0,$beginMatch).substr($wholeStr,$e
+ndMatch);
return($resultStr,$entityDefinition,$entityName,$entityType,'s
+imple');
}
return($wholeStr,'','','','');
}
my @myArgs=@ARGV;
if(!$myArgs[0])
{
my @myPath=splitpath(__FILE__);
my $myName=$myPath[2];
print "\nUsage: ".$myName." file(s)\n\n";
print "Reorganizes xsd files to be more human readable. Puts any
+header information\n";
print "first, then tries to determine a root entity and places it
+next. Should more\n";
print "than one entity be a possible root entity, all are listed,
+and if none are\n";
print "found, a comment to that effect is entered. Next, any rema
+ining complex\n";
print "elements, then simple elements, then complex entities, and
+simple entities.\n";
print "Last whatever is left (if any!;)\n\n";
print "The only requirement is a well-formed file. Qualification
+for a root entity\n";
print "is that it not be referenced by any other entity.\n\n";
print "Results of the process are placed in a like named file with
+ the prefix of\n";
print "'ReorgXSD.\n\n";
print "Shortcomings:\n";
print "\tDoes not analyze DTD entries to determine proper settings
+ for\n";
print "\ttype/attribute/ref, specifically, prefixes. It merely as
+sumes that\n";
print "\tany prefix ending in a colon (:) is valid an has been pro
+vided for.\n\n";
print "\tCannot distinguish between relevance of entities if they
+are named\n";
print "\talike.\n\n";
print "\tComments are not preserved as they cannot be meaningfully
+\n";
print "\tinterpolated. There is no way to tell if a comment goes
+with a\n";
print "\tspecific entity or group of entities or the whole file!:-
+(\n\n";
print "\tThe concept of non-reference indicating a root element ca
+n be\n";
print "\tnaive in the case of very complicated xsd files with impo
+rts and\n";
print "\tother outside references.\n\n";
exit 0;
}
while(@myArgs)
{
my $inFileName=$myArgs[0];
my @pathMembers=splitpath($inFileName);
my $outFileName=$pathMembers[0].$pathMembers[1]."ReorgXSD.".$pathMembe
+rs[2];
my ($initFile,$fileHdr,$fileFtr);
{
local $/=undef;
open(INITFILE,"< $inFileName") or die "Could not open initializati
+on file '".$inFileName."': $!\n";
$initFile=<INITFILE>;
close(INITFILE);
}
my $comparator;
# normalize the parsing input:
# remove comments (cannot meaningfully interpolate these)
# remove the <?xml... and <xs:schema... header
# remove the </xs:schema... footer
# reduce newlines to single occurrences.
$comparator='(^(?:<\?xml.*?>)*.*?<(?:xs.*?|.*:)schema\b.+?>)';
$initFile=~m{$comparator}s;
$fileHdr=$1;
$initFile=$';
$comparator='(^</(?:xs.?|.*):?\bschema\b.*?>)';
$initFile=~m{$comparator}ms;
$fileFtr=$1;
$initFile=$`;
$comparator='\s*?<!--.*?-->';
$initFile=~s/$comparator//sg;
$comparator='(\r\n|[\r\n]){2,}';
$initFile=~s/$comparator/$1/g;
my @AllEntities;
my @AllElements;
my %UnReferenced;
my %ComplexElements;
my %SimpleElements;
my @xsImports;
my %ComplexEntities;
my %SimpleEntities;
my $fileIterator=$initFile;
my $elmDef;
my $elmName;
my $elmType;
my $tagType;
($fileIterator,$elmDef,$elmName,$elmType,$tagType)=getEntry($fileItera
+tor);
while($elmDef)
{
if($tagType eq 'complex')
{
if($elmType =~ m{(?:xs.*?|.*?):?element})
{
%ComplexElements=(%ComplexElements,$elmName=>$elmDef);
@AllEntities=(@AllEntities,$elmName);
@AllElements=(@AllElements,$elmName);
}
elsif($elmType =~ m{(?:xs.*?|.*?):?import})
{
@xsImports=(@xsImports,$elmDef);
}
elsif($elmName)
{
%ComplexEntities=(%ComplexEntities,$elmName=>$elmDef);
@AllEntities=(@AllEntities,$elmName);
}
}
else
{
if($elmType =~ m{(?:xs.*?|.*?):?element})
{
%SimpleElements=(%SimpleElements,$elmName=>$elmDef);
@AllEntities=(@AllEntities,$elmName);
@AllElements=(@AllElements,$elmName);
}
elsif($elmType =~ m{(?:xs.*?|.*?):?import})
{
@xsImports=(@xsImports,$elmDef);
}
elsif($elmName)
{
%SimpleEntities=(%SimpleEntities,$elmName=>$elmDef);
@AllEntities=(@AllEntities,$elmName);
}
}
($fileIterator,$elmDef,$elmName,$elmType,$tagType)=getEntry($fileI
+terator);
}
my $rootName='';
my @rootDefn;
foreach my $entity (sort @AllEntities)
{
my $refd=0;
foreach my $key (sort @AllElements)
{
next if $key eq $entity;
if(
($ComplexEntities{$key}=~m{((?:\b\w+\b)(?<!name)\s*?=\s*?(
+?:\"|\')(?:\w+?:)?\b$entity\b(?:\"|\'))}) ||
($ComplexElements{$key}=~m{((?:\b\w+\b)(?<!name)\s*?=\s*?(
+?:\"|\')(?:\w+?:)?\b$entity\b(?:\"|\'))}) ||
($SimpleEntities{$key}=~m{((?:\b\w+\b)(?<!name)\s*?=\s*?(?
+:\"|\')(?:\w+?:)?\b$entity\b(?:\"|\'))}) ||
($SimpleElements{$key}=~m{((?:\b\w+\b)(?<!name)\s*?=\s*?(?
+:\"|\')(?:\w+?:)?\b$entity\b(?:\"|\'))})
)
{
$refd=1;
last;
}
}
if(!$refd)
{
%UnReferenced=(%UnReferenced,$entity=>1);
}
}
foreach my $key (sort @AllElements)
{
if($UnReferenced{$key})
{
if($ComplexElements{$key})
{
@rootDefn=(@rootDefn,$ComplexElements{$key});
delete $ComplexElements{$key};
}
elsif($SimpleElements{$key})
{
@rootDefn=(@rootDefn,$SimpleElements{$key});
delete $SimpleElements{$key};
}
elsif($ComplexEntities{$key})
{
@rootDefn=(@rootDefn,$ComplexEntities{$key});
delete $ComplexEntities{$key};
}
elsif($SimpleEntities{$key})
{
@rootDefn=(@rootDefn,$SimpleEntities{$key});
delete $SimpleEntities{$key};
}
}
}
open(OUTFILE,"> $outFileName") or die "Could not open output file $!\n
+";
print OUTFILE $fileHdr."\n";
if(@xsImports)
{
print OUTFILE "\n<!-- imports (ReorgXSD) -->\n";
foreach my $entity (@xsImports)
{
print OUTFILE $entity."\n";
}
}
if(@rootDefn && @rootDefn == 1)
{
print OUTFILE "\n<!-- Root Element (ReorgXSD) -->\n";
print OUTFILE $rootDefn[0]."\n";
}
elsif(@rootDefn)
{
my $numRoots=@rootDefn;
print OUTFILE "\n<!-- No Root Element Found ($numRoots possible) (
+ReorgXSD) -->\n";
foreach my $defn (@rootDefn)
{
print OUTFILE $defn."\n";
}
}
else
{
print OUTFILE "\n<!-- No Root Element Found (ReorgXSD) -->\n";
}
if(%ComplexElements)
{
print OUTFILE "\n<!-- Complex elements (ReorgXSD) -->\n";
foreach my $key (sort keys %ComplexElements)
{
print OUTFILE $ComplexElements{$key}."\n";
}
}
if(%SimpleElements)
{
print OUTFILE "\n<!-- Simple elements (ReorgXSD) -->\n";
foreach my $key (sort keys %SimpleElements)
{
print OUTFILE $SimpleElements{$key}."\n";
}
}
if(%ComplexEntities)
{
print OUTFILE "\n<!-- other named complex entities (ReorgXSD) -->\
+n";
foreach my $key (sort keys %ComplexEntities)
{
print OUTFILE $ComplexEntities{$key}."\n";
}
}
if(%SimpleEntities)
{
print OUTFILE "\n<!-- other named simple entities (ReorgXSD) -->\n
+";
foreach my $key (sort keys %SimpleEntities)
{
print OUTFILE $SimpleEntities{$key}."\n";
}
}
if($fileIterator and $fileIterator !~ m{^\s*$})
{
print OUTFILE "\n<!-- misc entities (ReorgXSD) -->\n";
print OUTFILE $fileIterator;
}
print OUTFILE "\n";
print OUTFILE $fileFtr;
print OUTFILE "\n";
close(OUTFILE);
print "Results are in file '".$outFileName."'\n";
shift(@myArgs);
}
|
|
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: ReorgXSD.pl
by simon.proctor (Vicar) on Jul 11, 2006 at 09:31 UTC | |
by dsilvia (Initiate) on Jul 11, 2006 at 09:48 UTC | |
by dsilvia (Initiate) on Jul 11, 2006 at 10:22 UTC | |
by herveus (Prior) on Jul 11, 2006 at 11:47 UTC | |
by dsilvia (Initiate) on Jul 11, 2006 at 22:08 UTC |