in reply to Parsing generic XML
I prefer "low-level" XML modules like XML::Parser and XML::LibXML, because for some reason I find that they are actually easier for me to learn, compared to the "refined sugar" approaches like XML::Simple and XML::Twig; I don't mind writing a few extra lines of code, given that I'm able to understand more quickly what the code is really doing.
As for going beyond simple summarization and updating content, I think LibXML would be the tool I'd prefer.
#!/usr/bin/perl

# xml-structure-hist: tabulate how often each distinct element path (and,
# optionally, each attribute name on that path) occurs in an XML file.
# See the POD at the end of this file for options and example output.

use strict;
use warnings;
use XML::Parser;

my $Usage = "$0 [-r] [-a] [-b] file.xml\n";

# Option flags; each one holds the option string itself (e.g. '-r') when set.
my ( $add_root, $count_attribs, $discrete_count );
while ( @ARGV > 1 and $ARGV[0] =~ /^-([abr])$/ ) {
    if    ( $1 eq 'r' ) { $add_root       = shift; }
    elsif ( $1 eq 'a' ) { $count_attribs  = shift; }
    else                { $discrete_count = shift; }
}
die $Usage unless ( @ARGV == 1 and -f $ARGV[0] );

# Name of the synthetic wrapper element used by -r; the process id is
# appended so that it is unlikely to collide with a real tag in the data.
my $root_tag = 'STRUCT_HIST_ROOT_' . $$;

my %embedding;    # paths of currently open elements already seen to have a child
my $key = '';     # slash-separated path of the element currently open
my %ehist;        # element path => occurrence count
my %ahist;        # element path => { attribute name => occurrence count }

my $p = XML::Parser->new(
    Handlers => {

        # Start handler: called with (expat object, element name, then
        # alternating attribute names and values).
        Start => sub {
            my ( undef, $element, @attrs ) = @_;
            my $newkey = "$key/$element";

            # With -b, the first child seen inside the currently open
            # parent cancels the count that parent received when it was
            # opened, so that only "bottom-level" instances stay counted.
            if (    $key
                and $discrete_count
                and !exists( $embedding{$key} ) )
            {
                $embedding{$key}++;
                $ehist{$key}--;
            }
            $key = $newkey;
            $ehist{$key}++;
            if ($count_attribs) {

                # Attribute names sit at the even indexes of @attrs.
                for ( my $i = 0 ; $i < $#attrs ; $i += 2 ) {
                    $ahist{$key}{ $attrs[$i] }++;
                }
            }
        },

        # End handler: called with (expat object, element name).
        End => sub {
            my ( undef, $element ) = @_;

            # Forget the "has a child" mark so that the next element with
            # this same path is evaluated on its own.
            delete $embedding{$key} if ($discrete_count);

            # Pop the closing element off the current path.  \Q..\E keeps
            # characters such as "." in the name from acting as regex
            # metacharacters.
            $key =~ s{/\Q$element\E\z}{};
        },
    }
);

if ( !$add_root ) {
    $p->parsefile( $ARGV[0] );
}
else {
    # Slurp the file and wrap its contents in the synthetic root element.
    my $xmlstr = "<$root_tag>\n";
    open( my $xml_fh, '<:utf8', $ARGV[0] )
        or die "Unable to read $ARGV[0]: $!\n";
    {
        local $/ = undef;
        $xmlstr .= <$xml_fh>;
    }
    close $xml_fh;
    $xmlstr .= "</$root_tag>";
    $p->parse($xmlstr);
}

for my $k ( sort keys %ehist ) {
    my $path = $k;
    if ($add_root) {

        # Hide the synthetic root: strip it from every path, and skip
        # the entry for the synthetic root element itself.
        $path =~ s{\A/\Q$root_tag\E}{};
        next unless $path =~ /\S/;
    }

    # With -b, paths whose every instance had children end up at zero.
    next if ( $discrete_count and $ehist{$k} <= 0 );
    print "$ehist{$k}\t$path\n";
    if ($count_attribs) {
        print "\t$ahist{$k}{$_}\t\@$_\n" for ( sort keys %{ $ahist{$k} } );
    }
}

=head1 NAME

xml-structure-hist

=head1 SYNOPSIS

xml-structure-hist [-r] [-a] [-b] file.xml

  -r : have the program supply a root node tag
  -a : tabulate element attributes (only on raw element counts)
  -b : count only "bottom-level" paths (def: also count intermed. paths)

=head1 DESCRIPTION

For any given xml file, this tool will use a standard xml parser to
tabulate the structure of the tags and print (on STDOUT) a tally of
how many times each distinct structural element occurs in the file.

Use the "-r" option if the input file does not include its own "root"
xml tag (e.g. when multiple blocks of similar xml data are
concatenated without a wrapper tag being put around them).

For example, given an xml file with these contents:

  <root_node>
   <level1 id="x">
    <level2_a><level3 x="y">...</level3><level3>...</level3></level2_a>
    <level2_a><level3 x="z">...</level3><level3>...</level3></level2_a>
   </level1>
   <level1 id="y">
    <level2_a><level3 x="w"><level4>...</level4>...</level3></level2_a>
    <level2_b><level3 x="x">...</level3></level2_b>
   </level1>
   <level1 id="z">
    <level2_a>...</level2_a>
   </level1>
  </root_node>

the default output would be:

  1   /root_node
  3   /root_node/level1
  4   /root_node/level1/level2_a
  5   /root_node/level1/level2_a/level3
  1   /root_node/level1/level2_a/level3/level4
  1   /root_node/level1/level2_b
  1   /root_node/level1/level2_b/level3

With the "-a" option, the output would be:

  1   /root_node
  3   /root_node/level1
      3   @id
  4   /root_node/level1/level2_a
  5   /root_node/level1/level2_a/level3
      3   @x
  1   /root_node/level1/level2_a/level3/level4
  1   /root_node/level1/level2_b
  1   /root_node/level1/level2_b/level3
      1   @x

With the "-b" option, the output would be:

  1   /root_node/level1/level2_a
  4   /root_node/level1/level2_a/level3
  1   /root_node/level1/level2_a/level3/level4
  1   /root_node/level1/level2_b/level3

If the example lacked the "root_node" tags, you would use the "-r"
option, and the quantities reported for the "level*" tags would be the
same as above.

=head1 AUTHOR

David Graff <graff at ldc.upenn.edu>

=cut
|
|---|