wardy3 has asked for the wisdom of the Perl Monks concerning the following question:

Hi monks. I had a question here a while back about parsing some XML and I think I'll go with XML::LibXML because it's nice and fast.

However, I'm having a bit of trouble understanding the interface and can't find a nice tutorial to go through.

I've got some XML code that looks like this:

<?xml version="1.0" standalone="yes" ?> <SymCLI_ML> <Symmetrix> <Symm_Info> <symid>000290101935</symid> </Symm_Info> <Device> <Dev_Info> <pd_name>Not Visible</pd_name> <dev_name>0040</dev_name> <configuration>RAID-5</configuration> <attached_bcv>N/A</attached_bcv> <emulation>CKD-3390</emulation> <status>Ready</status> <sa_status>N/A</sa_status> <service_state>Normal</service_state> <ssid>0xD800</ssid> <cuimage>0x00</cuimage> </Dev_Info> <Attached> <BCV>N/A</BCV> <VDEV>N/A</VDEV> </Attached> <Product> <vendor> </vendor> <name> </name> <revision> </revision> <serial_id>N/A</serial_id> <symid>000290101935</symid> </Product> <Label> <type>N/A</type> <defined_label>N/A</defined_label> </Label> <Flags> <ckd>True</ckd> <worm_enabled>False</worm_enabled> <worm_protected>False</worm_protected> <dynamic_spare_invoked>False</dynamic_spare_invoked> <dynamic_rdf_capability>None</dynamic_rdf_capability> <star_mode>False</star_mode> <star_recovery_capability>None</star_recovery_capability> <star_recovery_state>N/A</star_recovery_state> <radiant_managed>False</radiant_managed> <restricted_access_dev>False</restricted_access_dev> <rdb_checksum_enabled>False</rdb_checksum_enabled> <non_exclusive_access>False</non_exclusive_access> <scsi3_persist_res>Disabled</scsi3_persist_res> <vcm>False</vcm> <symmetrix_filesystem>False</symmetrix_filesystem> <snap_save_device>False</snap_save_device> <gatekeeper>False</gatekeeper> <meta>None</meta> </Flags> <Capacity> <block_size>56664</block_size> <cylinders>1113</cylinders> <tracks>16695</tracks> <blocks>16695</blocks> <megabytes>902</megabytes> <kilobytes>923833</kilobytes> </Capacity> <Front_End> <Port> <pd_name>Not Visible</pd_name> <director>03A</director> <director_type>FICON</director_type> <powerpath_type>N/A</powerpath_type> <port>0</port> <port_status>N/A</port_status> <tid>0</tid> <lun>0</lun> <host_lun>N/A</host_lun> <base_address>0</base_address> <alias_count>0</alias_count> </Port> <Port> <pd_name>Not Visible</pd_name> 
<director>04A</director> <director_type>FICON</director_type> <powerpath_type>N/A</powerpath_type> <port>0</port> <port_status>N/A</port_status> <tid>0</tid> <lun>0</lun> <host_lun>N/A</host_lun> <base_address>0</base_address> <alias_count>0</alias_count> </Port> <Port> <pd_name>Not Visible</pd_name> <director>13A</director> <director_type>FICON</director_type> <powerpath_type>N/A</powerpath_type> <port>0</port> <port_status>N/A</port_status> <tid>0</tid> <lun>0</lun> <host_lun>N/A</host_lun> <base_address>0</base_address> <alias_count>0</alias_count> </Port> <Port> <pd_name>Not Visible</pd_name> <director>14A</director> <director_type>FICON</director_type> <powerpath_type>N/A</powerpath_type> <port>0</port> <port_status>N/A</port_status> <tid>0</tid> <lun>0</lun> <host_lun>N/A</host_lun> <base_address>0</base_address> <alias_count>0</alias_count> </Port> </Front_End> <Mirror_Set> <Mirror> <number>1</number> <type>RAID-5</type> <status>Ready</status> <invalid_tracks>0</invalid_tracks> </Mirror> <Mirror> <number>2</number> <type>RAID-5</type> <status>Ready</status> <invalid_tracks>0</invalid_tracks> </Mirror> <Mirror> <number>3</number> <type>N/A</type> <status>N/A</status> <invalid_tracks>0</invalid_tracks> </Mirror> <Mirror> <number>4</number> <type>N/A</type> <status>N/A</status> <invalid_tracks>0</invalid_tracks> </Mirror> </Mirror_Set> <Back_End> <Hyper> <type>RAID-5</type> <status>Ready</status> <number>N/A</number> <Disk> <director>N/A</director> <interface>N/A</interface> <tid>N/A</tid> <volume_number>N/A</volume_number> </Disk> </Hyper> <Hyper> <type>RAID-5</type> <status>Ready</status> <number>N/A</number> <Disk> <director>N/A</director> <interface>N/A</interface> <tid>N/A</tid> <volume_number>N/A</volume_number> </Disk> </Hyper> </Back_End> <RAID-5_Device> <RAID5_Dev_Info> <tracks_per_stripe>4</tracks_per_stripe> <ready_state>ReadyNoOtherMirror</ready_state> <writeprotect_state>EnabledNoOtherMirror</writeprotect_state +> 
<member_num_of_failing_dev>None</member_num_of_failing_dev> <member_which_invoked_spare>None</member_which_invoked_spare +> <disk_director_num_which_owns_spare>-1</disk_director_num_wh +ich_owns_spare> <disk_director_ident_which_owns_spare>N/A</disk_director_ide +nt_which_owns_spare > <copy_direction>N/A</copy_direction> </RAID5_Dev_Info> <Hyper> <director>01A</director> <interface>D</interface> <tid>5</tid> <da_vol_num>444</da_vol_num> <hyper_num>56</hyper_num> <hyper_capacity_in_mb>307</hyper_capacity_in_mb> <member_num>4</member_num> <member_status>RW</member_status> <spare_status>N/A</spare_status> <disk_group_num>2</disk_group_num> <disk_capacity_in_mb>140014</disk_capacity_in_mb> </Hyper> <Hyper> <director>15A</director> <interface>D</interface> <tid>5</tid> <da_vol_num>468</da_vol_num> <hyper_num>56</hyper_num> <hyper_capacity_in_mb>307</hyper_capacity_in_mb> <member_num>1</member_num> <member_status>RW</member_status> <spare_status>N/A</spare_status> <disk_group_num>2</disk_group_num> <disk_capacity_in_mb>140014</disk_capacity_in_mb> </Hyper> <Hyper> <director>02C</director> <interface>C</interface> <tid>5</tid> <da_vol_num>66</da_vol_num> <hyper_num>56</hyper_num> <hyper_capacity_in_mb>307</hyper_capacity_in_mb> <member_num>3</member_num> <member_status>RW</member_status> <spare_status>N/A</spare_status> <disk_group_num>2</disk_group_num> <disk_capacity_in_mb>140014</disk_capacity_in_mb> </Hyper> <Hyper> <director>16C</director> <interface>C</interface> <tid>5</tid> <da_vol_num>66</da_vol_num> <hyper_num>56</hyper_num> <hyper_capacity_in_mb>307</hyper_capacity_in_mb> <member_num>2</member_num> <member_status>RW</member_status> <spare_status>N/A</spare_status> <disk_group_num>2</disk_group_num> <disk_capacity_in_mb>140014</disk_capacity_in_mb> </Hyper> </RAID-5_Device> </Device>

The Device tag repeats over 4000 times in this file. I want to extract certain fields out of the XML and build a simple file, one line per device.

I've tried different methods of processing this tree but I think I'm just missing the point somewhere. Here are some examples I've written but I'm not sure I'm on the right track,

# Method 1: XPath lookups with findnodes().
# Prints one tab-separated line (dev_name, configuration) per <Dev_Info>.
use strict;
use warnings;
use XML::LibXML;

my $parser = XML::LibXML->new;    # direct method call instead of "new XML::LibXML"
my $tree   = $parser->parse_file('935.xml');
my $root   = $tree->getDocumentElement;

my @devices = $root->findnodes('/SymCLI_ML/Symmetrix/Device/Dev_Info');

for my $device_id (@devices) {
    # Element names are case-sensitive: the XML has <dev_name>, not <Dev_Name>.
    my $dev_name = $device_id->findnodes('./dev_name');
    my $dev_conf = $device_id->findnodes('./configuration');

    # Called in scalar context, findnodes() yields a node list object;
    # to_literal() turns it into its text content.
    print $dev_name->to_literal, "\t", $dev_conf->to_literal, "\n";
}

but this is very slow.

So I tried a different approach, using the getChildrenByTagName method, which works a lot faster

# Method 2: walk the tree with getChildrenByTagName() instead of XPath.
# Fills %conf_of, keyed by dev_name, with "configuration,cylinders".
use strict;
use warnings;
use Carp;
use XML::LibXML;

my %conf_of;

my $parser = XML::LibXML->new;    # direct method call instead of "new XML::LibXML"
my $tree   = $parser->parse_file('935.xml');
my $root   = $tree->getDocumentElement;

my @symm_tree = $root->getChildrenByTagName('Symmetrix')
    or croak 'No <Symmetrix> element under the document root';

for my $symm_tree (@symm_tree) {
    my @devices_tree = $symm_tree->getChildrenByTagName('Device')
        or croak 'No <Device> element under <Symmetrix>';

    for my $device_tree (@devices_tree) {
        my @devinfo_tree = $device_tree->getChildrenByTagName('Dev_Info')
            or croak 'No <Dev_Info> element under <Device>';
        my @capacity_tree = $device_tree->getChildrenByTagName('Capacity')
            or croak 'No <Capacity> element under <Device>';

        my $dev_name;    # key
        my @fields;      # values collected for this device, in output order

        for my $devinfo_tree (@devinfo_tree) {
            # to_literal() stores plain text rather than a node list object.
            $dev_name = $devinfo_tree->getChildrenByTagName('dev_name')->to_literal;
            push @fields,
                $devinfo_tree->getChildrenByTagName('configuration')->to_literal;
        }

        for my $capacity_tree (@capacity_tree) {
            push @fields,
                $capacity_tree->getChildrenByTagName('cylinders')->to_literal;
        }

        # Separate the fields so adjacent values cannot run together.
        $conf_of{$dev_name} = join ',', @fields;
    }
}

Am I getting close to how the module should be used? I'm not sure why method 1 is so slow, as it seems XPath-style statements could be very useful.

I actually need to extract about 30 fields from the XML so I'm concerned that method 2 is not necessarily scalable to that sort of usage, and would very much appreciate any feedback!

Thanks for reading this far

~ Michael

Replies are listed 'Best First'.
Re: Any help available for a newbie to XML::LibXML?
by Your Mother (Archbishop) on Feb 29, 2008 at 04:55 UTC

    I started with a couple other XML packages way back when and was never happy. I accidentally discovered LibXML and after fighting through the docs (they actually are pretty good, you just have to read them all front to back and then use them like a function index) I've never been happier. So stick it out, as it were. My Xpath sucks but here's something anyway.

    First, there's a good basic tutorial here: Xpath basics. I use that site over and over because their tutorials are straightforward and well organized even when they aren't the deepest/best. Then, this is what I tried just to play around.

    use strict;
    use warnings;
    use XML::LibXML;

    # Print the text of every <status> element found two levels below a <Device>.
    my $parser = XML::LibXML->new();
    my $tree   = $parser->parse_fh(\*DATA);
    my $root   = $tree->getDocumentElement;

    for my $node ( $root->findnodes("//Device/*/status/text()") ) {
        # Explicit newline; $/ is the *input* record separator and may be changed elsewhere.
        print $node->nodeValue, "\n";
    }

    # print $root->serialize(1);

    __DATA__
    <?xml version="1.0" standalone="yes" ?>
    <SymCLI_ML>
      <Symmetrix>
        <Symm_Info>
          <symid>000290101935</symid>
        </Symm_Info>
        <Device>
          <Dev_Info>
            <pd_name>Not Visible</pd_name>
            <dev_name>0040</dev_name>
    # ...

    To iterate through all the Device childNodes, this sort of thing should be pretty fast. I think there is a native Xpath for getting child nodes too.

    # Print the name of every child element of each <Device>.
    my @device_children = $root->findnodes("//Device/*");
    print $_->nodeName, $/ for @device_children;

    You could certainly cook up a hash of the nodeNames/Xpath to "your key names" you want and run your Xpath queries with it so that your solution would (human) scale as it would be little more than a configuration based filter.

    (update: corrected an extra word and code comment)

      Thanks for the reply

      You've given me some things to think about :-) I'll have a play next week at work

      Hi again

      I've written a bit more code to test it out and it is very slow doing the findnodes when I'm already at a position in the tree.

      I've written this test script:

      # Test script: collect every <Dev_Info> with one XPath query, then
      # look up <dev_name> beneath each one with a relative XPath query.
      my $parser = XML::LibXML->new();
      my $doc    = $parser->parse_file($filename);

      # First findnodes: one query over the whole document.
      my @devinfo_nodes = $doc->findnodes('//Device/Dev_Info');

      for my $devinfo (@devinfo_nodes) {
          # Second findnodes: runs once per <Dev_Info>, relative to that node.
          my ($dev) = $devinfo->findnodes('./dev_name');
          print $dev->to_literal, "\n";
      }

      The first findnodes runs very quickly (< 1 sec) but the one in the loop is a lot slower. But I don't know why. Doesn't it just have to examine the nodes "under" the current devinfo tag?

      Am I doing something wrong with the findnodes call here?

      Thanks, ~ Michael

        The only thing that I see from looking at your code that might be responsible would be that

        # List assignment: only the first node returned by findnodes() ends up in $dev.
        my($dev) = $devinfo->findnodes('./dev_name');

        actually finds many nodes just to throw them all away except for the first. Maybe do it this way to check how many nodes you find:

        # Keep every match so the number of nodes found can be reported.
        my @found_devices = $devinfo->findnodes('./dev_name');

        # Concatenation puts the array in scalar context, giving its element count.
        warn "Found " . @found_devices;

        my $dev = $found_devices[0];

        But that's just a shot in the dark. Actually looking at your XML, it shouldn't find more than one device.

Re: Any help available for a newbie to XML::LibXML?
by Jenda (Abbot) on Mar 03, 2008 at 19:39 UTC

    If you don't insist on using XML::LibXML you could do something like this:

    use strict;
    use warnings;
    use XML::Rules;

    # Print "dev_name<TAB>configuration" for each <Dev_Info> as it is parsed.
    my $parser = XML::Rules->new(
        stripspaces => 7,
        rules       => [
            '_default' => 'content',
            'Dev_Info' => sub {
                # $_[1] is the hash holding this tag's collected child data.
                print "$_[1]->{dev_name}\t$_[1]->{configuration}\n";
                return;    # pass nothing up to the parent tag
            },
            'Device' => '',
        ],
        start_rules => [
            # Tags listed here are skipped together with everything inside them.
            'Hyper,RAID-5_Device,Back_End,Mirror_Set,Front_End,Product,Label,Flags,Capacity' => 'skip',
        ],
    );

    $parser->parse(\*DATA);

    __DATA__
    <?xml version="1.0" standalone="yes" ?>
    <SymCLI_ML>
    ...
    or (to match the second example)
    use strict;
    use warnings;
    use XML::Rules;
    use Data::Dumper;

    # Collect "configuration,cylinders" per device into %conf_of, keyed by dev_name.
    my %conf_of;

    my $parser = XML::Rules->new(
        stripspaces => 7,
        rules       => [
            '_default' => 'content',
            'Device'   => sub {
                # $_[1] holds the data returned by the <Dev_Info> and <Capacity> rules.
                my $device = $_[1];
                $conf_of{ $device->{Dev_Info}{dev_name} }
                    = "$device->{Dev_Info}{configuration},$device->{Capacity}{cylinders}";
                return;    # pass nothing up to the parent tag
            },
            'Dev_Info,Capacity' => 'no content',
        ],
        start_rules => [
            # Tags listed here are skipped together with everything inside them.
            'Hyper,RAID-5_Device,Back_End,Mirror_Set,Front_End,Product,Label,Flags' => 'skip',
        ],
    );

    $parser->parse(\*DATA);

    print Dumper(\%conf_of);

    __DATA__
    <?xml version="1.0" standalone="yes" ?>
    <SymCLI_ML>
    ...
    or to get rid of the global variable something like
    use strict;
    use warnings;
    use XML::Rules;
    use Data::Dumper;

    # Same extraction without a global: each rule returns its data to the
    # parent tag, and parse() hands back the finished structure.
    my $parser = XML::Rules->new(
        stripspaces => 7,
        rules       => [
            '_default' => 'content',
            'Device'   => sub {
                # Return a (dev_name => "configuration,cylinders") pair to the parent.
                my $device = $_[1];
                return $device->{Dev_Info}{dev_name} =>
                    "$device->{Dev_Info}{configuration},$device->{Capacity}{cylinders}";
            },
            'Dev_Info,Capacity' => 'no content',
            'Symm_Info'         => sub { return symid => $_[1]->{symid} },
            'Symmetrix'         => 'no content',
            'SymCLI_ML'         => 'pass',
        ],
        start_rules => [
            # Tags listed here are skipped together with everything inside them.
            'Hyper,RAID-5_Device,Back_End,Mirror_Set,Front_End,Product,Label,Flags' => 'skip',
        ],
    );

    my $conf = $parser->parse(\*DATA);

    print Dumper($conf);

    __DATA__
    <?xml version="1.0" standalone="yes" ?>
    <SymCLI_ML>
    ...

    Your requirements do look like something that XML::Rules was designed for.

      Thanks, Jenda. I had a play with your module. It's quite interesting and pretty fast.

      At the moment, I've been playing with XSLT, which I started learning a few years ago but stopped using it. It's performing very fast too, so I'm going to go down that road for now.

      But thank you very much for your suggestions - I've created some working test scripts based on your code - and I'll look into XML::Rules when I'm back in the XML world again :-)