matth has asked for the wisdom of the Perl Monks concerning the following question:

Dear Monks,

My eyes are telling me that I have to confess and declare my debugging failings.

I have simplified the XML attributes slightly (for the posting) and I therefore present you with a code of:
#!/usr/bin/perl -w ## see perl monks documentation for details on who provided this code ## All ErrorContexts added 9_12_02 use strict; use XML::Twig; #my $data = "perl_monks_data_test.txt"; #open (DATA, "<$data"); #my $out_merged = "out_merged_test.txt"; #open (OUT, "+>>$out_merged"); #my $data = "../5_12_02/perl_monks_data.txt"; my $data = "adjusted_new_output_xml_k_testC.txt"; open (DATA, "<$data"); my $out_merged = "out_adjusted_new_output_xml_k.txt"; open (OUT, "+>>$out_merged"); $/="\n\n"; # tag => attribute we are interested in # you could avoid having this global by putting it in the twig my %att= ( species => 'id', genome_feature => 'type', gene => 'id', gene_seq => 'id', ErrorContext => 1 ); my $doc = <DATA>; # the original data set my $expected_sorted_doc = <DATA>; # sorted result my $expected_merged_doc = <DATA>; # merged result my $sorted_doc= sort_doc( $doc); if( compact( $sorted_doc) eq compact( $expected_sorted_doc)) { print "sorted doc generation OK\n"; } else { print "sorted doc generation NOK: \n", "expected:\n$expected_sorted_doc\n", "found:\n$sorted_doc\n"; } my $merged_doc= merge_doc( $doc); if( compact( $merged_doc) eq compact( $expected_merged_doc)) { print "merged doc generation OK\n"; print OUT "$expected_merged_doc"; # print OUT "$merged_doc\n"; } else { print "merged doc generation NOK: \n", "expected:\n$expected_merged_doc\n", "found:\n$merged_doc\n"; } # sort: for each relevant node (elt or subelt in this case) generate a + location # key and move the content if the location already exists sub sort_doc { my( $doc)= @_; my $location={}; # location key => existing subseq element with t +his location key my $t= XML::Twig->new( twig_handlers => { species => sub { sor +t_node( $location, @_); }, genome_feature => sub { sort_node( $location, @_); }, gene => sub { sort_node( $location, @_); }, gene_seq => sub { sort_node( $location, @_); } # ErrorContext => 1 }, pretty_print => 'indented', # makes debuggi +ng easier ); $t->parse( $doc); return $t->sprint; } sub sort_node { my( $location, $t, $node)= @_; # compute the location key, which must describe uniquely the node +category my $location_key= location( $node); # now see if we need to move the content if( my $new_parent= $location->{$location_key}) { # there is already an element with this location key # move all content's there foreach my $content ($node->children) { $content->move( last_child => $new_parent);} # no need to keep the empty shell $node->delete unless( $node->has_child); } else { # first time we see the location key, store the element in $lo +cation $location->{$location_key}= $node; } } # the location describes a node category, nodes with the same location + should have the same parent sub location { my( $node)= @_; # a compact way to just join the values of the proper attributes o +f the ancestors of the node my $location= join( '-', grep {$_} map { $_->att( $att{$_->tag}) | +| '' } (@{[$node->ancestors]}, $node)); #warn "location: $location\n"; return $location; } # merge doc sub merge_doc { my( $doc)= @_; my $t= XML::Twig->new( twig_handlers => { species => \&merge_node +, genome_feature => \&merge_node, gene => \&merge_node, gene_seq => \&merge_node # ErrorContext => 1 }, pretty_print => 'indented', ); $t->parse( $doc); return $t->sprint; } sub merge_node { my( $t, $node)= @_; my $potential_merger= $node->prev_elt( $node->tag) or return; # re +turn if this is the first node of this type if( location( $node) eq location( $potential_merger)) { # bingo! we can merge the contents foreach my $content ($node->children) { $content->move( last_child => $potential_merger); } } else { # this branch is not used for this test as we are working in m +emory # but this is where you could free the memory by dumping the p +art of # the tree that will no longer need to be updated # $t->flush_up_to( $potential_merger); } $node->delete unless( $node->has_child); } sub compact { my( $doc)= @_; $doc=~ s{^\s+}{}; # trim at the begining $doc=~ s{\s+$}{}; # trim the end $doc=~ s{>\s*<}{><}g; # trim spaces between tags return $doc; } DATA ____________________________ <species id="1"> <genome_feature type="CDS"> <gene id="1"> <gene_seq id="1"></gene_seq> </gene> </genome_feature> </species> <species id="1"> <genome_feature type="CDS"> <gene id="2"> <gene_seq id="2"></gene_seq> </gene> </genome_feature> </species> <species id="1"> <genome_feature type="CDS"> <gene id="3"> <gene_seq id="3"> </gene_seq> </gene> </genome_feature> </species> ERROR message ___________________________ bash-2.05$ perl perl_monks_F_test_adjusted.pl sorted doc generation NOK: expected: <species id="1"> <genome_feature type="CDS"> <gene id="2"> <gene_seq id="2"></gene_seq> </gene> </genome_feature> </species> found: <species id="1"> <genome_feature type="CDS"> <gene id="1"> <gene_seq id="1"></gene_seq> </gene> </genome_feature> </species> merged doc generation NOK: expected: <species id="1"> <genome_feature type="CDS"> <gene id="3"> <gene_seq id="3"> </gene_seq> </gene> </genome_feature> </species> found: <species id="1"> <genome_feature type="CDS"> <gene id="1"> <gene_seq id="1"></gene_seq> </gene> </genome_feature> </species> bash-2.05$
Can anyone please tell me why the program does not combine the repeating (overlapping) nodes? The program works well in the example shown for the node: Re: Removing duplicate subtrees from XML..