Return to the Twig question

matth has asked for the wisdom of the Perl Monks concerning the following question:
Dear Monks,
My eyes are telling me that I have to confess and declare my debugging failings.
I have simplified the XML attributes slightly (for the posting) and I therefore present you with a code of:

#!/usr/bin/perl -w

## see perl monks documentation for details on who provided this code
## All  ErrorContexts added 9_12_02

use strict;

use XML::Twig;


#my $data = "perl_monks_data_test.txt";
#open (DATA, "<$data");

#my $out_merged = "out_merged_test.txt";
#open (OUT, "+>>$out_merged");


#my $data = "../5_12_02/perl_monks_data.txt";

my $data = "adjusted_new_output_xml_k_testC.txt";
open (DATA, "<$data");

my $out_merged = "out_adjusted_new_output_xml_k.txt";
open (OUT, "+>>$out_merged");


$/="\n\n";

# tag => attribute we are interested in
# you could avoid having this global by putting it in the twig
my %att= ( species => 'id',
    genome_feature => 'type',
    gene => 'id',
    gene_seq => 'id',
    ErrorContext => 1
         );

my $doc                 = <DATA>;  # the original data set
my $expected_sorted_doc = <DATA>;  # sorted result  
my $expected_merged_doc = <DATA>;  # merged result

my $sorted_doc= sort_doc( $doc);
if( compact( $sorted_doc) eq compact( $expected_sorted_doc))
  { print "sorted doc generation OK\n"; }
else
  { print "sorted doc generation NOK: \n",
          "expected:\n$expected_sorted_doc\n",
          "found:\n$sorted_doc\n";
  }

my $merged_doc= merge_doc( $doc);
if( compact( $merged_doc) eq compact( $expected_merged_doc))
  { print "merged doc generation OK\n"; 
    print OUT "$expected_merged_doc";
   # print OUT "$merged_doc\n";
}
else
  { print "merged doc generation NOK: \n",
          "expected:\n$expected_merged_doc\n",
          "found:\n$merged_doc\n";
  }


# sort: for each relevant node (elt or subelt in this case) generate a
+ location
# key and move the content if the location already exists

sub sort_doc
  { my( $doc)= @_;
    my $location={};  # location key => existing subseq element with t
+his location key
    my $t= XML::Twig->new( twig_handlers => {    species =>  sub { sor
+t_node( $location, @_); },
        genome_feature => sub { sort_node( $location, @_); },
        gene => sub { sort_node( $location, @_); },    
        gene_seq => sub { sort_node( $location, @_); }
#        ErrorContext => 1
       },
                           pretty_print => 'indented', # makes debuggi
+ng easier
                         );
    $t->parse( $doc);
    return $t->sprint;
  } 

sub sort_node
  { my( $location, $t, $node)= @_;

    # compute the location key, which must describe uniquely the node 
+category
    my $location_key= location( $node);

    # now see if we need to move the content
    if( my $new_parent= $location->{$location_key})
      { # there is already an element with this location key
        # move all content's there
        foreach my $content ($node->children)
          { $content->move( last_child => $new_parent);}
        # no need to keep the empty shell
        $node->delete unless( $node->has_child);
      }
    else
      { # first time we see the location key, store the element in $lo
+cation
        $location->{$location_key}= $node;
      }
  }

# the location describes a node category, nodes with the same location
+ should have the same parent
sub location
  { my( $node)= @_;
    # a compact way to just join the values of the proper attributes o
+f the ancestors of the node
    my $location= join( '-', grep {$_} map { $_->att( $att{$_->tag}) |
+| '' } (@{[$node->ancestors]}, $node));
    #warn "location: $location\n";
    return $location;
  }


# merge doc 

sub merge_doc
  { my( $doc)= @_;
    my $t= XML::Twig->new( twig_handlers => {  species => \&merge_node
+,
      genome_feature =>  \&merge_node,
      gene =>     \&merge_node,
      gene_seq => \&merge_node
     # ErrorContext => 1
                                            },
                           pretty_print => 'indented',
                         );
    $t->parse( $doc);
    return $t->sprint;
  } 

sub merge_node
  { my( $t, $node)= @_;
    my $potential_merger= $node->prev_elt( $node->tag) or return; # re
+turn if this is the first node of this type 
    if( location( $node) eq location( $potential_merger))
      { # bingo! we can merge the contents
        foreach my $content ($node->children)
          { $content->move( last_child => $potential_merger); }
      }
    else
      { # this branch is not used for this test as we are working in m
+emory
        # but this is where you could free the memory by dumping the p
+art of
        # the tree that will no longer need to be updated
        # $t->flush_up_to( $potential_merger);
      }

    $node->delete unless( $node->has_child);
  }

sub compact
  { my( $doc)= @_;
    $doc=~ s{^\s+}{};     # trim at the begining
    $doc=~ s{\s+$}{};     # trim the end
    $doc=~ s{>\s*<}{><}g; # trim spaces between tags
    return $doc;
  }






DATA ____________________________


<species id="1">
 <genome_feature type="CDS">
  <gene id="1">
   <gene_seq id="1"></gene_seq>
  </gene>
 </genome_feature>
</species>

<species id="1">
 <genome_feature type="CDS">
  <gene id="2">
   <gene_seq id="2"></gene_seq>
  </gene>
 </genome_feature>
</species>

<species id="1">
 <genome_feature type="CDS">
  <gene id="3">
   <gene_seq id="3">
   </gene_seq>
  </gene>
 </genome_feature>
</species>




ERROR message ___________________________

bash-2.05$ perl perl_monks_F_test_adjusted.pl
sorted doc generation NOK:
expected:
<species id="1">
 <genome_feature type="CDS">
  <gene id="2">
   <gene_seq id="2"></gene_seq>
  </gene>
 </genome_feature>
</species>


found:
<species id="1">
  <genome_feature type="CDS">
    <gene id="1">
      <gene_seq id="1"></gene_seq>
    </gene>
  </genome_feature>
</species>

merged doc generation NOK:
expected:
<species id="1">
 <genome_feature type="CDS">
  <gene id="3">
   <gene_seq id="3">
   </gene_seq>
  </gene>
 </genome_feature>
</species>
found:
<species id="1">
  <genome_feature type="CDS">
    <gene id="1">
      <gene_seq id="1"></gene_seq>
    </gene>
  </genome_feature>
</species>

bash-2.05$
[download]
Can anyone please tell me why the program does not combine the repeating (overlapping) nodes? The program works well in the example shown for the node: Re: Removing duplicate subtrees from XML..
Comment on Return to the Twig question Download Code