matth has asked for the wisdom of the Perl Monks concerning the following question:
My eyes are telling me that I have to confess and declare my debugging failings.
Can anyone please tell me why the program does not combine the repeating (overlapping) nodes? The program works well in the example shown for the node: Re: Removing duplicate subtrees from XML..#!/usr/bin/perl -w ## see perl monks documentation for details on who provided this code ## All ErrorContexts added 9_12_02 use strict; use XML::Twig; #my $data = "perl_monks_data_test.txt"; #open (DATA, "<$data"); #my $out_merged = "out_merged_test.txt"; #open (OUT, "+>>$out_merged"); #my $data = "../5_12_02/perl_monks_data.txt"; my $data = "adjusted_new_output_xml_k_testC.txt"; open (DATA, "<$data"); my $out_merged = "out_adjusted_new_output_xml_k.txt"; open (OUT, "+>>$out_merged"); $/="\n\n"; # tag => attribute we are interested in # you could avoid having this global by putting it in the twig my %att= ( species => 'id', genome_feature => 'type', gene => 'id', gene_seq => 'id', ErrorContext => 1 ); my $doc = <DATA>; # the original data set my $expected_sorted_doc = <DATA>; # sorted result my $expected_merged_doc = <DATA>; # merged result my $sorted_doc= sort_doc( $doc); if( compact( $sorted_doc) eq compact( $expected_sorted_doc)) { print "sorted doc generation OK\n"; } else { print "sorted doc generation NOK: \n", "expected:\n$expected_sorted_doc\n", "found:\n$sorted_doc\n"; } my $merged_doc= merge_doc( $doc); if( compact( $merged_doc) eq compact( $expected_merged_doc)) { print "merged doc generation OK\n"; print OUT "$expected_merged_doc"; # print OUT "$merged_doc\n"; } else { print "merged doc generation NOK: \n", "expected:\n$expected_merged_doc\n", "found:\n$merged_doc\n"; } # sort: for each relevant node (elt or subelt in this case) generate a + location # key and move the content if the location already exists sub sort_doc { my( $doc)= @_; my $location={}; # location key => existing subseq element with t +his location key my $t= XML::Twig->new( twig_handlers => { species => sub { sor +t_node( $location, @_); }, genome_feature => sub { sort_node( $location, @_); }, gene => sub { sort_node( $location, @_); }, gene_seq => sub { sort_node( $location, @_); } # ErrorContext => 1 }, pretty_print => 'indented', # makes debuggi +ng easier ); $t->parse( $doc); return $t->sprint; } sub sort_node { my( $location, $t, $node)= @_; # compute the location key, which must describe uniquely the node +category my $location_key= location( $node); # now see if we need to move the content if( my $new_parent= $location->{$location_key}) { # there is already an element with this location key # move all content's there foreach my $content ($node->children) { $content->move( last_child => $new_parent);} # no need to keep the empty shell $node->delete unless( $node->has_child); } else { # first time we see the location key, store the element in $lo +cation $location->{$location_key}= $node; } } # the location describes a node category, nodes with the same location + should have the same parent sub location { my( $node)= @_; # a compact way to just join the values of the proper attributes o +f the ancestors of the node my $location= join( '-', grep {$_} map { $_->att( $att{$_->tag}) | +| '' } (@{[$node->ancestors]}, $node)); #warn "location: $location\n"; return $location; } # merge doc sub merge_doc { my( $doc)= @_; my $t= XML::Twig->new( twig_handlers => { species => \&merge_node +, genome_feature => \&merge_node, gene => \&merge_node, gene_seq => \&merge_node # ErrorContext => 1 }, pretty_print => 'indented', ); $t->parse( $doc); return $t->sprint; } sub merge_node { my( $t, $node)= @_; my $potential_merger= $node->prev_elt( $node->tag) or return; # re +turn if this is the first node of this type if( location( $node) eq location( $potential_merger)) { # bingo! we can merge the contents foreach my $content ($node->children) { $content->move( last_child => $potential_merger); } } else { # this branch is not used for this test as we are working in m +emory # but this is where you could free the memory by dumping the p +art of # the tree that will no longer need to be updated # $t->flush_up_to( $potential_merger); } $node->delete unless( $node->has_child); } sub compact { my( $doc)= @_; $doc=~ s{^\s+}{}; # trim at the begining $doc=~ s{\s+$}{}; # trim the end $doc=~ s{>\s*<}{><}g; # trim spaces between tags return $doc; } DATA ____________________________ <species id="1"> <genome_feature type="CDS"> <gene id="1"> <gene_seq id="1"></gene_seq> </gene> </genome_feature> </species> <species id="1"> <genome_feature type="CDS"> <gene id="2"> <gene_seq id="2"></gene_seq> </gene> </genome_feature> </species> <species id="1"> <genome_feature type="CDS"> <gene id="3"> <gene_seq id="3"> </gene_seq> </gene> </genome_feature> </species> ERROR message ___________________________ bash-2.05$ perl perl_monks_F_test_adjusted.pl sorted doc generation NOK: expected: <species id="1"> <genome_feature type="CDS"> <gene id="2"> <gene_seq id="2"></gene_seq> </gene> </genome_feature> </species> found: <species id="1"> <genome_feature type="CDS"> <gene id="1"> <gene_seq id="1"></gene_seq> </gene> </genome_feature> </species> merged doc generation NOK: expected: <species id="1"> <genome_feature type="CDS"> <gene id="3"> <gene_seq id="3"> </gene_seq> </gene> </genome_feature> </species> found: <species id="1"> <genome_feature type="CDS"> <gene id="1"> <gene_seq id="1"></gene_seq> </gene> </genome_feature> </species> bash-2.05$
|
|---|