Does this look right? (You may need to sort or uniq the gene names).
C:\test>774383.pl Gene1:Gene2 tags:: Group both under new tag: A Gene3:Gene1 tags::A Group first with second under A Gene4:Gene1 tags::A Group first with second under A Gene2:Gene3 tags:A:A Both grouped; add spcs Gene2:Gene4 tags:A:A Both grouped; add spcs Gene3:Gene4 tags:A:A Both grouped; add spcs GeneA:GeneB tags:: Group both under new tag: B GeneB:GeneC tags:B: Group second with first under B GeneC:GeneD tags:B: Group second with first under B GeneD:GeneE tags:B: Group second with first under B GeneE:GeneF tags:B: Group second with first under B GeneX:GeneY tags:: Group both under new tag: C GeneX:GeneP tags:C: Group second with first under C { A => { genes => ["Gene1", "Gene2", "Gene3", "Gene4"], spcs => [ "spc1", "spc2", "spc1", "spc2", "spc4", "spc1", "spc2", " +spc5", "spc3", "spc1", "spc1", "spc2", "spc2", "spc3", "spc1", " +spc2", ], }, B => { genes => ["GeneA", "GeneB", "GeneC", "GeneD", "GeneE", "GeneF +"], spcs => [ "spc4", "spc5", "spc1", "spc2", "spc1", "spc2", "spc4", "spc2", "spc3", "spc1", ], }, C => { genes => ["GeneX", "GeneY", "GeneP"], spcs => ["spc6", "spc8", "spc6", "spc7"], }, }
The code:
#! perl -s use 5.010; use strict; use Data::Dump qw[ pp ]; =comment My desired Output is Gene1,Gene2,Gene3,Gene4,spc1,spc2,spc1,spc2,spc4,spc1,spc2,spc5,spc1,s +pc2,spc2,spc3,spc1,spc2 GeneA,GeneB,GeneC,GeneD,GeneE,GeneF,spc4,spc5,spc1,spc2,spc1,spc2,spc4 +,spc2,spc3,spc1 GeneX,GeneY,GeneP,,spc6,spc8,spc6,spc7 =cut my( $nextTag, %gTags, %groups ) = 'A'; while( <DATA> ) { chomp; my( $g1, $g2, @spcs ) = split ','; say "$g1:$g2 tags:$gTags{ $g1 }:$gTags{$g2 }"; <>; if( exists $gTags{ $g1 } and exists $gTags{ $g2 } ){ if( $gTags{ $g1 } eq $gTags{ $g2 } ) { say "Both grouped; add spcs"; ## Just add new spcs to existing push @{ $groups{ $gTags{ $g1 } }{ spcs } }, @spcs; } else { ## merge groups say "merge $gTags{ $g1 }:$gTags{$g2 }"; ## Add the current info to the first gene group. push @{ $groups{ $gTags{ $g1 } }{ spcs } }, @spcs; push @{ $groups{ $gTags{ $g1 } }{ genes } }, $g2; ## Move the info from the seconds gene group to the first push @{ $groups{ $gTags{ $g1 } }{ spcs } }, @{ $groups{ $gTags{ $g2 } }{ spcs } }; push @{ $groups{ $gTags{ $g1 } }{ genes } }, @{ $groups{ $gTags{ $g2 } }{ genes } }; ## delete the second gene group $groups{ $gTags{ $g2 } } = undef; ## And re-tag it to be the first $gTags{ $g2 } = $gTags{ $g1 }; } } elsif( exists $gTags{ $g1 } and not exists $gTags{ $g2 } ) { say "Group second with first under $gTags{ $g1 }"; $gTags{ $g2 } = $gTags{ $g1 }; push @{ $groups{ $gTags{ $g1 } }{ spcs } }, @spcs; push @{ $groups{ $gTags{ $g1 } }{ genes } }, $g2; } elsif( exists $gTags{ $g2 } and not exists $gTags{ $g1 } ) { say "Group first with second under $gTags{ $g2 }"; $gTags{ $g1 } = $gTags{ $g2 }; push @{ $groups{ $gTags{ $g2 } }{ spcs } }, @spcs; push @{ $groups{ $gTags{ $g2 } }{ genes } }, $g1; } else { ## neither yet exist so group them under a new tag my $tag = $nextTag++; say "Group both under new tag: $tag"; $gTags{ $g1 } = $gTags{ $g2 } = $tag; push @{ $groups{ $tag }{ spcs } }, @spcs; push @{ $groups{ $tag }{ genes } }, $g1, $g2; } } pp \%groups; __DATA__ Gene1,Gene2,spc1,spc2 Gene3,Gene1,spc1,spc2,spc4 Gene4,Gene1,spc1,spc2,spc5,spc3,spc1 Gene2,Gene3,spc1,spc2 Gene2,Gene4,spc2,spc3 Gene3,Gene4,spc1,spc2 GeneA,GeneB,spc4,spc5 GeneB,GeneC,spc1,spc2 GeneC,GeneD,spc1,spc2 GeneD,GeneE,spc4,spc2 GeneE,GeneF,spc3,spc1 GeneX,GeneY,spc6,spc8 GeneX,GeneP,spc6,spc7
In reply to Re: Clustering with Perl
by BrowserUk
in thread Clustering with Perl
by nerve
| For: | Use: | ||
| & | & | ||
| < | < | ||
| > | > | ||
| [ | [ | ||
| ] | ] |