nerve has asked for the wisdom of the Perl Monks concerning the following question:

Hi there fellow mongers, I am working on a clustering process. I have to parse a file on basis if relativeness. So basically in terms of a example what I want to do is

Input file :example.txt

Gene1,Gene2,spc1,spc2
Gene3,Gene1,spc1,spc2,spc4
Gene4,Gene1,spc1,spc2,spc5,spc3,spc1
Gene2,Gene3,spc1,spc2
Gene2,Gene4,spc2,spc3
Gene3,Gene4,spc1,spc2
GeneA,GeneB,spc4,spc5
GeneB,GeneC,spc1,spc2
GeneC,GeneD,spc1,spc2
GeneD,GeneE,spc4,spc2
GeneE,GeneF,spc3,spc1
GeneX,GeneY,spc6,spc8
GeneX,GeneP,spc6,spc7

My desired Output is
Gene1,Gene2,Gene3,Gene4,spc1,spc2,spc1,spc2,spc4,spc1,spc2,spc5,spc1,spc2,spc2,spc3,spc1,spc2
GeneA,GeneB,GeneC,GeneD,GeneE,GeneF,spc4,spc5,spc1,spc2,spc1,spc2,spc4,spc2,spc3,spc1
GeneX,GeneY,GeneP,,spc6,spc8,spc6,spc7


Currently I am working only on the first half problem .All I am trying to do is get the Gene"X" to cluster.



#!/usr/bin/perl use strict; use warnings; my $file = $ARGV[0]; my %GFs = get_gene_families($file); foreach my $key (keys(%GFs)){ print $key.$GFs{$key}."\n"; } exit; sub get_gene_families{ my $fileName = $_[0]; my %hash = (); open(IFILE, "$fileName") or die "Couldn't open: $fileName\n"; while(my $line = <IFILE>){ my @genes = split(/\,/, $line); my $new = 1; foreach my $key (keys(%hash)){ my @split = split(/\,/, $hash{$key}); push(@split, $key); if(contains($genes[0], \@split) && contains($genes[1], \@split)){ $new = 0; } if(contains($genes[0], \@split) && !contains($genes[1], \@split)){ $hash{$key} .= ",".$genes[1]; $new = 0; } if(!contains($genes[0], \@split) && contains($genes[1], \@split)){ $hash{$key} .= ",".$genes[0]; $new = 0; } } if($new){ $hash{$genes[0]} .= ",".$genes[1]; } } close IFILE; return %hash; } sub contains{ my $target = $_[0]; my @array = @{$_[1]}; foreach my $element (@array){ if($element eq $target){ return 1; } else } return 0; }
But the output I am getting is

Gene1,Gene2,Gene3,Gene4
GeneB,GeneC
Gene2,Gene3,Gene4
GeneE,GeneF
GeneD,GeneE

,
GeneC,GeneD
GeneA,GeneB
Gene3,Gene4
GeneX,GeneY,GeneP



CAN ANY BODY PLEASE HELP I have been stuck with this problem for a over two weeks now. And I have not yet managed to deal with the first half of the problem.

Replies are listed 'Best First'.
Re: Clustering with Perl
by BrowserUk (Patriarch) on Jun 24, 2009 at 14:25 UTC

    Does this look right? (You may need to sort or uniq the gene names).

    C:\test>774383.pl Gene1:Gene2 tags:: Group both under new tag: A Gene3:Gene1 tags::A Group first with second under A Gene4:Gene1 tags::A Group first with second under A Gene2:Gene3 tags:A:A Both grouped; add spcs Gene2:Gene4 tags:A:A Both grouped; add spcs Gene3:Gene4 tags:A:A Both grouped; add spcs GeneA:GeneB tags:: Group both under new tag: B GeneB:GeneC tags:B: Group second with first under B GeneC:GeneD tags:B: Group second with first under B GeneD:GeneE tags:B: Group second with first under B GeneE:GeneF tags:B: Group second with first under B GeneX:GeneY tags:: Group both under new tag: C GeneX:GeneP tags:C: Group second with first under C { A => { genes => ["Gene1", "Gene2", "Gene3", "Gene4"], spcs => [ "spc1", "spc2", "spc1", "spc2", "spc4", "spc1", "spc2", " +spc5", "spc3", "spc1", "spc1", "spc2", "spc2", "spc3", "spc1", " +spc2", ], }, B => { genes => ["GeneA", "GeneB", "GeneC", "GeneD", "GeneE", "GeneF +"], spcs => [ "spc4", "spc5", "spc1", "spc2", "spc1", "spc2", "spc4", "spc2", "spc3", "spc1", ], }, C => { genes => ["GeneX", "GeneY", "GeneP"], spcs => ["spc6", "spc8", "spc6", "spc7"], }, }

    The code:

    #! perl -s use 5.010; use strict; use Data::Dump qw[ pp ]; =comment My desired Output is Gene1,Gene2,Gene3,Gene4,spc1,spc2,spc1,spc2,spc4,spc1,spc2,spc5,spc1,s +pc2,spc2,spc3,spc1,spc2 GeneA,GeneB,GeneC,GeneD,GeneE,GeneF,spc4,spc5,spc1,spc2,spc1,spc2,spc4 +,spc2,spc3,spc1 GeneX,GeneY,GeneP,,spc6,spc8,spc6,spc7 =cut my( $nextTag, %gTags, %groups ) = 'A'; while( <DATA> ) { chomp; my( $g1, $g2, @spcs ) = split ','; say "$g1:$g2 tags:$gTags{ $g1 }:$gTags{$g2 }"; <>; if( exists $gTags{ $g1 } and exists $gTags{ $g2 } ){ if( $gTags{ $g1 } eq $gTags{ $g2 } ) { say "Both grouped; add spcs"; ## Just add new spcs to existing push @{ $groups{ $gTags{ $g1 } }{ spcs } }, @spcs; } else { ## merge groups say "merge $gTags{ $g1 }:$gTags{$g2 }"; ## Add the current info to the first gene group. push @{ $groups{ $gTags{ $g1 } }{ spcs } }, @spcs; push @{ $groups{ $gTags{ $g1 } }{ genes } }, $g2; ## Move the info from the seconds gene group to the first push @{ $groups{ $gTags{ $g1 } }{ spcs } }, @{ $groups{ $gTags{ $g2 } }{ spcs } }; push @{ $groups{ $gTags{ $g1 } }{ genes } }, @{ $groups{ $gTags{ $g2 } }{ genes } }; ## delete the second gene group $groups{ $gTags{ $g2 } } = undef; ## And re-tag it to be the first $gTags{ $g2 } = $gTags{ $g1 }; } } elsif( exists $gTags{ $g1 } and not exists $gTags{ $g2 } ) { say "Group second with first under $gTags{ $g1 }"; $gTags{ $g2 } = $gTags{ $g1 }; push @{ $groups{ $gTags{ $g1 } }{ spcs } }, @spcs; push @{ $groups{ $gTags{ $g1 } }{ genes } }, $g2; } elsif( exists $gTags{ $g2 } and not exists $gTags{ $g1 } ) { say "Group first with second under $gTags{ $g2 }"; $gTags{ $g1 } = $gTags{ $g2 }; push @{ $groups{ $gTags{ $g2 } }{ spcs } }, @spcs; push @{ $groups{ $gTags{ $g2 } }{ genes } }, $g1; } else { ## neither yet exist so group them under a new tag my $tag = $nextTag++; say "Group both under new tag: $tag"; $gTags{ $g1 } = $gTags{ $g2 } = $tag; push @{ $groups{ $tag }{ spcs } }, @spcs; push @{ $groups{ $tag }{ genes } }, $g1, $g2; } } pp \%groups; __DATA__ Gene1,Gene2,spc1,spc2 Gene3,Gene1,spc1,spc2,spc4 Gene4,Gene1,spc1,spc2,spc5,spc3,spc1 Gene2,Gene3,spc1,spc2 Gene2,Gene4,spc2,spc3 Gene3,Gene4,spc1,spc2 GeneA,GeneB,spc4,spc5 GeneB,GeneC,spc1,spc2 GeneC,GeneD,spc1,spc2 GeneD,GeneE,spc4,spc2 GeneE,GeneF,spc3,spc1 GeneX,GeneY,spc6,spc8 GeneX,GeneP,spc6,spc7

    Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error.
    "Science is about questioning the status quo. Questioning authority".
    In the absence of evidence, opinion is indistinguishable from prejudice.
      What's the reason behind this stuff?
      #! perl -s use 5.010; use strict;
      "-s" adds some command-line parsing you're not using, and "use 5.010" breaks backward compatibility and (maybe?) enables some features you're not using. IIRC, the standard ritual chant around these parts is
      #!/usr/bin/env perl use strict; use warnings; use diagnostics;
      The particularly devout may use
      #!/usr/bin/env perl use Modern::Perl; use Moose; use diagnostics;
      The purpose of the repetition is to get people to write things without understanding them, so making up your own chant is confusing and counterproductive.
        What's the reason behind this stuff? ... The purpose of the repetition is to get people to write things without understanding them

        Because that's what I use on my system, because I DO UNDERSTAND what each of them does, and I use all of them frequently enough that I consider them standard.

        And because I don't believe in trying to brainwash the world with "ritual chants" or cargo cult code.


        Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error.
        "Science is about questioning the status quo. Questioning authority".
        In the absence of evidence, opinion is indistinguishable from prejudice.
Re: Clustering with Perl
by Transient (Hermit) on Jun 24, 2009 at 15:38 UTC
    This is what I used. More of an older style I'd imagine, and probably could be cleaned up a bit, but it works (I think!):
    #!/usr/bin/perl use strict; use warnings; use Data::Dumper; # read in data my $g_arr = [ ]; my $value_arr = [ ]; while ( <DATA> ) { chomp; # Assumes 2 genes per line in the first and second position! my @line = split /,/, $_, 3; $line[2] =~ s/\s*$//; my $cluster_idx = 0; my @found = ( -1, -1 ); foreach my $cluster ( @$g_arr ) { foreach my $poss_g_match ( @$cluster ) { if ( $line[0] eq $poss_g_match ) { $found[0] = 1; } if ( $line[1] eq $poss_g_match ) { $found[1] = 1; } } last if ( $found[0] != -1 || $found[1] != -1 ); $cluster_idx++; } if ( $found[0] == -1 ) { push @{$g_arr->[$cluster_idx]}, $line[0]; } if ( $found[1] == -1 ) { push @{$g_arr->[$cluster_idx]}, $line[1]; } push @{$value_arr->[$cluster_idx]}, $line[2]; } my $traversal_idx = 0; foreach my $cluster ( @$g_arr ) { print join( ',', @$cluster) . " : " . join( ',', @{$value_arr->[$tra +versal_idx++]} ), "\n"; } __DATA__ Gene1,Gene2,spc1,spc2 Gene3,Gene1,spc1,spc2,spc4 Gene4,Gene1,spc1,spc2,spc5,spc3,spc1 Gene2,Gene3,spc1,spc2 Gene2,Gene4,spc2,spc3 Gene3,Gene4,spc1,spc2 GeneA,GeneB,spc4,spc5 GeneB,GeneC,spc1,spc2 GeneC,GeneD,spc1,spc2 GeneD,GeneE,spc4,spc2 GeneE,GeneF,spc3,spc1 GeneX,GeneY,spc6,spc8 GeneX,GeneP,spc6,spc7 GeneUnknown.,GeneUnknown.,spc1,spc2
    Outputs
    Gene1,Gene2,Gene3,Gene4 : spc1,spc2,spc1,spc2,spc4,spc1,spc2,spc5,spc3 +,spc1,spc1,spc2,spc2,spc3,spc1,spc2 GeneA,GeneB,GeneC,GeneD,GeneE,GeneF : spc4,spc5,spc1,spc2,spc1,spc2,sp +c4,spc2,spc3,spc1 GeneX,GeneY,GeneP : spc6,spc8,spc6,spc7 GeneUnknown.,GeneUnknown. : spc1,spc2
    Update: Added comment
      Hi there,

      Your code athough it has a bug or a error on my side providing incomplete details.

      My sample input was
      Gene1,Gene2,spc1,spc2
      Gene3,Gene1,spc1,spc2,spc4
      Gene4,Gene1,spc1,spc2,spc5,spc3,spc1
      etc

      And I got the correct results.

      But the sample can contains entries like the below the number of "gene" is variable can be less or be more

      GeneX,GeneY,GeneP,spc1,spc2
      GeneY,spc3,spc4

      The desired result would be
      GeneX,GeneY,GeneP,spc1,spc2,spc3,spc4


      But the results which the script gives is
      GeneX,GeneY,spc3,GeneP,spc1,spc2,spc4
        That's correct - my code assumes two genes per line in the first and second position. I had actually included a comment in my original attempt at solving the problem, but somehow left it out in my post =)

        Update: Can it be assumed then, that a gene starts with "Gene" and the values start with "spc"?
Re: Clustering with Perl
by dwm042 (Priest) on Jun 24, 2009 at 16:15 UTC
    This code will cluster your "genes".. I do not know how far it goes to allow you to solve the second part of your problem.

    #!/usr/bin/perl use warnings; use strict; my %adjacent; while (<DATA>) { my @component = split /\,/, $_, 3; my $parent; if ( defined($adjacent{$component[0]})) { $adjacent{$component[0]}{$component[1]} = $component[2]; } elsif ( defined($adjacent{$component[1]})) { $adjacent{$component[1]}{$component[0]} = $component[2]; } elsif ( $parent = has_parent($component[0]) ) { $adjacent{$parent}{$component[1]} = $component[2]; } elsif ( $parent = has_parent($component[1]) ) { $adjacent{$parent}{$component[0]} = $component[2]; } else { $adjacent{$component[0]}{$component[1]} = $component[2]; } } print "\n\n*----------------------------------------------*\n"; for my $inner ( sort keys %adjacent ) { print "$inner "; for my $outer ( sort keys %{$adjacent{$inner}} ) { print ", $outer"; } print "\n"; } sub has_parent { my $candidate = shift; for my $inner ( sort keys %adjacent ) { for my $outer ( sort keys %{$adjacent{$inner}} ) { if ( $candidate eq $outer ) { return $inner; } } } return 0; } __DATA__ Gene1,Gene2,spc1,spc2 Gene3,Gene1,spc1,spc2,spc4 Gene4,Gene1,spc1,spc2,spc5,spc3,spc1 Gene2,Gene3,spc1,spc2 Gene2,Gene4,spc2,spc3 Gene3,Gene4,spc1,spc2 GeneA,GeneB,spc4,spc5 GeneB,GeneC,spc1,spc2 GeneC,GeneD,spc1,spc2 GeneD,GeneE,spc4,spc2 GeneE,GeneF,spc3,spc1 GeneX,GeneY,spc6,spc8 GeneX,GeneP,spc6,spc7
    And the results are:

    C:\Code>perl adjacent.pl *----------------------------------------------* Gene1 , Gene2, Gene3, Gene4 GeneA , GeneB, GeneC, GeneD, GeneE, GeneF GeneX , GeneP, GeneY
    Update: newer code solves it completely.

    #!/usr/bin/perl use warnings; use strict; # # %adjacent - each key contains a cluster identified by the key (root +). # %sequences - contains sequences of "spcs" for each cluster. # my %adjacent; my %sequences; while (<DATA>) { chomp; my @component = split /\,/, $_, 3; my $parent; $component[2] =~ s/\s+$//; if ( $parent = has_root($component[0]) ) { push @{$adjacent{$parent}}, $component[1] unless ( has_root($component[1])); push @{$sequences{$parent}}, $component[2]; } elsif ( $parent = has_root($component[1]) ) { push @{$adjacent{$parent}}, $component[0] unless ( has_root($component[0])); push @{$sequences{$parent}}, $component[2]; } else { push @{$adjacent{$component[0]}}, $component[1]; push @{$sequences{$component[0]}}, $component[2]; } } print "*----------------------------------------------*\n"; for my $inner ( sort keys %adjacent ) { print "$inner"; for my $outer ( sort @{$adjacent{$inner}} ) { print ",$outer"; } for (@{$sequences{$inner}}) { print ",$_"; } print "\n"; } sub has_root { my $candidate = shift; return 0 if ( keys %adjacent < 1); for my $inner ( sort keys %adjacent ) { if ( $candidate eq $inner ) { return $inner; } for my $outer ( sort @{$adjacent{$inner}} ) { if ( $candidate eq $outer ) { return $inner; } } } return 0; } __DATA__ Gene1,Gene2,spc1,spc2 Gene3,Gene1,spc1,spc2,spc4 Gene4,Gene1,spc1,spc2,spc5,spc3,spc1 Gene2,Gene3,spc1,spc2 Gene2,Gene4,spc2,spc3 Gene3,Gene4,spc1,spc2 GeneA,GeneB,spc4,spc5 GeneB,GeneC,spc1,spc2 GeneC,GeneD,spc1,spc2 GeneD,GeneE,spc4,spc2 GeneE,GeneF,spc3,spc1 GeneX,GeneY,spc6,spc8 GeneX,GeneP,spc6,spc7

      The gene number can basically vary.
      you can have <2 genes
      Hi there,
      Your code worked like a charm although it has a bug or a error on my side providing incomplete details.
      My sample input was

      Gene1,Gene2,spc1,spc2
      Gene3,Gene1,spc1,spc2,spc4
      Gene4,Gene1,spc1,spc2,spc5,spc3,spc1
      etc

      And I got the correct results.


      But the sample can contains entries like the bellow

      GeneX,GeneY,GeneP,spc1,spc2
      GeneY,spc3,spc4

      The desired result would be
      GeneX,GeneY,GeneP,spc1,spc2,spc3,spc4


      But the results which the script gives is
      GeneX,GeneY,spc3,GeneP,spc1,spc2,spc4

      *************************************************************

      Also how can i modify the cureent code to give me aoutput like this Sample input

      GeneX,GeneY,GeneP,spc1,spc2
      GeneY,spc3,spc4
      GeneZ,spc3,spc4

      Desired Result
      GeneX,GeneY,GeneP,GeneZ,spc1=1,spc2=1,spc3=2,spc4=2

      I want to count the number of \/ "spc" for a given "spc" /\ so that i can later confirm the output. because I have a 25 mb txt file to process which will probably take hour or more.
        nerve,

        Looking at the things you want done, I don't see how code can tell that

        GeneX,GeneY,GeneP,spc1,spc2 GeneY,spc3,spc4 GeneZ,spc3,spc4
        Should lead to:
        Desired Result GeneX,GeneY,GeneP,GeneZ,spc1=1,spc2=1,spc3=2,spc4=2
        What in the data set "says" that GeneZ is to be clustered with X, Y, and P? Nothing that I can determine. The counts (e.g. spc1=1, etc) would be easy to do and the code I presented creates a stack of spc results. Just pop results off the stack, and count instances.

        David.
        This is as close as I can come, given that the description of how to process data is inconsistent. The trick here is to partition the split data set into an array of genes and an array of spcs and work with that. I'm sure others here can make this prettier code than I.

        If you want it spaced nicely, please run a copy of perltidy on this code.

        #!/usr/bin/perl use warnings; use strict; # # %adjacent - is a hash of arrays. # each key contains a cluster identified by the key (root +). # %sequences - is a hash of hashes. # The first key is identical to a key (root) in %adjacent +. # The second key ids each "spcs" in the cluster. # The value is the count of "spc". # my %adjacent; my %sequences; while (<DATA>) { chomp; next unless $_ =~ /\w+/; my @component = split /\,/, $_; my @genes; my @spcs; for my $element (@component) { $element =~ s/\s+$//; if ( $element =~ /^Gene/i ) { push @genes, $element; } else { push @spcs, $element; } } if ( my $parent = rooted( \@genes ) ) { for my $gene (@genes) { push @{$adjacent{$parent}}, $gene unless ( has_root($gene)); } for my $species (@spcs) { $sequences{$parent}{$species}++; } } else { my $root = shift @genes; @{$adjacent{$root}} = (); for my $gene ( @genes ) { push @{$adjacent{$root}}, $gene unless ( has_root($gene)); } for my $species (@spcs) { $sequences{$parent}{$species}++; } } } print "*----------------------------------------------*\n"; for my $inner ( sort keys %adjacent ) { print "$inner"; for my $outer ( sort @{$adjacent{$inner}} ) { print ",$outer"; } for (sort keys %{$sequences{$inner}}) { printf ",%s=%d", $_, $sequences{$inner}{$_}; } print "\n"; } sub has_root { my $candidate = shift; return 0 if ( keys %adjacent < 1); for my $inner ( sort keys %adjacent ) { if ( $candidate eq $inner ) { return $inner; } for my $outer ( sort @{$adjacent{$inner}} ) { if ( $candidate eq $outer ) { return $inner; } } } return 0; } sub rooted { my $aref = shift; for (@$aref) { my $parent = has_root( $_ ); return $parent if $parent; } return 0; } __DATA__ Gene1,Gene2,spc1,spc2 Gene3,Gene1,spc1,spc2,spc4 Gene4,Gene1,spc1,spc2,spc5,spc3,spc1 Gene2,Gene3,spc1,spc2 Gene2,Gene4,spc2,spc3 Gene3,Gene4,spc1,spc2 GeneA,GeneB,spc4,spc5 GeneB,GeneC,spc1,spc2 GeneC,GeneD,spc1,spc2 GeneD,GeneE,spc4,spc2 GeneE,GeneF,spc3,spc1 GeneX,GeneY,spc6,spc8 GeneX,GeneY,GeneP,spc1,spc2 GeneY,spc3,spc4
        With the results:
        C:\Code>perl adjacent.pl *----------------------------------------------* Gene1,Gene2,Gene3,Gene4,spc1=5,spc2=5,spc3=2,spc4=1,spc5=1 GeneA,GeneB,GeneC,GeneD,GeneE,GeneF,spc1=3,spc2=3,spc3=1,spc4=1 GeneX,GeneP,GeneY,spc1=1,spc2=1,spc3=1,spc4=1