Re: Clustering with Perl
by BrowserUk (Patriarch) on Jun 24, 2009 at 14:25 UTC
|
C:\test>774383.pl
Gene1:Gene2 tags::
Group both under new tag: A
Gene3:Gene1 tags::A
Group first with second under A
Gene4:Gene1 tags::A
Group first with second under A
Gene2:Gene3 tags:A:A
Both grouped; add spcs
Gene2:Gene4 tags:A:A
Both grouped; add spcs
Gene3:Gene4 tags:A:A
Both grouped; add spcs
GeneA:GeneB tags::
Group both under new tag: B
GeneB:GeneC tags:B:
Group second with first under B
GeneC:GeneD tags:B:
Group second with first under B
GeneD:GeneE tags:B:
Group second with first under B
GeneE:GeneF tags:B:
Group second with first under B
GeneX:GeneY tags::
Group both under new tag: C
GeneX:GeneP tags:C:
Group second with first under C
{
A => {
genes => ["Gene1", "Gene2", "Gene3", "Gene4"],
spcs => [
"spc1", "spc2", "spc1", "spc2", "spc4", "spc1", "spc2", "
+spc5",
"spc3", "spc1", "spc1", "spc2", "spc2", "spc3", "spc1", "
+spc2",
],
},
B => {
genes => ["GeneA", "GeneB", "GeneC", "GeneD", "GeneE", "GeneF
+"],
spcs => [
"spc4", "spc5", "spc1", "spc2", "spc1",
"spc2", "spc4", "spc2", "spc3", "spc1",
],
},
C => {
genes => ["GeneX", "GeneY", "GeneP"],
spcs => ["spc6", "spc8", "spc6", "spc7"],
},
}
The code:
#! perl -s
use 5.010;
use strict;
use Data::Dump qw[ pp ];
=comment
My desired Output is
Gene1,Gene2,Gene3,Gene4,spc1,spc2,spc1,spc2,spc4,spc1,spc2,spc5,spc1,s
+pc2,spc2,spc3,spc1,spc2
GeneA,GeneB,GeneC,GeneD,GeneE,GeneF,spc4,spc5,spc1,spc2,spc1,spc2,spc4
+,spc2,spc3,spc1
GeneX,GeneY,GeneP,,spc6,spc8,spc6,spc7
=cut
my( $nextTag, %gTags, %groups ) = 'A';
while( <DATA> ) {
chomp;
my( $g1, $g2, @spcs ) = split ',';
say "$g1:$g2 tags:$gTags{ $g1 }:$gTags{$g2 }"; <>;
if( exists $gTags{ $g1 } and exists $gTags{ $g2 } ){
if( $gTags{ $g1 } eq $gTags{ $g2 } ) {
say "Both grouped; add spcs";
## Just add new spcs to existing
push @{ $groups{ $gTags{ $g1 } }{ spcs } }, @spcs;
}
else { ## merge groups
say "merge $gTags{ $g1 }:$gTags{$g2 }";
## Add the current info to the first gene group.
push @{ $groups{ $gTags{ $g1 } }{ spcs } }, @spcs;
push @{ $groups{ $gTags{ $g1 } }{ genes } }, $g2;
## Move the info from the seconds gene group to the first
push @{ $groups{ $gTags{ $g1 } }{ spcs } },
@{ $groups{ $gTags{ $g2 } }{ spcs } };
push @{ $groups{ $gTags{ $g1 } }{ genes } },
@{ $groups{ $gTags{ $g2 } }{ genes } };
## delete the second gene group
$groups{ $gTags{ $g2 } } = undef;
## And re-tag it to be the first
$gTags{ $g2 } = $gTags{ $g1 };
}
}
elsif( exists $gTags{ $g1 } and not exists $gTags{ $g2 } ) {
say "Group second with first under $gTags{ $g1 }";
$gTags{ $g2 } = $gTags{ $g1 };
push @{ $groups{ $gTags{ $g1 } }{ spcs } }, @spcs;
push @{ $groups{ $gTags{ $g1 } }{ genes } }, $g2;
}
elsif( exists $gTags{ $g2 } and not exists $gTags{ $g1 } ) {
say "Group first with second under $gTags{ $g2 }";
$gTags{ $g1 } = $gTags{ $g2 };
push @{ $groups{ $gTags{ $g2 } }{ spcs } }, @spcs;
push @{ $groups{ $gTags{ $g2 } }{ genes } }, $g1;
}
else { ## neither yet exist so group them under a new tag
my $tag = $nextTag++;
say "Group both under new tag: $tag";
$gTags{ $g1 } = $gTags{ $g2 } = $tag;
push @{ $groups{ $tag }{ spcs } }, @spcs;
push @{ $groups{ $tag }{ genes } }, $g1, $g2;
}
}
pp \%groups;
__DATA__
Gene1,Gene2,spc1,spc2
Gene3,Gene1,spc1,spc2,spc4
Gene4,Gene1,spc1,spc2,spc5,spc3,spc1
Gene2,Gene3,spc1,spc2
Gene2,Gene4,spc2,spc3
Gene3,Gene4,spc1,spc2
GeneA,GeneB,spc4,spc5
GeneB,GeneC,spc1,spc2
GeneC,GeneD,spc1,spc2
GeneD,GeneE,spc4,spc2
GeneE,GeneF,spc3,spc1
GeneX,GeneY,spc6,spc8
GeneX,GeneP,spc6,spc7
-
-
Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error.
"Science is about questioning the status quo. Questioning authority".
In the absence of evidence, opinion is indistinguishable from prejudice.
| [reply] [d/l] [select] |
|
|
What's the reason behind this stuff?
#! perl -s
use 5.010;
use strict;
"-s" adds some command-line parsing you're not using, and "use 5.010" breaks backward compatibility and (maybe?) enables some features you're not using. IIRC, the standard ritual chant around these parts is
#!/usr/bin/env perl
use strict;
use warnings;
use diagnostics;
The particularly devout may use
#!/usr/bin/env perl
use Modern::Perl;
use Moose;
use diagnostics;
The purpose of the repetition is to get people to write things without understanding them, so making up your own chant is confusing and counterproductive. | [reply] [d/l] [select] |
|
|
What's the reason behind this stuff? ... The purpose of the repetition is to get people to write things without understanding them
Because that's what I use on my system, because I DO UNDERSTAND what each of them does, and I use all of them frequently enough that I consider them standard.
And because I don't believe in trying to brainwash the world with "ritual chants" or cargo cult code.
Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error.
"Science is about questioning the status quo. Questioning authority".
In the absence of evidence, opinion is indistinguishable from prejudice.
| [reply] |
Re: Clustering with Perl
by Transient (Hermit) on Jun 24, 2009 at 15:38 UTC
|
This is what I used. More of an older style I'd imagine, and probably could be cleaned up a bit, but it works (I think!):
#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;
# read in data
my $g_arr = [ ];
my $value_arr = [ ];
while ( <DATA> ) {
chomp;
# Assumes 2 genes per line in the first and second position!
my @line = split /,/, $_, 3;
$line[2] =~ s/\s*$//;
my $cluster_idx = 0;
my @found = ( -1, -1 );
foreach my $cluster ( @$g_arr ) {
foreach my $poss_g_match ( @$cluster ) {
if ( $line[0] eq $poss_g_match ) {
$found[0] = 1;
}
if ( $line[1] eq $poss_g_match ) {
$found[1] = 1;
}
}
last if ( $found[0] != -1 || $found[1] != -1 );
$cluster_idx++;
}
if ( $found[0] == -1 ) {
push @{$g_arr->[$cluster_idx]}, $line[0];
}
if ( $found[1] == -1 ) {
push @{$g_arr->[$cluster_idx]}, $line[1];
}
push @{$value_arr->[$cluster_idx]}, $line[2];
}
my $traversal_idx = 0;
foreach my $cluster ( @$g_arr ) {
print join( ',', @$cluster) . " : " . join( ',', @{$value_arr->[$tra
+versal_idx++]} ), "\n";
}
__DATA__
Gene1,Gene2,spc1,spc2
Gene3,Gene1,spc1,spc2,spc4
Gene4,Gene1,spc1,spc2,spc5,spc3,spc1
Gene2,Gene3,spc1,spc2
Gene2,Gene4,spc2,spc3
Gene3,Gene4,spc1,spc2
GeneA,GeneB,spc4,spc5
GeneB,GeneC,spc1,spc2
GeneC,GeneD,spc1,spc2
GeneD,GeneE,spc4,spc2
GeneE,GeneF,spc3,spc1
GeneX,GeneY,spc6,spc8
GeneX,GeneP,spc6,spc7
GeneUnknown.,GeneUnknown.,spc1,spc2
Outputs
Gene1,Gene2,Gene3,Gene4 : spc1,spc2,spc1,spc2,spc4,spc1,spc2,spc5,spc3
+,spc1,spc1,spc2,spc2,spc3,spc1,spc2
GeneA,GeneB,GeneC,GeneD,GeneE,GeneF : spc4,spc5,spc1,spc2,spc1,spc2,sp
+c4,spc2,spc3,spc1
GeneX,GeneY,GeneP : spc6,spc8,spc6,spc7
GeneUnknown.,GeneUnknown. : spc1,spc2
Update: Added comment
| [reply] [d/l] [select] |
|
|
Hi there,
Your code athough it has a bug or a error on my side providing incomplete details.
My sample input was
Gene1,Gene2,spc1,spc2
Gene3,Gene1,spc1,spc2,spc4
Gene4,Gene1,spc1,spc2,spc5,spc3,spc1
etc
And I got the correct results.
But the sample can contains entries like the below the number of "gene" is variable can be less or be more
GeneX,GeneY,GeneP,spc1,spc2
GeneY,spc3,spc4
The desired result would be
GeneX,GeneY,GeneP,spc1,spc2,spc3,spc4
But the results which the script gives is
GeneX,GeneY,spc3,GeneP,spc1,spc2,spc4
| [reply] |
|
|
That's correct - my code assumes two genes per line in the first and second position. I had actually included a comment in my original attempt at solving the problem, but somehow left it out in my post =)
Update: Can it be assumed then, that a gene starts with "Gene" and the values start with "spc"?
| [reply] |
Re: Clustering with Perl
by dwm042 (Priest) on Jun 24, 2009 at 16:15 UTC
|
This code will cluster your "genes".. I do not know how far it goes to allow you to solve the second part of your problem.
#!/usr/bin/perl
use warnings;
use strict;
my %adjacent;
while (<DATA>) {
my @component = split /\,/, $_, 3;
my $parent;
if ( defined($adjacent{$component[0]})) {
$adjacent{$component[0]}{$component[1]} = $component[2];
}
elsif ( defined($adjacent{$component[1]})) {
$adjacent{$component[1]}{$component[0]} = $component[2];
}
elsif ( $parent = has_parent($component[0]) ) {
$adjacent{$parent}{$component[1]} = $component[2];
}
elsif ( $parent = has_parent($component[1]) ) {
$adjacent{$parent}{$component[0]} = $component[2];
}
else {
$adjacent{$component[0]}{$component[1]} = $component[2];
}
}
print "\n\n*----------------------------------------------*\n";
for my $inner ( sort keys %adjacent ) {
print "$inner ";
for my $outer ( sort keys %{$adjacent{$inner}} ) {
print ", $outer";
}
print "\n";
}
sub has_parent {
my $candidate = shift;
for my $inner ( sort keys %adjacent ) {
for my $outer ( sort keys %{$adjacent{$inner}} ) {
if ( $candidate eq $outer ) {
return $inner;
}
}
}
return 0;
}
__DATA__
Gene1,Gene2,spc1,spc2
Gene3,Gene1,spc1,spc2,spc4
Gene4,Gene1,spc1,spc2,spc5,spc3,spc1
Gene2,Gene3,spc1,spc2
Gene2,Gene4,spc2,spc3
Gene3,Gene4,spc1,spc2
GeneA,GeneB,spc4,spc5
GeneB,GeneC,spc1,spc2
GeneC,GeneD,spc1,spc2
GeneD,GeneE,spc4,spc2
GeneE,GeneF,spc3,spc1
GeneX,GeneY,spc6,spc8
GeneX,GeneP,spc6,spc7
And the results are:
C:\Code>perl adjacent.pl
*----------------------------------------------*
Gene1 , Gene2, Gene3, Gene4
GeneA , GeneB, GeneC, GeneD, GeneE, GeneF
GeneX , GeneP, GeneY
Update: newer code solves it completely.
#!/usr/bin/perl
use warnings;
use strict;
#
# %adjacent - each key contains a cluster identified by the key (root
+).
# %sequences - contains sequences of "spcs" for each cluster.
#
my %adjacent;
my %sequences;
while (<DATA>) {
chomp;
my @component = split /\,/, $_, 3;
my $parent;
$component[2] =~ s/\s+$//;
if ( $parent = has_root($component[0]) ) {
push @{$adjacent{$parent}}, $component[1]
unless ( has_root($component[1]));
push @{$sequences{$parent}}, $component[2];
}
elsif ( $parent = has_root($component[1]) ) {
push @{$adjacent{$parent}}, $component[0]
unless ( has_root($component[0]));
push @{$sequences{$parent}}, $component[2];
}
else {
push @{$adjacent{$component[0]}}, $component[1];
push @{$sequences{$component[0]}}, $component[2];
}
}
print "*----------------------------------------------*\n";
for my $inner ( sort keys %adjacent ) {
print "$inner";
for my $outer ( sort @{$adjacent{$inner}} ) {
print ",$outer";
}
for (@{$sequences{$inner}}) {
print ",$_";
}
print "\n";
}
sub has_root {
my $candidate = shift;
return 0 if ( keys %adjacent < 1);
for my $inner ( sort keys %adjacent ) {
if ( $candidate eq $inner ) {
return $inner;
}
for my $outer ( sort @{$adjacent{$inner}} ) {
if ( $candidate eq $outer ) {
return $inner;
}
}
}
return 0;
}
__DATA__
Gene1,Gene2,spc1,spc2
Gene3,Gene1,spc1,spc2,spc4
Gene4,Gene1,spc1,spc2,spc5,spc3,spc1
Gene2,Gene3,spc1,spc2
Gene2,Gene4,spc2,spc3
Gene3,Gene4,spc1,spc2
GeneA,GeneB,spc4,spc5
GeneB,GeneC,spc1,spc2
GeneC,GeneD,spc1,spc2
GeneD,GeneE,spc4,spc2
GeneE,GeneF,spc3,spc1
GeneX,GeneY,spc6,spc8
GeneX,GeneP,spc6,spc7
| [reply] [d/l] [select] |
|
|
The gene number can basically vary.
you can have <2 genes
Hi there,
Your code worked like a charm although it has a bug or a error on my side providing incomplete details.
My sample input was
Gene1,Gene2,spc1,spc2
Gene3,Gene1,spc1,spc2,spc4
Gene4,Gene1,spc1,spc2,spc5,spc3,spc1
etc
And I got the correct results.
But the sample can contains entries like the bellow
GeneX,GeneY,GeneP,spc1,spc2
GeneY,spc3,spc4
The desired result would be
GeneX,GeneY,GeneP,spc1,spc2,spc3,spc4
But the results which the script gives is
GeneX,GeneY,spc3,GeneP,spc1,spc2,spc4
*************************************************************
Also how can i modify the cureent code to give me aoutput like this
Sample input
GeneX,GeneY,GeneP,spc1,spc2
GeneY,spc3,spc4
GeneZ,spc3,spc4
Desired Result
GeneX,GeneY,GeneP,GeneZ,spc1=1,spc2=1,spc3=2,spc4=2
I want to count the number of \/ "spc" for a given "spc" /\ so that i can later confirm the output. because I have a 25 mb txt file to process which will probably take hour or more.
| [reply] |
|
|
nerve,
Looking at the things you want done, I don't see how code can tell that
GeneX,GeneY,GeneP,spc1,spc2
GeneY,spc3,spc4
GeneZ,spc3,spc4
Should lead to:
Desired Result
GeneX,GeneY,GeneP,GeneZ,spc1=1,spc2=1,spc3=2,spc4=2
What in the data set "says" that GeneZ is to be clustered with X, Y, and P? Nothing that I can determine.
The counts (e.g. spc1=1, etc) would be easy to do and the code I presented creates a stack of spc results. Just pop results off the stack, and count instances.
David. | [reply] [d/l] [select] |
|
|
This is as close as I can come, given that the description of how to process data is inconsistent. The trick here is to partition the split data set into an array of genes and an array of spcs and work with that. I'm sure others here can make this prettier code than I.
If you want it spaced nicely, please run a copy of perltidy on this code.
#!/usr/bin/perl
use warnings;
use strict;
#
# %adjacent - is a hash of arrays.
# each key contains a cluster identified by the key (root
+).
# %sequences - is a hash of hashes.
# The first key is identical to a key (root) in %adjacent
+.
# The second key ids each "spcs" in the cluster.
# The value is the count of "spc".
#
my %adjacent;
my %sequences;
while (<DATA>) {
chomp;
next unless $_ =~ /\w+/;
my @component = split /\,/, $_;
my @genes;
my @spcs;
for my $element (@component) {
$element =~ s/\s+$//;
if ( $element =~ /^Gene/i ) {
push @genes, $element;
}
else {
push @spcs, $element;
}
}
if ( my $parent = rooted( \@genes ) ) {
for my $gene (@genes) {
push @{$adjacent{$parent}}, $gene
unless ( has_root($gene));
}
for my $species (@spcs) {
$sequences{$parent}{$species}++;
}
}
else {
my $root = shift @genes;
@{$adjacent{$root}} = ();
for my $gene ( @genes ) {
push @{$adjacent{$root}}, $gene
unless ( has_root($gene));
}
for my $species (@spcs) {
$sequences{$parent}{$species}++;
}
}
}
print "*----------------------------------------------*\n";
for my $inner ( sort keys %adjacent ) {
print "$inner";
for my $outer ( sort @{$adjacent{$inner}} ) {
print ",$outer";
}
for (sort keys %{$sequences{$inner}}) {
printf ",%s=%d", $_, $sequences{$inner}{$_};
}
print "\n";
}
sub has_root {
my $candidate = shift;
return 0 if ( keys %adjacent < 1);
for my $inner ( sort keys %adjacent ) {
if ( $candidate eq $inner ) {
return $inner;
}
for my $outer ( sort @{$adjacent{$inner}} ) {
if ( $candidate eq $outer ) {
return $inner;
}
}
}
return 0;
}
sub rooted {
my $aref = shift;
for (@$aref) {
my $parent = has_root( $_ );
return $parent if $parent;
}
return 0;
}
__DATA__
Gene1,Gene2,spc1,spc2
Gene3,Gene1,spc1,spc2,spc4
Gene4,Gene1,spc1,spc2,spc5,spc3,spc1
Gene2,Gene3,spc1,spc2
Gene2,Gene4,spc2,spc3
Gene3,Gene4,spc1,spc2
GeneA,GeneB,spc4,spc5
GeneB,GeneC,spc1,spc2
GeneC,GeneD,spc1,spc2
GeneD,GeneE,spc4,spc2
GeneE,GeneF,spc3,spc1
GeneX,GeneY,spc6,spc8
GeneX,GeneY,GeneP,spc1,spc2
GeneY,spc3,spc4
With the results:
C:\Code>perl adjacent.pl
*----------------------------------------------*
Gene1,Gene2,Gene3,Gene4,spc1=5,spc2=5,spc3=2,spc4=1,spc5=1
GeneA,GeneB,GeneC,GeneD,GeneE,GeneF,spc1=3,spc2=3,spc3=1,spc4=1
GeneX,GeneP,GeneY,spc1=1,spc2=1,spc3=1,spc4=1
| [reply] [d/l] [select] |