#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;

# Groups gene pairs into clusters of connected genes and prints each cluster
# together with the value strings collected from the records that built it.
#
# Input (read from the __DATA__ section below), one record per line:
#     GeneA,GeneB,value[,value...]
# The first two comma-separated fields are gene names; the remainder of the
# line is kept verbatim as a single value string.
#
# Output, one line per cluster:
#     gene,gene,... : values,values,...

my $g_arr     = [];    # $g_arr->[$i]     : gene names belonging to cluster $i
my $value_arr = [];    # $value_arr->[$i] : value strings recorded for cluster $i
my %cluster_of;        # gene name => index of the cluster that holds it

while ( my $line = <DATA> ) {
    add_record( $g_arr, $value_arr, \%cluster_of, $line );
}

for my $idx ( 0 .. $#{$g_arr} ) {
    print join( ',', @{ $g_arr->[$idx] } ), " : ",
        join( ',', @{ $value_arr->[$idx] } ), "\n";
}

# add_record( $genes, $values, $index_of, $line )
#
# Folds one input record into the cluster state.
#
#   $genes    - array ref of clusters, each an array ref of gene names
#   $values   - array ref parallel to $genes, each an array ref of value strings
#   $index_of - hash ref mapping gene name => index into $genes / $values
#   $line     - raw record text; a trailing newline or whitespace is tolerated
#
# Blank lines are ignored. Records without two non-empty gene fields are
# skipped with a warning. A record with genes but no value part still joins
# the genes; it simply contributes no value string. Returns nothing.
sub add_record {
    my ( $genes, $values, $index_of, $line ) = @_;

    # Strip the newline and any trailing whitespace (including a stray "\r").
    $line =~ s/\s+\z//;
    return if $line eq '';

    # Limit of 3 keeps everything after the second comma as one value string.
    my ( $gene_a, $gene_b, $value ) = split /,/, $line, 3;
    unless ( defined $gene_a
        && defined $gene_b
        && length $gene_a
        && length $gene_b )
    {
        warn "Skipping malformed record: $line\n";
        return;
    }

    my $idx_a = $index_of->{$gene_a};
    my $idx_b = $index_of->{$gene_b};
    my $target;

    if ( defined $idx_a && defined $idx_b ) {

        # Both genes are already known. If they live in different clusters
        # this record links them, so fold the later cluster into the earlier.
        $target = $idx_a < $idx_b ? $idx_a : $idx_b;
        if ( $idx_a != $idx_b ) {
            my $absorbed = $idx_a < $idx_b ? $idx_b : $idx_a;
            _merge_clusters( $genes, $values, $index_of, $target, $absorbed );
        }
    }
    elsif ( defined $idx_a ) {
        $target = $idx_a;
    }
    elsif ( defined $idx_b ) {
        $target = $idx_b;
    }
    else {

        # Neither gene has been seen before: open a new cluster.
        $target = scalar @{$genes};
        $genes->[$target]  = [];
        $values->[$target] = [];
    }

    # Register whichever genes are new. The exists check also prevents a
    # self-pair (same gene in both columns) from being stored twice.
    for my $gene ( $gene_a, $gene_b ) {
        next if exists $index_of->{$gene};
        push @{ $genes->[$target] }, $gene;
        $index_of->{$gene} = $target;
    }

    push @{ $values->[$target] }, $value if defined $value;
    return;
}

# _merge_clusters( $genes, $values, $index_of, $keep, $absorb )
#
# Appends cluster $absorb (genes and values) to cluster $keep, removes the
# absorbed slot and refreshes $index_of. Expects $keep < $absorb, so the kept
# cluster's index is unaffected by the removal.
sub _merge_clusters {
    my ( $genes, $values, $index_of, $keep, $absorb ) = @_;

    push @{ $genes->[$keep] },  @{ $genes->[$absorb] };
    push @{ $values->[$keep] }, @{ $values->[$absorb] };

    splice @{$genes},  $absorb, 1;
    splice @{$values}, $absorb, 1;

    # The kept cluster gained members, and every cluster after the removed
    # slot moved down by one, so their genes need fresh indices.
    for my $idx ( $keep, $absorb .. $#{$genes} ) {
        $index_of->{$_} = $idx for @{ $genes->[$idx] };
    }
    return;
}

__DATA__
Gene1,Gene2,spc1,spc2
Gene3,Gene1,spc1,spc2,spc4
Gene4,Gene1,spc1,spc2,spc5,spc3,spc1
Gene2,Gene3,spc1,spc2
Gene2,Gene4,spc2,spc3
Gene3,Gene4,spc1,spc2
GeneA,GeneB,spc4,spc5
GeneB,GeneC,spc1,spc2
GeneC,GeneD,spc1,spc2
GeneD,GeneE,spc4,spc2
GeneE,GeneF,spc3,spc1
GeneX,GeneY,spc6,spc8
GeneX,GeneP,spc6,spc7
GeneUnknown.,GeneUnknown.,spc1,spc2