#!/usr/bin/perl use warnings; use strict; # # %adjacent - is a hash of arrays. # each key contains a cluster identified by the key (root). # %sequences - is a hash of hashes. # The first key is identical to a key (root) in %adjacent. # The second key ids each "spcs" in the cluster. # The value is the count of "spc". # my %adjacent; my %sequences; while () { chomp; next unless $_ =~ /\w+/; my @component = split /\,/, $_; my @genes; my @spcs; for my $element (@component) { $element =~ s/\s+$//; if ( $element =~ /^Gene/i ) { push @genes, $element; } else { push @spcs, $element; } } if ( my $parent = rooted( \@genes ) ) { for my $gene (@genes) { push @{$adjacent{$parent}}, $gene unless ( has_root($gene)); } for my $species (@spcs) { $sequences{$parent}{$species}++; } } else { my $root = shift @genes; @{$adjacent{$root}} = (); for my $gene ( @genes ) { push @{$adjacent{$root}}, $gene unless ( has_root($gene)); } for my $species (@spcs) { $sequences{$parent}{$species}++; } } } print "*----------------------------------------------*\n"; for my $inner ( sort keys %adjacent ) { print "$inner"; for my $outer ( sort @{$adjacent{$inner}} ) { print ",$outer"; } for (sort keys %{$sequences{$inner}}) { printf ",%s=%d", $_, $sequences{$inner}{$_}; } print "\n"; } sub has_root { my $candidate = shift; return 0 if ( keys %adjacent < 1); for my $inner ( sort keys %adjacent ) { if ( $candidate eq $inner ) { return $inner; } for my $outer ( sort @{$adjacent{$inner}} ) { if ( $candidate eq $outer ) { return $inner; } } } return 0; } sub rooted { my $aref = shift; for (@$aref) { my $parent = has_root( $_ ); return $parent if $parent; } return 0; } __DATA__ Gene1,Gene2,spc1,spc2 Gene3,Gene1,spc1,spc2,spc4 Gene4,Gene1,spc1,spc2,spc5,spc3,spc1 Gene2,Gene3,spc1,spc2 Gene2,Gene4,spc2,spc3 Gene3,Gene4,spc1,spc2 GeneA,GeneB,spc4,spc5 GeneB,GeneC,spc1,spc2 GeneC,GeneD,spc1,spc2 GeneD,GeneE,spc4,spc2 GeneE,GeneF,spc3,spc1 GeneX,GeneY,spc6,spc8 GeneX,GeneY,GeneP,spc1,spc2 GeneY,spc3,spc4 #### C:\Code>perl adjacent.pl *----------------------------------------------* Gene1,Gene2,Gene3,Gene4,spc1=5,spc2=5,spc3=2,spc4=1,spc5=1 GeneA,GeneB,GeneC,GeneD,GeneE,GeneF,spc1=3,spc2=3,spc3=1,spc4=1 GeneX,GeneP,GeneY,spc1=1,spc2=1,spc3=1,spc4=1