# Always use strictures! use strict; use warnings; # Use nice names for variables. There is no such thing as saving time # using short variable names! However, don't use excessively long # names either. Naming things is an important part of programming! my %idLookup; my %firstLines; while () { chomp; # not chop! Remove line end from $_ next unless length; # Skip blank lines my $pair = $_; # Pull out the id strings my ($first, $second) = map {s/.*\.//; $_} split ' '; # idLookup is a hash of arrays. Push one id onto the array # accessed using the other id as the hash key push @{$idLookup{$first}}, [$second, $pair]; push @{$idLookup{$second}}, [$first, $pair]; # Build a hash keyed by id containing line numbers ($. is current line number $firstLines{$first} = $. unless exists $firstLines{$first}; $firstLines{$second} = $. + 0.5 unless exists $firstLines{$second}; } # Build a list of ids ordered by line number my @lineOrder = sort {$firstLines{$a} <=> $firstLines{$b}} keys %firstLines; my $clusterNum; # Generate clusters in order of first occurrence of ids for my $id (@lineOrder) { # next if already allocated id to a cluster next unless exists $firstLines{$id}; my %hits = ($id => undef); # hash of ids in current cluster to process my @cluster; while (keys %hits) { my $hit = (keys %hits)[0]; # Get an id to process my $list = $idLookup{$hit}; # Get list of other ids current one is matched with delete $hits{$hit}; # Remove current id from the hit list # next if already processed the id next unless exists $firstLines{$hit}; delete $firstLines{$hit}; # Remove current id to indicate it's processed for my $entry (@$list) { # Add matched ids to cluster my ($entryId, $pair) = @$entry; # next (don't add) if processed already next unless exists $firstLines{$entryId}; push @cluster, $pair; # Add the original line to the cluster $hits{$entryId} = 1; # Add the matched id as a hit } } # print the cluster print "cluster", ++$clusterNum, "\n"; # Header line print join "\n", @cluster, "\n"; # Original lines }