in reply to clustering pairs
I found a somewhat different solution...
#!/usr/bin/perl use strict; use warnings; my %lookup_ids_by_line; my %lookup_lines_by_id; chomp (my @lines = <DATA>); foreach my $line (@lines) { my ($id1, $id2) = $line =~ m/\.(\S+)/g; push(@{$lookup_ids_by_line{$line}}, $id1, $id2); push(@{$lookup_lines_by_id{$id1}}, $line); push(@{$lookup_lines_by_id{$id2}}, $line); } my $cluster_number = 0; foreach my $line (@lines) { next unless(exists($lookup_ids_by_line{$line})); $cluster_number++; my @lines_in_cluster = ($line); my @ids_to_check = @{$lookup_ids_by_line{$line}}; delete($lookup_ids_by_line{$line}); while(@ids_to_check) { my $id = pop(@ids_to_check); next unless(exists($lookup_lines_by_id{$id})); foreach my $line (@{$lookup_lines_by_id{$id}}) { if(exists($lookup_ids_by_line{$line})) { push(@lines_in_cluster, $line); push(@ids_to_check, @{$lookup_ids_by_line{$line}}); delete($lookup_ids_by_line{$line}); } } delete($lookup_lines_by_id{$id}); } print "cluster number $cluster_number:\n\t" . join("\n\t",@lines_in_cluster) . "\n"; } __DATA__ ID5141.C1665 ID5141.C2448 ID5141.C1253 ID5144.C2039 ID5141.C1596 ID5144.C1956 ID5141.C1906 ID5144.C2149 ID5141.C1221 ID5144.C1956 ID5141.C2149 ID5141.C2386 ID5141.C2039 ID5142.C1221 ID5141.C5887 ID5141.C7685 ID5141.C1005 ID5142.C2808 ID5141.C1046 ID5141.C1596 ID5141.C2386 ID5141.C4990 ID5141.C7685 ID5141.C4888
Update: I was curious how difficult it would be to process the lines in a single pass, and found the following solution. This produces the lines in the clusters in the same order they appeared in the input, which is different from the order you listed them in your original post. If I don't sort to keep the original order, they come out in yet another order.
Update2: Fixed the sort so that the lines truly come out in the order they appeared in the input.
#!/usr/bin/perl use strict; use warnings; my @clusters; chomp (my @lines = <DATA>); foreach my $line (0..$#lines) { my @ids = $lines[$line] =~ m/\.(\S+)/g; my $in_cluster; foreach my $cluster (0..$#clusters) { if ( map { my $id = $_; grep { $id eq $_ } @ids } @{$clusters[ +$cluster]->{ids}} ) { if(defined($in_cluster)) { push(@{$clusters[$in_cluster]->{lines}}, @{$clusters[$ +cluster]->{lines}}); push(@{$clusters[$in_cluster]->{ids}}, @{$clusters[$cl +uster]->{ids}}); splice(@clusters,$cluster,1); last; } else { $in_cluster = $cluster; push(@{$clusters[$cluster]->{lines}}, $line); push(@{$clusters[$cluster]->{ids}}, @ids); } } } unless(defined($in_cluster)) { my $cluster = {}; push(@{$cluster->{lines}}, $line); push(@{$cluster->{ids}}, @ids); push(@clusters, $cluster); } } foreach my $cluster (0..$#clusters) { print "Cluster number " . ($cluster + 1) . ":\n"; foreach my $line (sort { $a <=> $b } @{$clusters[$cluster]->{lines +}}) { print "\t$lines[$line]\n"; } print "\n"; } exit(0); __DATA__ ID5141.C1665 ID5141.C2448 ID5141.C1253 ID5144.C2039 ID5141.C1596 ID5144.C1956 ID5141.C1906 ID5144.C2149 ID5141.C1221 ID5144.C1956 ID5141.C2149 ID5141.C2386 ID5141.C2039 ID5142.C1221 ID5141.C5887 ID5141.C7685 ID5141.C1005 ID5142.C2808 ID5141.C1046 ID5141.C1596 ID5141.C2386 ID5141.C4990 ID5141.C7685 ID5141.C4888
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^2: clustering pairs
by sugar (Beadle) on Dec 01, 2008 at 07:07 UTC | |
|
Re^2: clustering pairs
by sugar (Beadle) on Dec 03, 2008 at 01:58 UTC | |
by ig (Vicar) on Dec 05, 2008 at 04:36 UTC |