#!/usr/bin/perl # # a clustering algorithm # use strict; use warnings; use Data::Dumper; $|=1; my $begin = time; my $items; my %cache; my $arr_cnt; srand(0); build_items(1700, 5, 100, 10000); #build_items(500, 5, 50, 2500); # Build cache my $difference = 9999; #Arbitrary large number for my $i ( 0 .. $#$items-1 ) { my $d1 = $items->[$i]; for my $j ( $i+1 .. $#$items ) { my $d2 = $items->[$j]; my $diff = max_diff( $d1, $d2 ); } } my $cur = time - $begin; print "cache built, $cur s\n"; # Build stitch list: i.e. list of things to tie together, by ordering the cache by distance my @stitch; for my $k1 (keys %cache) { for my $k2 (keys %{$cache{$k1}}) { push @stitch, [ $k1, $k2, $cache{$k1}{$k2} ]; } } @stitch = sort { $a->[2]<=>$b->[2] || $a->[0] cmp $b->[0] || $a->[1] cmp $b->[1] } @stitch; $cur = time - $begin; print "stitch list built, $cur s\n"; # build the clusters my %clusters; my %sretsulc; my $clust_cnt=0; for my $idx (0 .. $#stitch) { my ($k1, $k2, $d) = @{$stitch[$idx]}; my $fl='N'; my ($cl1, $cl2, $msg); $cl1 = $sretsulc{$k1} if exists $sretsulc{$k1}; $cl2 = $sretsulc{$k1} if exists $sretsulc{$k2}; if (!defined $cl1) { # Remove one special case, leaving: CL+CL, CL+K, K+K ($k1, $k2, $cl1, $cl2) = ($k2, $k1, $cl2, $cl1); $fl='Y'; } $msg = "$idx: Distance $d ($k1 <-$fl-> $k2) "; if (defined $cl1 and defined $cl2) { if ($cl1 eq $cl2) { #print "\t$k1 and $k2 are in same cluster ($cl1)\n"; next; } $msg .= "\tJoining $cl1 ($k1) and $cl2 ($k2)"; ++$clust_cnt; my $cl3 = "clust $clust_cnt"; $clusters{$cl3}{L} = $cl1; $clusters{$cl3}{R} = $cl2; $clusters{$cl1}{P} = $cl3; $clusters{$cl2}{P} = $cl3; my $size=0; for my $k (keys %sretsulc) { my $cl = $sretsulc{$k}; if ($cl eq $cl1 or $cl eq $cl2) { $sretsulc{$k} = $cl3; ++$size; } } $msg .= " new cluster: $size items"; } elsif (defined $cl1) { # build new cluster of cl1 and k2 ++$clust_cnt; my $cl3 = "clust $clust_cnt"; $msg .= "\tjoining $cl1 and $k2 into $cl3"; $clusters{$cl3}{L} = $cl1; $clusters{$cl3}{R} = $k2; $clusters{$cl1}{P} = $cl3; my $size=0; for my $k (keys %sretsulc) { my $cl = $sretsulc{$k}; if ($cl eq $cl1) { $sretsulc{$k} = $cl3; ++$size; } } $sretsulc{$k2}=$cl3; ++$size; $msg .= " new cluster: $size items"; } else { # Two unclustered items ++$clust_cnt; my $cl3 = "clust $clust_cnt"; $msg .= "\tjoining $k1 and $k2 into $cl3"; $clusters{$cl3}{L} = $k1; $clusters{$cl3}{R} = $k2; $sretsulc{$k1}=$cl3; $sretsulc{$k2}=$cl3; } print $msg, "\n"; } $cur = time - $begin; print "clusters built, $cur s\n"; sub merge { my( $x, $y ) = @_; # Both non-clusters if( ref $x eq 'HASH' and ref $y eq 'HASH' ) { ++$arr_cnt; print "\tmerging hashes $x->{name} and $y->{name} into \n"; return [$x,$y, "arr $arr_cnt" ]; } # $x cluster elsif( ref $x eq 'ARRAY' and ref $y eq 'HASH' ) { ++$arr_cnt; print "\tmerging $x->[2] and $y->{name} into \n"; return [$x,$y, "arr $arr_cnt" ]; } # $y cluster elsif( ref $x eq 'HASH' and ref $y eq 'ARRAY' ) { ++$arr_cnt; print "\tmerging $x->{name} and $y->[2] into \n"; return [$y,$x, "arr $arr_cnt" ]; } elsif( ref $x eq 'ARRAY' and ref $y eq 'ARRAY' ) { ++$arr_cnt; print "\tmerging $x->[2] and $y->[2] into \n"; return [$x,$y, "arr $arr_cnt" ]; } else { die "Wtf? $x $y"; } } sub max_diff { my( $d1, $d2 ) = @_; if( ref $d1 eq 'HASH' and ref $d2 eq 'HASH' ) { my ($name1,$name2) = ($d1->{name}, $d2->{name}); ($name1,$name2) = ($name1 lt $name2) ? ($name1, $name2) : ($name2, $name1); if (exists $cache{$name1}{$name2}) { return $cache{$name1}{$name2}; } my $t=0; for (keys %{$d1->{words}}) { ++$t if ! exists $d2->{words}{$_} } for (keys %{$d2->{words}}) { ++$t if ! exists $d1->{words}{$_} } $cache{$name1}{$name2} = $t; return $t; } elsif( ref $d1 eq 'ARRAY' and ref $d2 eq 'HASH' ) { my $x = max_diff( $d1->[0], $d2 ); my $y = max_diff( $d1->[1], $d2 ); return $x > $y ? $x : $y; } elsif( ref $d1 eq 'HASH' and ref $d2 eq 'ARRAY' ) { my $x = max_diff( $d2->[0], $d1 ); my $y = max_diff( $d2->[1], $d1 ); return $x > $y ? $x : $y; } elsif( ref $d1 eq 'ARRAY' and ref $d2 eq 'ARRAY' ) { my $x = max_diff( $d1->[0], $d2->[0] ); my $y = max_diff( $d1->[1], $d2->[1] ); my $xx = max_diff( $d1->[0], $d2->[1] ); my $yy = max_diff( $d1->[1], $d2->[0] ); return max( $x, $y, $xx, $yy ); } else { die "Wtffffff $d1 $d2"; } } sub max { my ($ret, @t) = @_; for (@t) { $ret = $_ if $_ > $ret; } return $ret; } sub build_items { # Build an array of $num_items, where each item is $it_min - $it_max distinct (non-dup) words my ($num_items, $it_min, $it_max, $num_words) = @_; $num_words = $it_min * $num_items if ! defined $num_words; # Read the dictionary my @words; { my %words; open my $FH, '<', '/etc/dictionaries-common/words'; while (<$FH>) { s/\s+$//; next if /'s$/; $words{$_}=0; } @words = keys %words; } print "Dictionary had ", scalar(@words), " words\n"; # Make a list of words { my %dict; while (keys %dict < $num_words) { my $idx = int(rand()*@words); $dict{$words[$idx]}=0; $words[$idx] = pop @words; } @words = keys %dict; print "Trimmed dictionary had ", scalar(@words), " words\n"; } for my $it_idx (0 .. $num_items-1) { my %item_words; my $it_cnt = $it_min + int(rand()*($it_max-$it_min)); for (1 .. $it_cnt) { $item_words{$words[int rand()*@words]}=0; } $$items[$it_idx] = { words => { map { $_=>0 } keys %item_words }, name => "item $it_idx", }; } }