about the k-Means Clustering algorithm, modules, tutorials and documentation. I had promised him to port the algorithm out of a VB demo program belonging to a
by Kardi Teknomo. I found the VB program so illustrative, that I ported the whole program to Perl/Tk, even though I've never actually used Tk for anything real. As a result, it took me a few days.
Anyway! The result is here, the kMeans module is part of the script, but you can use it as a general library for kMeans clustering too, if you'd want to. It's not even limited to 2D. :) All you have to do is copy the bottom half of the program into its own pm file.
#!/usr/bin/perl -w
# k Means demo program
# Adapted from a VB program by Kardi Teknomo
# http://people.revoledu.com/kardi/tutorial/kMean/index.html
# Ported to Perl/Tk by bart @ Perlmonks
use strict;
# ----------------------- Tk interface -------------------------
use Tk;
my $mw = MainWindow->new( -height => 403, -width => 477,
-title => "k Means Clustering, adapted from tutorial by Kardi Teknom
+o" );
my $button_reset = $mw->Button(-text => "Clear Data", -relief => "rais
+ed", -command => \&reset_click);
$button_reset->place( -x => 176, -y => 24, -height => 25, -width => 65
+);
{
my $label = $mw->Label(
-text => "Click data in the canvas below. The program will autom
+atically cluster the data by color code."
);
$label->place( -x => 0, -y => 0, -height => 16, -width => 473);
$label = $mw->Label(-text => "Number of clusters");
$label->place( -x => 10, -y => 28, -height => 18, -width => 95);
$label = $mw->Label(-text => "(X, Y)", -justify => 'right');
$label->place( -x => 280, -y => 24, -height => 13, -width => 40);
}
my $label_xy = $mw->Label( -text => "X, Y");
$label_xy->place( -x => 330, -y => 24, -height => 13, -width => 50);
my $clusters_entry = $mw->Entry(-relief => "sunken", -disabledforegrou
+nd => 'darkgray');
$clusters_entry->place( -x => 112, -y => 24, -height => 24, -width =>
+24);
$clusters_entry->insert('end', '3');
my $canvas = $mw->Scrolled('Canvas', -scrollbars => '', -background =>
+ 'white');
$canvas->place( -x => 0, -y => 56, -height => 403-56, -width => 477);
$canvas->CanvasBind( "<ButtonPress-1>", [ \&canvas_click, Ev('x'), Ev(
+'y')]);
$canvas->CanvasBind( "<Motion>", [ \&canvas_mousemove, Ev('x'), Ev('y'
+)]);
$canvas->CanvasBind( "<Leave>", [ \&canvas_mouseleave ]);
#----------------------- Event Handlers ------------------------
my @color = qw(red yellow green cyan blue purple gray magenta pink
chartreuse coral darkolivegreen);
# If you want to be able to have more clusters, add more colours.
my(%point, @cluster, $dataset);
sub reset_click {
$canvas->delete('all');
(@cluster, %point, $dataset) = ();
$clusters_entry->configure(-state => 'normal');
}
sub canvas_mousemove {
my $canvas = shift;
my($x, $y) = @_;
$label_xy->configure(-text => "($x, $y)");
}
sub canvas_mouseleave {
my $canvas = shift;
$label_xy->configure(-text => "");
}
sub canvas_click {
my $canvas = shift;
my($x, $y) = @_;
if(!$dataset) {
my $clusters = $clusters_entry->get;
if($clusters !~ /^\d+$/ or $clusters == 0 or $clusters > @colo
+r) {
warn "Not a valid value for cluster count";
return;
}
$dataset = Data::Cluster::kMean->new(0+$clusters) or die "Fai
+led to make object";
$clusters_entry->configure(-state => 'disabled');
}
$dataset->add(my $point = [ $x, $y ]);
# A point is an array ref with coordinates
my %record = ( data => $point, cluster => -1 );
$record{id} = $canvas->createLine($x, $y, $x, $y,
-fill => 'red', -width => 8,
-capstyle => 'round', -tags => ['dot'],
);
# Keep track of point properties using a stringified reference to
+the point coordinates array
$point{$point} = \%record;
foreach my $cluster ($dataset->clusters) {
my $i = $cluster->index;
my $r = $cluster[$i] ||= { obj => $cluster, id =>
$canvas->createText($x, $y, -anchor => 'c', -width => 150, t
+ag => 'label',
-text => 1+$i) };
# Move centroid label
my($x, $y) = @{$cluster->centroid};
$canvas->coords($r->{id}, $x, $y);
# Colour dots according to cluster
foreach my $p ($cluster->points) {
if($point{$p}{cluster} != $i) {
$point{$p}{cluster} = $i;
$canvas->itemconfigure($point{$p}{id}, -fill => $color
+[$i]);
}
}
}
$canvas->raise('label', 'dot');
}
#------------------------ Main Program -------------------------
MainLoop;
#------------------------ kMean module -------------------------
package Data::Cluster::kMean;
use List::Util qw(sum);
sub new {
my $class = shift;
my($max_clusters) = @_;
return bless { max_clusters => $max_clusters, data => [], cluster
+=> [], clusters => [] }, $class;
}
sub add {
# add data point(s) (array references) -- by reference, so make su
+re they're not reused for something else
my $self = shift;
return unless @_;
unless(ref $_[0] eq 'ARRAY') {
@_ = [ @_ ];
}
foreach my $p (@_) {
push @{$self->{data}}, $p;
push @{$self->{cluster}}, -1; # not in a cluster
if(@{$self->{clusters}} < $self->{max_clusters}) {
my $index = @{$self->{clusters}};
push @{$self->{clusters}}, Data::Cluster::kMean::Cluster->
+new($self, $index);
$self->{cluster}[-1] = $index;
} else {
my $c;
{
my $j = 0;
my $min_dist;
for my $cluster (@{$self->{clusters}}) {
my $dist = _dist($p, $cluster->centroid);
if(!defined $min_dist or $dist < $min_dist) {
$c = $j;
$min_dist = $dist;
}
} continue {
$j++;
}
}
$self->{clusters}[$c]->invalidate;
$self->{cluster}[-1] = $c;
my $is_still_moving = 1;
while($is_still_moving) {
# this loop will surely converge
my @centroid = map $_->centroid, @{$self->{clusters}};
# assign all data to the new centroids
$is_still_moving = 0;
my $i = 0;
for my $p (@{$self->{data}}) {
my $c;
{
my $min_dist;
for my $j (0 .. $#{$self->{clusters}}) {
my $dist = _dist($p, $centroid[$j]);
if(!defined $min_dist or $dist < $min_dist
+) {
$c = $j;
$min_dist = $dist;
}
}
}
if($c != $self->{cluster}[$i]) {
$self->{clusters}[$self->{cluster}[$i]]->inval
+idate;
$self->{clusters}[$c]->invalidate;
$self->{cluster}[$i] = $c;
$is_still_moving = 1;
}
} continue {
$i++;
}
}
}
}
}
sub clusters {
# Returns a list of all Cluster objects
my $self = shift;
return @{$self->{clusters}};
}
sub _dist {
# function
return sqrt(sum map { my $d = $_[0][$_]-$_[1][$_]; $d*$d } 0 .. $#
+{$_[0]});
}
package Data::Cluster::kMean::Cluster;
use List::Util qw(sum);
sub new {
my $class = shift;
my($parent, $index) = @_;
bless { index => $index, data => $parent->{data}, cluster => $pare
+nt->{cluster}, centroid => undef}, $class;
}
sub points {
# Returns a list of all points in cluster
my $self = shift;
my $index = $self->{index};
my @point = @{$self->{data}}[grep $self->{cluster}[$_] == $index,
+0 .. $#{$self->{data}}];
return @point;
}
sub centroid {
# Returns a point indicating the cluster's center of gravity
my $self = shift;
return $self->{centroid} ||= _centroid($self->points);
}
sub _centroid {
# function
return undef unless @_;
my $dim = @{$_[0]};
return [ map { my $i = $_; sum(map $_->[$i], @_) / @_ } 0 .. $dim-
+1 ];
}
sub invalidate {
# Throw away cache
my $self = shift;
undef $self->{centroid};
}
sub index {
# integer, position in cluster array of parent
my $self = shift;
return $self->{index};
}
1;