My first cut is fairly straightforward. I randomly generated a set of 100 points with uncertainty characteristics and a distribution roughly similar to one of the 1-d data sets:
#!/usr/bin/perl
use strict;
use warnings;

my @points;
for (1 .. 100) {
    my $x  = rand()**2;                  # squaring biases x toward 0
    my $y  = function($x);
    my $dy = 0.01 + $x*0.1*rand()**2;    # uncertainty grows with x
    $y += offset($dy);                   # perturb y by a random offset scaled to dy
    push @points, [map sprintf("%.4f", $_), $x, $y, $dy];
}

$, = "\n";    # one point per line
$" = "\t";    # tab-separated fields
print map("@$_", @points), "\n";

sub function {
    my ($x) = @_;
    return 0.25 + 0.75*(1 - $x)**0.25;
}

sub offset {
    my ($x) = @_;
    return $x*(rand() - 0.5)*exp(rand());
}
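The output is one tab-separated x, y, dy triple per line (e.g. perl generate.pl > points.dat, with whatever file name you like); one such run produced the __DATA__ block in the selection script below.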
#!/usr/bin/perl
use strict;
use warnings;

my @points;
push @points, [split] while <DATA>;
$_ = {x => $_->[0], 'y' => $_->[1], dy => $_->[2]} for @points;

for my $point (@points) {
    $point->{value} = $point->{y}/$point->{dy};    # Low relative uncertainty is good
    $point->{dist}  = 0;
    $point->{dist} += ($_->{x} - $point->{x})**2 for @points;    # Isolated points are good
}

# Greedily cull the lowest-scoring point until 50 remain, subtracting
# the removed point's contribution from each survivor's distance sum.
while (@points > 50) {
    my $low_score = score($points[0]);
    my $low_index = 0;
    for my $i (0 .. $#points) {
        if (score($points[$i]) < $low_score) {
            $low_score = score($points[$i]);
            $low_index = $i;
        }
    }
    my $low_point = splice @points, $low_index, 1;
    $_->{dist} -= ($_->{x} - $low_point->{x})**2 for @points;
}

for my $point (@points) {
    print join("\t", $point->{x}, $point->{y}, $point->{dy}), "\n";
}

sub score {
    my ($point) = @_;
    return $point->{value} + 1000*$point->{dist};
}

__DATA__
0.3137 0.9230 0.0213
0.0353 0.9927 0.0028
0.5431 0.8256 0.0893
0.4871 0.8657 0.0098
0.5935 0.8637 0.0645
0.5340 0.7873 0.0932
0.1667 0.9532 0.0062
0.0241 0.9978 0.0070
0.1068 0.9785 0.0142
0.3129 0.9014 0.0286
0.3319 0.9078 0.0028
0.0006 0.9791 0.0429
0.1436 0.9697 0.0285
0.0925 0.9746 0.0053
0.1347 0.9641 0.0024
0.4304 0.8877 0.0364
0.1306 0.9987 0.0502
0.4056 0.8653 0.0634
0.0539 1.0031 0.0352
0.0468 0.9217 0.0943
0.0397 0.9802 0.0503
0.1994 0.9252 0.0486
0.0345 0.9916 0.0013
0.5689 0.8306 0.0040
0.2189 1.0476 0.0927
0.4173 0.8850 0.0149
0.8645 0.6728 0.0556
0.0586 0.9900 0.0819
0.4643 0.7884 0.0659
0.3554 0.9173 0.0410
0.4400 0.9074 0.0831
0.0382 1.0116 0.0461
0.0188 0.9934 0.0069
0.1505 0.9898 0.0540
0.3959 0.8792 0.0195
0.5443 0.8225 0.0148
0.9776 0.5572 0.0651
0.2192 0.9609 0.0370
0.4869 0.8621 0.0211
0.0499 0.9876 0.0011
0.2295 0.9325 0.0204
0.1222 0.9540 0.0418
0.2911 0.9215 0.0015
0.6095 0.8270 0.0235
0.2518 0.8968 0.0456
0.1540 0.9798 0.0529
0.0344 1.0038 0.0227
0.1534 0.9149 0.0608
0.8540 0.6901 0.0060
0.8015 0.7142 0.0249
0.0008 1.0756 0.0734
0.1346 0.9768 0.0183
0.1167 0.9697 0.0013
0.9690 0.5879 0.0023
0.3193 0.9115 0.0036
0.1110 1.0015 0.0546
0.0373 1.0053 0.0231
0.0016 0.9734 0.0946
0.1820 0.9648 0.0209
0.0091 0.9976 0.0013
0.1699 0.9791 0.0343
0.4183 0.8819 0.0010
0.2560 0.9270 0.0091
0.9714 0.5897 0.0251
0.0014 0.9733 0.0304
0.0940 0.9597 0.0524
0.0501 0.9869 0.0530
0.3157 0.9057 0.0220
0.2149 0.9412 0.0047
0.1635 0.9578 0.0011
0.0581 1.0069 0.0311
0.5995 0.8179 0.0056
0.0625 0.9535 0.0919
0.0262 1.0081 0.0191
0.0267 1.0211 0.0902
0.9126 0.6475 0.0020
0.4535 0.8672 0.0085
0.3254 0.9136 0.0065
0.0019 0.9989 0.0592
0.0639 0.9791 0.0083
0.1951 0.9082 0.0838
0.2899 0.9061 0.0741
0.5317 0.8421 0.0010
0.1340 0.9659 0.0012
0.0617 0.9754 0.0609
0.5550 0.8363 0.0087
0.1242 0.9704 0.0071
0.4895 0.8538 0.0040
0.0087 0.9990 0.0064
0.8741 0.6770 0.0111
0.2771 0.9227 0.0024
0.0887 0.9354 0.0545
0.2974 0.9186 0.0010
0.4166 0.8802 0.0093
0.0010 1.0137 0.0992
0.0298 0.9960 0.0119
0.1383 0.9629 0.0034
0.3617 0.8983 0.0018
0.0035 0.9874 0.0340
0.7896 0.7058 0.0764
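In words: each point's score is its value (y/dy) plus 1000 times the sum of its squared x-distances to every other point, and the loop culls the lowest-scoring point, updating the surviving distance sums, until 50 points remain.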
And it fails terribly. With a relatively even weighting, the low-uncertainty points win out and I get a strong cluster at low x. When I increase the weight on the distance term, representation at high x improves, but I end up with big holes in the middle of the graph. Any ideas on how to improve this algorithm? The ideas need to be extensible to higher dimensionality. Speed is not paramount, but is always appreciated.
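For concreteness, here is one shape an alternative could take -- a minimal, untested sketch of farthest-point (max-min distance) selection rather than culling. It seeds with the best y/dy point and then repeatedly adds whichever candidate is farthest from everything already chosen; the 50-point target and the tie-breaking are arbitrary choices, not a solution I'm claiming works.

#!/usr/bin/perl
# Sketch of farthest-point (max-min distance) selection -- an
# illustration of one alternative, not the algorithm posted above.
# Reads whitespace-separated "x y dy" lines from a file or STDIN.
use strict;
use warnings;

my @points = map { my @f = split; {x => $f[0], 'y' => $f[1], dy => $f[2]} } <>;
$_->{value} = $_->{y}/$_->{dy} for @points;    # high y/dy = low relative uncertainty

# Sort by quality, best first; the best point seeds the subset.
@points = sort { $b->{value} <=> $a->{value} } @points;
my @chosen = shift @points;

while (@chosen < 50 && @points) {
    # A candidate's gap is its squared distance to the nearest chosen
    # point; for higher dimensions, add more squared terms to $d.
    for my $p (@points) {
        my $gap = 9e99;
        for my $c (@chosen) {
            my $d = ($p->{x} - $c->{x})**2;
            $gap = $d if $d < $gap;
        }
        $p->{gap} = $gap;
    }
    # Keep the candidate that fills the largest gap; since @points stays
    # sorted by quality, exact ties go to the lower-uncertainty point.
    my $best = 0;
    for my $i (1 .. $#points) {
        $best = $i if $points[$i]{gap} > $points[$best]{gap};
    }
    push @chosen, splice @points, $best, 1;
}

print join("\t", $_->{x}, $_->{y}, $_->{dy}), "\n" for @chosen;

The appeal is that the two criteria never have to share units, so there is no weight to tune; the drawback is that quality only enters through the seed and the tie-breaking.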
In reply to Picking the best points by kennethk