in reply to searching through data
Update:
Not any more!
As you can see, bit_vec2 is miles faster.
Rate baxy bin_chop citromatic hash array_slice bit_v +ec1 array_for vec bit_vec2 baxy 0.193/s -- -75% -89% -90% -94% - +96% -96% -97% -100% bin_chop 0.770/s 300% -- -56% -62% -77% - +83% -84% -86% -99% citromatic 1.74/s 806% 127% -- -13% -48% - +62% -63% -69% -97% hash 2.02/s 947% 162% 16% -- -40% - +56% -57% -64% -97% array_slice 3.38/s 1656% 339% 94% 68% -- - +25% -28% -40% -95% bit_vec1 4.54/s 2256% 489% 160% 125% 34% + -- -4% -20% -93% array_for 4.73/s 2355% 514% 171% 135% 40% + 4% -- -17% -93% vec 5.67/s 2846% 637% 225% 181% 68% +25% 20% -- -91% bit_vec2 63.5/s 32870% 8150% 3539% 3050% 1778% 13 +00% 1243% 1019% --
Full benchmarks of all techniques so far below
#!/usr/bin/perl use strict; use warnings FATAL => 'all'; use Benchmark 'cmpthese'; use Bit::Vector; #use Data::Dump 'pp'; use constant { MAX_NUM => 1_000_000, SEARCH_SIZE => 400_000, FIND_SIZE => 100, }; srand 1; my @searched = map { int(rand MAX_NUM) } 1 .. SEARCH_SIZE; my @to_find = map { int(rand MAX_NUM) } 1 .. FIND_SIZE; my %jobs = ( baxy => sub { my @present; for my $number (@to_find) { my @a = grep { $_ == $number } @searched; push @present, $number if @a; } return 0+@present; }, bin_chop => sub { my @present; my @sorted = sort {$a<=>$b} @searched; for my $number (@to_find) { push @present, $number unless bin_chop(\@sorted, $number) == -1; } return 0+@present; }, hash => sub { my @present; my %hash; undef @hash{@searched}; for my $number (@to_find) { push @present, $number if exists $hash{$number}; } return 0+@present; }, citromatic => sub { my @present; my $ids = 1_000_000; # last id my $bin = 0; substr($bin, $_, 1, pack ("c",0)) for (0..$ids); # Create the index substr($bin, $_, 1, pack ("c",1)) for @searched; # Search $ids from file 2 for my $id2 (@to_find) { push @present, $id2 if (unpack "c",substr ($bin,$id2,1)) == 1; } return 0+@present; }, vec => sub { my @present; my $vec=""; vec($vec, $_, 1) = 1 for @searched; @present = grep { vec $vec, $_, 1 } @to_find; return 0+@present; }, bit_vec1 => sub { my @present; my $vector = Bit::Vector->new(1000001); map { $vector->Bit_On($_) } @searched; for (@to_find) { push @present, $_, if $vector->bit_test($_); } return 0+@present; }, array_slice => sub { my @present; my @index; @index[@searched] = (1) x @searched; for (@to_find) { push @present, $_ if $index[$_]; } return 0+@present; }, array_for => sub { my @present; my @index; $index[$_] = 1 for @searched; for (@to_find) { push @present, $_ if $index[$_]; } return 0+@present; }, bit_vec2 => sub { my $vec1 = Bit::Vector->new(1000001); $vec1->Index_List_Store(@searched); my $vec2 = Bit::Vector->new(1000001); $vec2->Index_List_Store(@to_find); my $vec3 = Bit::Vector->new(1000001); $vec3->And($vec1, $vec2); my @present = $vec3->Index_List_Read(); return 0+@present; }, ); print "$_: ", $jobs{$_}->(), "\n" for sort keys %jobs; #exit; cmpthese(-20, \%jobs); sub bin_chop { my ($search, $number) = @_; my ($left, $right) = (0, $#$search); while ($right >= $left) { my $mid = int(($left+$right) / 2); if ($number < $search->[$mid]) { $right = $mid-1 } elsif ($number > $search->[$mid]) { $left = $mid+1 } else { return $mid } } return -1; }
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^2: searching through data
by Anonymous Monk on Apr 16, 2009 at 19:26 UTC |