in reply to number of unique characters in a string
I tried to come up with my own solutions and benchmark them against the solutions proposed by others. I did expect the Inline::C version to be much quicker, but the difference is even bigger than I expected.
use Inline C => <<'*C*'; int UniqueCount(unsigned char *str) { char counts[256]; int i; int result; /* clear the array */ for (i = 0; i <= 255; i++) { counts[i] = (char) 0; } /* notice the characters */ while (*str) { counts[*str++] = 1; } /* now count the results */ result = 0; for (i = 0; i <= 255; i++) { result += counts[i]; } return result; } *C* #---------- sub withForeach { my $string = shift(); my %seen; $seen{$_} = 1 for (split //, $string); $count = scalar(keys %seen); } #-------- sub withRegexp2 { my $string = shift(); 1 while $string =~ s/(.)(.*?)\1/$1$2/g; length($string); } #------- sub withSlice { my $string = shift(); my %seen; @seen{split //, $string} = (); $count = scalar(keys %seen); } #-------- sub withFor { my $string = shift(); my %seen; for (my $i=0;$i<length($string);$i++) { $seen{substr($string,$i,1)} = 1; } $count = scalar(keys %seen); } #-------- sub withFor2 { my $string = shift(); my %seen; my $length = length($string); for (my $i=0;$i<$length;$i++) { $seen{substr($string,$i,1)} = 1; } $count = scalar(keys %seen); } #-------- sub withFor3 { my $string = shift(); my @seen; my $length = length($string); my $count = 0; for (my $i=0;$i<$length;$i++) { $count++ unless $seen[ord(substr($string,$i,1))]++; } $count; } #-------- sub withRegexp { my $string = shift(); my %seen; $string =~ s/(.)/$seen{$1} = 1;$_/eg; $count = scalar(keys %seen); } #--------- sub pfaut { my $l = ''; my $u; for (sort split(//,$_[0])) { $u++,$l=$_ if ($_ ne $l); } return $u; } #--------- sub withTr { my $txt = join('', sort(split //,shift)); $txt =~ tr/\x00-\xff//s; return length($txt); } #---------- sub nUniqC{ scalar grep{ ++$_[$_] == 1 } unpack('C*',$_[0]); } #-------- sub withVec { my $string = shift(); my $seen = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0 +\0\0\0\0\0"; my $length = length($string); for (my $i=0;$i<$length;$i++) { vec($seen, ord(substr($string,$i,1)),1) = 1; } $count = unpack("%32b*", $seen); } #-------- sub withVecPack { my $seen = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0 +\0\0\0\0\0"; for (unpack('C*',$_[0])) { vec($seen, $_, 1) = 1; } $count = unpack("%32b*", $seen); } #------------ #print "withFor=",withFor("4bisudbo34hlasnrgf08q3274huhsdfg0q3h24uhsdf +uh"),"\n"; #print "withVecPack=",withVecPack("4bisudbo34hlasnrgf08q3274huhsdfg0q3 +h24uhsdfuh"),"\n"; use Benchmark; timethese 100000, { "nUniqC" => 'nUniqC("4bisudbo34hlasnrgf08q3274huhsdfg0q3h24uhsdfuh +")', "withTr" => 'withTr("4bisudbo34hlasnrgf08q3274huhsdfg0q3h24uhsdfuh +")', "pfaut" => 'pfaut("4bisudbo34hlasnrgf08q3274huhsdfg0q3h24uhsdfuh") +', "withFor" => 'withFor("4bisudbo34hlasnrgf08q3274huhsdfg0q3h24uhsdf +uh")', "withFor2" => 'withFor2("4bisudbo34hlasnrgf08q3274huhsdfg0q3h24uhs +dfuh")', "withFor3" => 'withFor3("4bisudbo34hlasnrgf08q3274huhsdfg0q3h24uhs +dfuh")', "withVec" => 'withFor3("4bisudbo34hlasnrgf08q3274huhsdfg0q3h24uhsd +fuh")', "withVecPack" => 'withVecPack("4bisudbo34hlasnrgf08q3274huhsdfg0q3 +h24uhsdfuh")', "withSlice" => 'withSlice("4bisudbo34hlasnrgf08q3274huhsdfg0q3h24u +hsdfuh")', "withForeach" => 'withForeach("4bisudbo34hlasnrgf08q3274huhsdfg0q3 +h24uhsdfuh")', "withRegexp" => 'withRegexp("4bisudbo34hlasnrgf08q3274huhsdfg0q3h2 +4uhsdfuh")', "withRegexp2" => 'withRegexp2("4bisudbo34hlasnrgf08q3274huhsdfg0q3 +h24uhsdfuh")', "UniqueCount" => 'UniqueCount("4bisudbo34hlasnrgf08q3274huhsdfg0q3 +h24uhsdfuh")', }; __END__ Benchmark: timing 100000 iterations of UniqueCount, nUniqC, pfaut, wit +hFor, with For2, withFor3, withForeach, withRegexp, withRegexp2, withSlice, withT +r, withVec , withVecPack... UniqueCount: 1 wallclock secs ( 0.27 usr + 0.00 sys = 0.27 CPU) @ 370370.37/s (n=100000) (warning: too few iterations for a reliable count) nUniqC: 6 wallclock secs ( 6.15 usr + 0.00 sys = 6.15 CPU) @ 16262.81/s (n=100000) pfaut: 17 wallclock secs (16.70 usr + 0.00 sys = 16.70 CPU) @ 5986.95/s (n=100000) withFor: 11 wallclock secs (10.52 usr + 0.01 sys = 10.53 CPU) @ 9492.17/s (n=100000) withFor2: 10 wallclock secs ( 9.96 usr + 0.00 sys = 9.96 CPU) @ 10036.13/s (n=100000) withFor3: 8 wallclock secs ( 8.14 usr + 0.00 sys = 8.14 CPU) @ 12283.50/s (n=100000) withForeach: 14 wallclock secs (15.49 usr + 0.01 sys = 15.50 CPU) @ 6451.20/s (n=100000) withRegexp: 23 wallclock secs (22.05 usr + 0.00 sys = 22.05 CPU) @ 4534.74/s (n=100000) withRegexp2: 27 wallclock secs (26.52 usr + 0.00 sys = 26.52 CPU) @ 3771.02/s (n=100000) withSlice: 12 wallclock secs (11.99 usr + 0.00 sys = 11.99 CPU) @ 8341.68/s (n=100000) withTr: 13 wallclock secs (13.05 usr + 0.00 sys = 13.05 CPU) @ 7663.42/s (n=100000) withVec: 9 wallclock secs ( 8.14 usr + 0.00 sys = 8.14 CPU) @ 12283.50/s (n=100000) withVecPack: 8 wallclock secs ( 8.02 usr + 0.00 sys = 8.02 CPU) @ 12467.27/s (n=100000)
Jenda
|
---|
Replies are listed 'Best First'. | |
---|---|
Re: BENCHMARK Re: number of unique characters in a string
by BrowserUk (Patriarch) on Mar 01, 2003 at 20:08 UTC | |
by Jenda (Abbot) on Mar 01, 2003 at 20:47 UTC | |
by BrowserUk (Patriarch) on Mar 01, 2003 at 20:53 UTC |