I tried to come up with my own solutions and benchmark them against the solutions proposed by others. I did expect the Inline::C version to be much quicker, but the difference is even bigger than I expected.

use Inline C => <<'*C*'; int UniqueCount(unsigned char *str) { char counts[256]; int i; int result; /* clear the array */ for (i = 0; i <= 255; i++) { counts[i] = (char) 0; } /* notice the characters */ while (*str) { counts[*str++] = 1; } /* now count the results */ result = 0; for (i = 0; i <= 255; i++) { result += counts[i]; } return result; } *C* #---------- sub withForeach { my $string = shift(); my %seen; $seen{$_} = 1 for (split //, $string); $count = scalar(keys %seen); } #-------- sub withRegexp2 { my $string = shift(); 1 while $string =~ s/(.)(.*?)\1/$1$2/g; length($string); } #------- sub withSlice { my $string = shift(); my %seen; @seen{split //, $string} = (); $count = scalar(keys %seen); } #-------- sub withFor { my $string = shift(); my %seen; for (my $i=0;$i<length($string);$i++) { $seen{substr($string,$i,1)} = 1; } $count = scalar(keys %seen); } #-------- sub withFor2 { my $string = shift(); my %seen; my $length = length($string); for (my $i=0;$i<$length;$i++) { $seen{substr($string,$i,1)} = 1; } $count = scalar(keys %seen); } #-------- sub withFor3 { my $string = shift(); my @seen; my $length = length($string); my $count = 0; for (my $i=0;$i<$length;$i++) { $count++ unless $seen[ord(substr($string,$i,1))]++; } $count; } #-------- sub withRegexp { my $string = shift(); my %seen; $string =~ s/(.)/$seen{$1} = 1;$_/eg; $count = scalar(keys %seen); } #--------- sub pfaut { my $l = ''; my $u; for (sort split(//,$_[0])) { $u++,$l=$_ if ($_ ne $l); } return $u; } #--------- sub withTr { my $txt = join('', sort(split //,shift)); $txt =~ tr/\x00-\xff//s; return length($txt); } #---------- sub nUniqC{ scalar grep{ ++$_[$_] == 1 } unpack('C*',$_[0]); } #-------- sub withVec { my $string = shift(); my $seen = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0 +\0\0\0\0\0"; my $length = length($string); for (my $i=0;$i<$length;$i++) { vec($seen, ord(substr($string,$i,1)),1) = 1; } $count = unpack("%32b*", $seen); } #-------- sub withVecPack { my $seen = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0 +\0\0\0\0\0"; for (unpack('C*',$_[0])) { vec($seen, $_, 1) = 1; } $count = unpack("%32b*", $seen); } #------------ #print "withFor=",withFor("4bisudbo34hlasnrgf08q3274huhsdfg0q3h24uhsdf +uh"),"\n"; #print "withVecPack=",withVecPack("4bisudbo34hlasnrgf08q3274huhsdfg0q3 +h24uhsdfuh"),"\n"; use Benchmark; timethese 100000, { "nUniqC" => 'nUniqC("4bisudbo34hlasnrgf08q3274huhsdfg0q3h24uhsdfuh +")', "withTr" => 'withTr("4bisudbo34hlasnrgf08q3274huhsdfg0q3h24uhsdfuh +")', "pfaut" => 'pfaut("4bisudbo34hlasnrgf08q3274huhsdfg0q3h24uhsdfuh") +', "withFor" => 'withFor("4bisudbo34hlasnrgf08q3274huhsdfg0q3h24uhsdf +uh")', "withFor2" => 'withFor2("4bisudbo34hlasnrgf08q3274huhsdfg0q3h24uhs +dfuh")', "withFor3" => 'withFor3("4bisudbo34hlasnrgf08q3274huhsdfg0q3h24uhs +dfuh")', "withVec" => 'withFor3("4bisudbo34hlasnrgf08q3274huhsdfg0q3h24uhsd +fuh")', "withVecPack" => 'withVecPack("4bisudbo34hlasnrgf08q3274huhsdfg0q3 +h24uhsdfuh")', "withSlice" => 'withSlice("4bisudbo34hlasnrgf08q3274huhsdfg0q3h24u +hsdfuh")', "withForeach" => 'withForeach("4bisudbo34hlasnrgf08q3274huhsdfg0q3 +h24uhsdfuh")', "withRegexp" => 'withRegexp("4bisudbo34hlasnrgf08q3274huhsdfg0q3h2 +4uhsdfuh")', "withRegexp2" => 'withRegexp2("4bisudbo34hlasnrgf08q3274huhsdfg0q3 +h24uhsdfuh")', "UniqueCount" => 'UniqueCount("4bisudbo34hlasnrgf08q3274huhsdfg0q3 +h24uhsdfuh")', }; __END__ Benchmark: timing 100000 iterations of UniqueCount, nUniqC, pfaut, wit +hFor, with For2, withFor3, withForeach, withRegexp, withRegexp2, withSlice, withT +r, withVec , withVecPack... UniqueCount: 1 wallclock secs ( 0.27 usr + 0.00 sys = 0.27 CPU) @ 370370.37/s (n=100000) (warning: too few iterations for a reliable count) nUniqC: 6 wallclock secs ( 6.15 usr + 0.00 sys = 6.15 CPU) @ 16262.81/s (n=100000) pfaut: 17 wallclock secs (16.70 usr + 0.00 sys = 16.70 CPU) @ 5986.95/s (n=100000) withFor: 11 wallclock secs (10.52 usr + 0.01 sys = 10.53 CPU) @ 9492.17/s (n=100000) withFor2: 10 wallclock secs ( 9.96 usr + 0.00 sys = 9.96 CPU) @ 10036.13/s (n=100000) withFor3: 8 wallclock secs ( 8.14 usr + 0.00 sys = 8.14 CPU) @ 12283.50/s (n=100000) withForeach: 14 wallclock secs (15.49 usr + 0.01 sys = 15.50 CPU) @ 6451.20/s (n=100000) withRegexp: 23 wallclock secs (22.05 usr + 0.00 sys = 22.05 CPU) @ 4534.74/s (n=100000) withRegexp2: 27 wallclock secs (26.52 usr + 0.00 sys = 26.52 CPU) @ 3771.02/s (n=100000) withSlice: 12 wallclock secs (11.99 usr + 0.00 sys = 11.99 CPU) @ 8341.68/s (n=100000) withTr: 13 wallclock secs (13.05 usr + 0.00 sys = 13.05 CPU) @ 7663.42/s (n=100000) withVec: 9 wallclock secs ( 8.14 usr + 0.00 sys = 8.14 CPU) @ 12283.50/s (n=100000) withVecPack: 8 wallclock secs ( 8.02 usr + 0.00 sys = 8.02 CPU) @ 12467.27/s (n=100000)

Jenda


In reply to BENCHMARK Re: number of unique characters in a string by Jenda
in thread number of unique characters in a string by Anonymous Monk

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post, it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Titles consisting of a single word are discouraged, and in most cases are disallowed outright.
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.