Re: Question about speeding a regexp count

Benchmarks:

          s/iter limbic injun radiant ikegami radiant_i sauoq skeeve3 
+sauoq_i skeeve3_i skeeve2
limbic      3.35     --   -1%    -34%    -47%      -56%  -71%    -77% 
+   -77%      -81%    -91%
injun       3.33     1%    --    -34%    -46%      -56%  -71%    -77% 
+   -77%      -81%    -91%
radiant     2.21    52%   51%      --    -19%      -33%  -56%    -65% 
+   -66%      -71%    -87%
ikegami     1.79    87%   86%     23%      --      -18%  -46%    -57% 
+   -58%      -64%    -84%
radiant_i   1.47   128%  126%     50%     22%        --  -34%    -48% 
+   -48%      -56%    -80%
sauoq      0.969   246%  244%    128%     85%       52%    --    -21% 
+   -22%      -34%    -70%
skeeve3    0.769   336%  333%    187%    133%       91%   26%      -- 
+    -1%      -17%    -62%
sauoq_i    0.759   342%  339%    191%    136%       94%   28%      1% 
+     --      -16%    -61%
skeeve3_i  0.641   423%  420%    244%    180%      130%   51%     20% 
+    19%        --    -54%
skeeve2    0.294  1042% 1035%    651%    511%      401%  230%    162% 
+   159%      118%      --
[download]

Benchmarks code:

use strict;
use warnings;

use Benchmark qw( cmpthese );


# Use constant to provide comparable results.
use constant GROUP_LENGTH => 3;


# Generate a random string all tests will use.
my $seq = join '',
          map { (qw[ A C G T ])[rand(4)] }
          1..600_000;


# ---


sub sauoq {
   my %count;

   my $length = length $seq;
   for my $i (0..$length-1) {
      ++$count{substr($seq, $i, 1)};
      ++$count{substr($seq, $i, 2)} if $i < $length - 1;
      ++$count{substr($seq, $i, 3)} if $i < $length - 2;
   }

   return \%count;
}


sub sauoq_i {
   my %count;

   my $length = length($seq);
   for my $i (0..$length-3) {
      ++$count{substr($seq, $i, 1)};
      ++$count{substr($seq, $i, 2)};
      ++$count{substr($seq, $i, 3)};
   }

   ++$count{substr($seq, -2, 2)};
   ++$count{substr($seq, -2, 1)};
   ++$count{substr($seq, -1, 1)};

   return \%count;
}


sub limbic {
   my %count;
   my $template = ('AXA2X2A3X2' x (length($seq) - 2)) . 'AXA2XA';
   ++$count{$_} foreach unpack $template, $seq;
   return \%count;
}


sub injun {
   my %count;

   my @bases = split //, $seq;

   for my $i (0..$#bases) {
      ++$count{$bases[$i]                          };
      ++$count{$bases[$i].$bases[$i+1]             } if defined $bases
+[$i+1];
      ++$count{$bases[$i].$bases[$i+1].$bases[$i+2]} if defined $bases
+[$i+2];
   }

   return \%count;
}


sub ikegami {
   my %count;
   ++$count{$1} while $seq =~ /(?=(.))/g;
   ++$count{$1} while $seq =~ /(?=(..))/g;
   ++$count{$1} while $seq =~ /(?=(...))/g;
   return \%count;
}


sub skeeve2 {
   my %count;

   for my $i (0..length($seq)-GROUP_LENGTH()) {
      ++$count{substr($seq, $i, GROUP_LENGTH)};
   }

   my @keys = keys %count;
   foreach my $key (@keys) {
      for my $i (1..GROUP_LENGTH-1) {
         $count{substr($key, 0, $i)} += $count{$key};
      }
   }

   for my $i (1..GROUP_LENGTH-1) {
      for my $j ($i..GROUP_LENGTH-1) {
         ++$count{substr($seq, -$j, $i)};
      }
   }

   return \%count;
}


sub skeeve3 {
   my %count;

   ++$count{"$1$2"} while $seq =~ /(.)(?=(..))/g;

   my @keys = keys %count;
   foreach my $key (@keys) {
      $count{substr($key, 0, 1)} += $count{$key};
      $count{substr($key, 0, 2)} += $count{$key};
   }

   ++$count{substr($seq, -2, 2)};
   ++$count{substr($seq, -2, 1)};
   ++$count{substr($seq, -1, 1)};

   return \%count;
}


sub skeeve3_i {
   my %count;

   $count{$1}++ while $seq =~ /(?=(...))/g;

   my @keys = keys %count;
   foreach my $key (@keys) {
      $count{substr($key, 0, 1)} += $count{$key};
      $count{substr($key, 0, 2)} += $count{$key};
   }

   ++$count{substr($seq, -2, 2)};
   ++$count{substr($seq, -2, 1)};
   ++$count{substr($seq, -1, 1)};

   return \%count;
}


sub radiant {
   my %count;

   my @rolling;
   $#rolling = 2;

   for my $i (0..length($seq)-1) {
      my $char = substr($seq, $i, 1);

      shift @rolling;
      push @rolling, $char;

      $count{$rolling[2]}++;            #one-char count

      next if not defined $rolling[1];
      $count{join('',@rolling[1,2])}++; #two-char count

      next if not defined $rolling[0]; 
      $count{join('',@rolling)}++;      #three-char count
   }

   return \%count;
}


sub radiant_i {
   my %count;

   my @rolling;
   $#rolling = 2;

   for my $i (0..length($seq)-1) {
      my $char = substr($seq, $i, 1);

      shift @rolling;
      push @rolling, $char;

      next if $i < 2;
      $count{join('', @rolling)}++;
   }

   my @keys = keys %count;
   foreach my $key (@keys) {
      $count{substr($key, 0, 1)} += $count{$key};
      $count{substr($key, 0, 2)} += $count{$key};
   }

   ++$count{substr($seq, -2, 2)};
   ++$count{substr($seq, -2, 1)};
   ++$count{substr($seq, -1, 1)};

   return \%count;
}


# ---


sub hash_count {
   my ($count) = @_;

   my $total = 0;
   $total += $_ foreach values %$count;

   return join ',',
          "total=>$total", 
          map { sprintf("%s=>%s", $_, $count->{$_}) }
          sort { length($a) <=> length($b) || $a cmp $b }
          keys %$count;
}


my @tests = qw(
   sauoq
   sauoq_i
   limbic
   injun
   ikegami
   skeeve2
   skeeve3
   skeeve3_i
   radiant
   radiant_i
);


{
   my $ok = 1;

   my @results = map { hash_count((\&$_)->()) } @tests;

   for my $i (0..$#tests) {
      for my $j (0..$#tests) {                # or start at $i+1
         if ($results[$i] ne $results[$j]) {
            $ok = 0;
            print("$tests[$j] differs from $tests[$i].\n");
         }
      }
   }

   if ($ok) {
      print("No differences encountered.\n");
      print("\n");
   } else {
      print("\n");
      print("Differences encountered.\n");
      print("\$seq: $seq\n");
      print("$tests[$_]: $results[$_]\n") for (0..$#tests);
      die("\n");
   }
}


cmpthese(5, { map { $_ => \&$_ } @tests });
[download]

I edited the solutions to use the same style and to work from memory. I removed any extra work which could be easily added to any solution (such as validation checks).

I thoroughly checked all the solutions. All of produce indentical results.

I only ran the tests for 5 iterations because it's slow, but I ran it many times. All results where statistically identical.

The solutions by BrowserUK, InjunJoel and particularly Limbic~Region use much more memory than the others. All solutions expect the data to be fully in memory.

BrowserUK's solution requires Perl 5.8+.

Update: Reran on 5.8 to time BrowserUK.

Update: Added Skeeve's solutions, now that they are complete.

Update: Added RadiantMatrix's solution.

Update: Added Skeeve's bugfixes. Thoroughly verified the results of every test. Removed BrowserUK's solution since it was buggy. Applied hv's microoptimizations

Comment on Re: Question about speeding a regexp count Select or Download Code

Replies are listed 'Best First'.
Re^2: Question about speeding a regexp count by hv (Prior) on Oct 14, 2005 at 16:12 UTC
You can squeeze out some micro-optimisations. In general, `++$x` is slightly faster than `$x++`; playing with skeeve3_i, I also found that `/.../` was faster than `/.{3}/` (to an extent that surprised me). `Rate l3epost l3epre l3ipost l3ipre l3epost 42.2/s -- -3% -8% -10% l3epre 43.4/s 3% -- -5% -8% l3ipost 45.8/s 9% 6% -- -3% l3ipre 47.1/s 12% 9% 3% --` [download] is the output from this code: `use strict; use Benchmark qw/ cmpthese /; our $a = "abcdefghij" x 1000; cmpthese(-1, { l3epost => q{ my %a; $a{$1}++ while $a =~ /(?=(.{3}))/g }, l3epre => q{ my %a; ++$a{$1} while $a =~ /(?=(.{3}))/g }, l3ipost => q{ my %a; $a{$1}++ while $a =~ /(?=(...))/g }, l3ipre => q{ my %a; ++$a{$1} while $a =~ /(?=(...))/g }, });` [download] It shouldn't be hard to improve perl to compile `$x++` to a preincrement in void context; fixing the regexp engine to make an explicit fixed count as fast as unrolling is likely to be harder. Update: Now I'm confused: I checked the source, and void postincrement should be getting replaced with preincrement at compile time; I definitely don't understand now why explicit preinc is showing consistently faster times. Hugo	[reply] [d/l] [select]
Re^2: Question about speeding a regexp count by Commander Salamander (Acolyte) on Oct 13, 2005 at 20:42 UTC
Very interesting. Looks like I should have gone with sauoq :). Thanks to everyone for their assistance!	[reply]
Re^2: Question about speeding a regexp count by Skeeve (Parson) on Oct 13, 2005 at 21:58 UTC
I'd love to see results with 6_500_000 compared to my code "a1" which is reading 1kb chuncks of data from disk. ;-) `s$$([},&%#}/&/]+}%&{});#$&&s&&$^X.($'^"%]=\&(\|?{%` `+`.+=%;.#_}\&"^"-+%*).}%:##%}={~=~:.")&e&&s""`$''`"e	[reply] [d/l] [select]