comment on

Benchmarks:

          s/iter limbic injun radiant ikegami radiant_i sauoq skeeve3 
+sauoq_i skeeve3_i skeeve2
limbic      3.35     --   -1%    -34%    -47%      -56%  -71%    -77% 
+   -77%      -81%    -91%
injun       3.33     1%    --    -34%    -46%      -56%  -71%    -77% 
+   -77%      -81%    -91%
radiant     2.21    52%   51%      --    -19%      -33%  -56%    -65% 
+   -66%      -71%    -87%
ikegami     1.79    87%   86%     23%      --      -18%  -46%    -57% 
+   -58%      -64%    -84%
radiant_i   1.47   128%  126%     50%     22%        --  -34%    -48% 
+   -48%      -56%    -80%
sauoq      0.969   246%  244%    128%     85%       52%    --    -21% 
+   -22%      -34%    -70%
skeeve3    0.769   336%  333%    187%    133%       91%   26%      -- 
+    -1%      -17%    -62%
sauoq_i    0.759   342%  339%    191%    136%       94%   28%      1% 
+     --      -16%    -61%
skeeve3_i  0.641   423%  420%    244%    180%      130%   51%     20% 
+    19%        --    -54%
skeeve2    0.294  1042% 1035%    651%    511%      401%  230%    162% 
+   159%      118%      --
[download]

Benchmarks code:

use strict;
use warnings;

use Benchmark qw( cmpthese );


# Use constant to provide comparable results.
use constant GROUP_LENGTH => 3;


# Generate a random string all tests will use.
my $seq = join '',
          map { (qw[ A C G T ])[rand(4)] }
          1..600_000;


# ---


sub sauoq {
   my %count;

   my $length = length $seq;
   for my $i (0..$length-1) {
      ++$count{substr($seq, $i, 1)};
      ++$count{substr($seq, $i, 2)} if $i < $length - 1;
      ++$count{substr($seq, $i, 3)} if $i < $length - 2;
   }

   return \%count;
}


sub sauoq_i {
   my %count;

   my $length = length($seq);
   for my $i (0..$length-3) {
      ++$count{substr($seq, $i, 1)};
      ++$count{substr($seq, $i, 2)};
      ++$count{substr($seq, $i, 3)};
   }

   ++$count{substr($seq, -2, 2)};
   ++$count{substr($seq, -2, 1)};
   ++$count{substr($seq, -1, 1)};

   return \%count;
}


sub limbic {
   my %count;
   my $template = ('AXA2X2A3X2' x (length($seq) - 2)) . 'AXA2XA';
   ++$count{$_} foreach unpack $template, $seq;
   return \%count;
}


sub injun {
   my %count;

   my @bases = split //, $seq;

   for my $i (0..$#bases) {
      ++$count{$bases[$i]                          };
      ++$count{$bases[$i].$bases[$i+1]             } if defined $bases
+[$i+1];
      ++$count{$bases[$i].$bases[$i+1].$bases[$i+2]} if defined $bases
+[$i+2];
   }

   return \%count;
}


sub ikegami {
   my %count;
   ++$count{$1} while $seq =~ /(?=(.))/g;
   ++$count{$1} while $seq =~ /(?=(..))/g;
   ++$count{$1} while $seq =~ /(?=(...))/g;
   return \%count;
}


sub skeeve2 {
   my %count;

   for my $i (0..length($seq)-GROUP_LENGTH()) {
      ++$count{substr($seq, $i, GROUP_LENGTH)};
   }

   my @keys = keys %count;
   foreach my $key (@keys) {
      for my $i (1..GROUP_LENGTH-1) {
         $count{substr($key, 0, $i)} += $count{$key};
      }
   }

   for my $i (1..GROUP_LENGTH-1) {
      for my $j ($i..GROUP_LENGTH-1) {
         ++$count{substr($seq, -$j, $i)};
      }
   }

   return \%count;
}


sub skeeve3 {
   my %count;

   ++$count{"$1$2"} while $seq =~ /(.)(?=(..))/g;

   my @keys = keys %count;
   foreach my $key (@keys) {
      $count{substr($key, 0, 1)} += $count{$key};
      $count{substr($key, 0, 2)} += $count{$key};
   }

   ++$count{substr($seq, -2, 2)};
   ++$count{substr($seq, -2, 1)};
   ++$count{substr($seq, -1, 1)};

   return \%count;
}


sub skeeve3_i {
   my %count;

   $count{$1}++ while $seq =~ /(?=(...))/g;

   my @keys = keys %count;
   foreach my $key (@keys) {
      $count{substr($key, 0, 1)} += $count{$key};
      $count{substr($key, 0, 2)} += $count{$key};
   }

   ++$count{substr($seq, -2, 2)};
   ++$count{substr($seq, -2, 1)};
   ++$count{substr($seq, -1, 1)};

   return \%count;
}


sub radiant {
   my %count;

   my @rolling;
   $#rolling = 2;

   for my $i (0..length($seq)-1) {
      my $char = substr($seq, $i, 1);

      shift @rolling;
      push @rolling, $char;

      $count{$rolling[2]}++;            #one-char count

      next if not defined $rolling[1];
      $count{join('',@rolling[1,2])}++; #two-char count

      next if not defined $rolling[0]; 
      $count{join('',@rolling)}++;      #three-char count
   }

   return \%count;
}


sub radiant_i {
   my %count;

   my @rolling;
   $#rolling = 2;

   for my $i (0..length($seq)-1) {
      my $char = substr($seq, $i, 1);

      shift @rolling;
      push @rolling, $char;

      next if $i < 2;
      $count{join('', @rolling)}++;
   }

   my @keys = keys %count;
   foreach my $key (@keys) {
      $count{substr($key, 0, 1)} += $count{$key};
      $count{substr($key, 0, 2)} += $count{$key};
   }

   ++$count{substr($seq, -2, 2)};
   ++$count{substr($seq, -2, 1)};
   ++$count{substr($seq, -1, 1)};

   return \%count;
}


# ---


sub hash_count {
   my ($count) = @_;

   my $total = 0;
   $total += $_ foreach values %$count;

   return join ',',
          "total=>$total", 
          map { sprintf("%s=>%s", $_, $count->{$_}) }
          sort { length($a) <=> length($b) || $a cmp $b }
          keys %$count;
}


my @tests = qw(
   sauoq
   sauoq_i
   limbic
   injun
   ikegami
   skeeve2
   skeeve3
   skeeve3_i
   radiant
   radiant_i
);


{
   my $ok = 1;

   my @results = map { hash_count((\&$_)->()) } @tests;

   for my $i (0..$#tests) {
      for my $j (0..$#tests) {                # or start at $i+1
         if ($results[$i] ne $results[$j]) {
            $ok = 0;
            print("$tests[$j] differs from $tests[$i].\n");
         }
      }
   }

   if ($ok) {
      print("No differences encountered.\n");
      print("\n");
   } else {
      print("\n");
      print("Differences encountered.\n");
      print("\$seq: $seq\n");
      print("$tests[$_]: $results[$_]\n") for (0..$#tests);
      die("\n");
   }
}


cmpthese(5, { map { $_ => \&$_ } @tests });
[download]

I edited the solutions to use the same style and to work from memory. I removed any extra work which could be easily added to any solution (such as validation checks).

I thoroughly checked all the solutions. All of produce indentical results.

I only ran the tests for 5 iterations because it's slow, but I ran it many times. All results where statistically identical.

The solutions by BrowserUK, InjunJoel and particularly Limbic~Region use much more memory than the others. All solutions expect the data to be fully in memory.

BrowserUK's solution requires Perl 5.8+.

Update: Reran on 5.8 to time BrowserUK.

Update: Added Skeeve's solutions, now that they are complete.

Update: Added RadiantMatrix's solution.

Update: Added Skeeve's bugfixes. Thoroughly verified the results of every test. Removed BrowserUK's solution since it was buggy. Applied hv's microoptimizations

In reply to Re: Question about speeding a regexp count by ikegami
in thread Question about speeding a regexp count by Commander Salamander

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&`
	<		`<`
	>		`>`
	[		`[`
	]		`]`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.