Benchmarks:
s/iter limbic injun radiant ikegami radiant_i sauoq skeeve3 +sauoq_i skeeve3_i skeeve2 limbic 3.35 -- -1% -34% -47% -56% -71% -77% + -77% -81% -91% injun 3.33 1% -- -34% -46% -56% -71% -77% + -77% -81% -91% radiant 2.21 52% 51% -- -19% -33% -56% -65% + -66% -71% -87% ikegami 1.79 87% 86% 23% -- -18% -46% -57% + -58% -64% -84% radiant_i 1.47 128% 126% 50% 22% -- -34% -48% + -48% -56% -80% sauoq 0.969 246% 244% 128% 85% 52% -- -21% + -22% -34% -70% skeeve3 0.769 336% 333% 187% 133% 91% 26% -- + -1% -17% -62% sauoq_i 0.759 342% 339% 191% 136% 94% 28% 1% + -- -16% -61% skeeve3_i 0.641 423% 420% 244% 180% 130% 51% 20% + 19% -- -54% skeeve2 0.294 1042% 1035% 651% 511% 401% 230% 162% + 159% 118% --
Benchmarks code:
use strict; use warnings; use Benchmark qw( cmpthese ); # Use constant to provide comparable results. use constant GROUP_LENGTH => 3; # Generate a random string all tests will use. my $seq = join '', map { (qw[ A C G T ])[rand(4)] } 1..600_000; # --- sub sauoq { my %count; my $length = length $seq; for my $i (0..$length-1) { ++$count{substr($seq, $i, 1)}; ++$count{substr($seq, $i, 2)} if $i < $length - 1; ++$count{substr($seq, $i, 3)} if $i < $length - 2; } return \%count; } sub sauoq_i { my %count; my $length = length($seq); for my $i (0..$length-3) { ++$count{substr($seq, $i, 1)}; ++$count{substr($seq, $i, 2)}; ++$count{substr($seq, $i, 3)}; } ++$count{substr($seq, -2, 2)}; ++$count{substr($seq, -2, 1)}; ++$count{substr($seq, -1, 1)}; return \%count; } sub limbic { my %count; my $template = ('AXA2X2A3X2' x (length($seq) - 2)) . 'AXA2XA'; ++$count{$_} foreach unpack $template, $seq; return \%count; } sub injun { my %count; my @bases = split //, $seq; for my $i (0..$#bases) { ++$count{$bases[$i] }; ++$count{$bases[$i].$bases[$i+1] } if defined $bases +[$i+1]; ++$count{$bases[$i].$bases[$i+1].$bases[$i+2]} if defined $bases +[$i+2]; } return \%count; } sub ikegami { my %count; ++$count{$1} while $seq =~ /(?=(.))/g; ++$count{$1} while $seq =~ /(?=(..))/g; ++$count{$1} while $seq =~ /(?=(...))/g; return \%count; } sub skeeve2 { my %count; for my $i (0..length($seq)-GROUP_LENGTH()) { ++$count{substr($seq, $i, GROUP_LENGTH)}; } my @keys = keys %count; foreach my $key (@keys) { for my $i (1..GROUP_LENGTH-1) { $count{substr($key, 0, $i)} += $count{$key}; } } for my $i (1..GROUP_LENGTH-1) { for my $j ($i..GROUP_LENGTH-1) { ++$count{substr($seq, -$j, $i)}; } } return \%count; } sub skeeve3 { my %count; ++$count{"$1$2"} while $seq =~ /(.)(?=(..))/g; my @keys = keys %count; foreach my $key (@keys) { $count{substr($key, 0, 1)} += $count{$key}; $count{substr($key, 0, 2)} += $count{$key}; } ++$count{substr($seq, -2, 2)}; ++$count{substr($seq, -2, 1)}; ++$count{substr($seq, -1, 1)}; return \%count; } sub skeeve3_i { my %count; $count{$1}++ while $seq =~ /(?=(...))/g; my @keys = keys %count; foreach my $key (@keys) { $count{substr($key, 0, 1)} += $count{$key}; $count{substr($key, 0, 2)} += $count{$key}; } ++$count{substr($seq, -2, 2)}; ++$count{substr($seq, -2, 1)}; ++$count{substr($seq, -1, 1)}; return \%count; } sub radiant { my %count; my @rolling; $#rolling = 2; for my $i (0..length($seq)-1) { my $char = substr($seq, $i, 1); shift @rolling; push @rolling, $char; $count{$rolling[2]}++; #one-char count next if not defined $rolling[1]; $count{join('',@rolling[1,2])}++; #two-char count next if not defined $rolling[0]; $count{join('',@rolling)}++; #three-char count } return \%count; } sub radiant_i { my %count; my @rolling; $#rolling = 2; for my $i (0..length($seq)-1) { my $char = substr($seq, $i, 1); shift @rolling; push @rolling, $char; next if $i < 2; $count{join('', @rolling)}++; } my @keys = keys %count; foreach my $key (@keys) { $count{substr($key, 0, 1)} += $count{$key}; $count{substr($key, 0, 2)} += $count{$key}; } ++$count{substr($seq, -2, 2)}; ++$count{substr($seq, -2, 1)}; ++$count{substr($seq, -1, 1)}; return \%count; } # --- sub hash_count { my ($count) = @_; my $total = 0; $total += $_ foreach values %$count; return join ',', "total=>$total", map { sprintf("%s=>%s", $_, $count->{$_}) } sort { length($a) <=> length($b) || $a cmp $b } keys %$count; } my @tests = qw( sauoq sauoq_i limbic injun ikegami skeeve2 skeeve3 skeeve3_i radiant radiant_i ); { my $ok = 1; my @results = map { hash_count((\&$_)->()) } @tests; for my $i (0..$#tests) { for my $j (0..$#tests) { # or start at $i+1 if ($results[$i] ne $results[$j]) { $ok = 0; print("$tests[$j] differs from $tests[$i].\n"); } } } if ($ok) { print("No differences encountered.\n"); print("\n"); } else { print("\n"); print("Differences encountered.\n"); print("\$seq: $seq\n"); print("$tests[$_]: $results[$_]\n") for (0..$#tests); die("\n"); } } cmpthese(5, { map { $_ => \&$_ } @tests });
I edited the solutions to use the same style and to work from memory. I removed any extra work which could be easily added to any solution (such as validation checks).
I thoroughly checked all the solutions. All of produce indentical results.
I only ran the tests for 5 iterations because it's slow, but I ran it many times. All results where statistically identical.
The solutions by BrowserUK, InjunJoel and particularly Limbic~Region use much more memory than the others. All solutions expect the data to be fully in memory.
BrowserUK's solution requires Perl 5.8+.
Update: Reran on 5.8 to time BrowserUK.
Update: Added Skeeve's solutions, now that they are complete.
Update: Added RadiantMatrix's solution.
Update: Added Skeeve's bugfixes. Thoroughly verified the results of every test. Removed BrowserUK's solution since it was buggy. Applied hv's microoptimizations
In reply to Re: Question about speeding a regexp count
by ikegami
in thread Question about speeding a regexp count
by Commander Salamander
| For: | Use: | ||
| & | & | ||
| < | < | ||
| > | > | ||
| [ | [ | ||
| ] | ] |