#!/usr/bin/perl
use strict;
use warnings;
use Benchmark;
use Data::Dumper;
my $DNAlength= 600000;
my $sequencelength= 3;
my $testfilename= 'DNA.tst';
my $bufsize= 1024;
# create testdata
open(my $testfile, ">$testfilename") or die "$testfilename: $!";
my $buffer = '';
for (my $i=$DNAlength; $i; --$i) {
$buffer.= qw(A C G T)[rand 4];
if (length($buffer) > $bufsize) {
print $testfile $buffer;
$buffer = '';
}
}
print $testfile $buffer;
close $testfile;
a1: {
# Start collecting data
my %DNA_sequence_count;
my $a1_0=new Benchmark;
open($testfile, $testfilename) or die "$testfilename: $!";
my $kept=0;
while (1) {
my $read_bytes = read($testfile, $buffer, $bufsize-$kept);
last unless $read_bytes;
my $end_buffer = $read_bytes-$sequencelength;
for (my $i=0; $i <= $end_buffer; ++$i) {
++$DNA_sequence_count{substr($buffer, $i, $sequencelength)};
}
$kept= $sequencelength-1;
$buffer= substr($buffer, $end_buffer+1, $kept);
}
close $testfile;
my $a1_n=new Benchmark;
print "the file code took:",timestr(timediff($a1_n, $a1_0)),"\n";
}
a2: {
# Start collecting data
my %DNA_sequence_count;
my $a2_0=new Benchmark;
open($testfile, $testfilename) or die "$testfilename: $!";
my $allbuffered= <$testfile>;
close $testfile;
my $end_buffer= length($allbuffered)-$sequencelength+1;
for (my $i=0; $i <= $end_buffer; ++$i) {
++$DNA_sequence_count{substr($allbuffered, $i, $sequencelength)};
}
my $a2_n=new Benchmark;
print "the scalar code took:",timestr(timediff($a2_n, $a2_0)),"\n";
}
a3: {
# Start collecting data
my %DNA_sequence_count;
my $a3_0=new Benchmark;
open($testfile, $testfilename) or die "$testfilename: $!";
my $allbuffered= <$testfile>;
close $testfile;
++$DNA_sequence_count{$1.$2} while $allbuffered=~ /(.)(?=(..))/g;
my $a3_n=new Benchmark;
print "the regex code took:",timestr(timediff($a3_n, $a3_0)),"\n";
}
####
my (@keys)= keys %DNA_sequence_count;
foreach my $key (@keys) {
for ($i=$sequencelength-1; $i; --$i) {
$DNA_sequence_count{substr($key, 0, $i}+= $DNA_sequence_count{$key};
}
}
# correction again thanks to ikegami!
for ($i=$sequencelength-1; $i; --$i) {
# second correction thanks to sauoq
for ($j=$i; $j<$sequencelength; ++$j) {
++$DNA_sequence_count{substr($allbuffered, -$j, $i)};
}
}
####
CATCAT
gives
CAT 2
ATC 1
TCA 1
Applying that algorithm you get
CA 2 (directly from CAT)
AT 2 (from ATC + AT at the end)
TC 1 (directly from TCA)
C 2
A 2 (from AT at the end)
T 2 (from TCA + last T)