Interesting. I had a similar partial synthetic benchmark yesterday and thought of publishing it, mainly to advise against my "seek" solution as too slow, but then decided not to :), because maybe it's not worth the readers' effort.
Nevertheless, here are somewhat different results for a 1-million-line file on fast NVMe SSD storage. Below is the case for returning a hash with character counts, but it's similar for returning a string.
use strict;
use warnings;
use feature 'say';
use String::Random 'random_regex';
use Benchmark 'cmpthese';
use Test::More 'no_plan';
# Benchmark configuration: the data file, and the 1-based column whose
# characters every implementation below tallies.
my $fn = 'dna.txt';
my $POS = 10;

# Generate the test data once: 1e6 lines, each 42 random characters drawn
# from [ACTG] followed by a newline. Skipped when the file already exists.
unless ( -e $fn ) {
    open my $fh, '>', $fn
        or die "Cannot open '$fn' for writing: $!";
    print {$fh} random_regex( '[ACTG]{42}' ), "\n"
        for 1 .. 1e6;
    # Buffered write errors only surface at close, so check it explicitly.
    close $fh
        or die "Cannot close '$fn' after writing: $!";
}

# Sanity check before timing: every implementation must produce the same
# tally as the plain line-by-line _substr version.
is_deeply _seek(), _substr(), 'same results';
is_deeply slurp(), _substr(), 'same results';
is_deeply buk(), _substr(), 'same results';

# Time each implementation for 3 iterations and print the comparison table.
cmpthese( 3, {
    substr => \&_substr,
    seek => \&_seek,
    buk => \&buk,
    slurp => \&slurp,
});
# Tally the character at column $POS (1-based) of every record by slurping
# the whole file into one string and indexing into it at a fixed stride.
# Assumes every record is exactly 43 bytes (42 characters plus a newline).
# Returns a hash reference mapping character => count, or undef when the
# file is empty. Dies if the file cannot be opened.
sub slurp {
    open my $fh, '<', $fn
        or die "Cannot open '$fn' for reading: $!";
    # Undefining $/ locally makes the readline return the entire file.
    my $s = do { local $/ = undef; <$fh> };
    my $count;
    # Record $_ starts at byte 43 * $_; the range operator truncates the
    # upper bound, so a trailing partial record is ignored.
    $count->{ substr $s, $POS - 1 + 43 * $_, 1 }++
        for 0 .. length( $s ) / 43 - 1;
    return $count;
}
# Tally the character at column $POS (1-based) of every line by reading each
# line into a reusable buffer and looking at it through a reference to an
# lvalue substr that was taken once, up front.
# Returns a hash reference mapping character => count, or undef when the
# file is empty. Dies if the file cannot be opened.
sub buk {
    open my $fh, '<', $fn
        or die "Cannot open '$fn' for reading: $!";
    # Pre-sized buffer of 43 NUL characters (the expected record length).
    my $buf = chr( 0 ) x 43;
    # Reference to the one-character lvalue at offset $POS - 1 of $buf.
    # NOTE(review): this relies on the lvalue continuing to reflect $buf's
    # current contents after later assignments; see perlfunc "substr".
    my $ref = \substr( $buf, $POS - 1, 1 );
    my $count;
    until ( eof $fh ) {
        # Overwrite the buffer's contents through substr instead of
        # assigning a new value to $buf itself.
        substr( $buf, 0 ) = <$fh>;
        $count->{ $$ref }++;
    }
    return $count;
}
# Tally the character at column $POS (1-based) of every record without
# reading whole lines: read a single character with getc, then seek forward
# to the same column of the next record.
# Assumes all records have the same length as the first line.
# Returns a hash reference mapping character => count, or undef when the
# file is empty. Dies if the file cannot be opened.
sub _seek {
    open my $fh, '<', $fn
        or die "Cannot open '$fn' for reading: $!";
    my $count;
    # Measure the record length from the first line (newline included).
    my $first = <$fh>;
    # Empty file: nothing to tally, and no length to compute.
    return $count unless defined $first;
    # getc consumes one character, so skipping length - 1 afterwards lands
    # on the same column of the following record.
    my $L = length( $first ) - 1;
    # Whence 0: absolute position, rewinding to the column in record one.
    seek $fh, $POS - 1, 0;
    until ( eof $fh ) {
        $count->{ getc $fh }++;
        # Whence 1: relative to the current position.
        seek $fh, $L, 1;
    }
    return $count;
}
# Reference implementation: read the file line by line and tally the
# character at column $POS (1-based) of each line with substr.
# Returns a hash reference mapping character => count, or undef when the
# file is empty. Dies if the file cannot be opened.
sub _substr {
    open my $fh, '<', $fn
        or die "Cannot open '$fn' for reading: $!";
    my $count;
    # A lexical line variable avoids clobbering the caller's global $_,
    # which a bare `while <$fh>` assigns to without localising.
    while ( my $line = <$fh> ) {
        $count->{ substr $line, $POS - 1, 1 }++;
    }
    return $count;
}
$ perl -v
This is perl 5, version 26, subversion 0 (v5.26.0) built for x86_64-linux-thread-multi
(with 1 registered patch, see perl -V for more detail)