in reply to Faster and more efficient way to read a file vertically
I put together a benchmark for most of the suggested solutions (or adaptations of them to get consistent results) and ran tests against an inline dataset of 50 lines with Test::More then with a 50,000 line file produced by this one-liner.
perl -E '
my @alpha = ( qw{ A C G T } ) x 5;
push @alpha, qw{ . . };
say join q{}, map { $alpha[ rand @alpha ] } 1 .. 50
for 1 .. 50000;' > spw1202693.txt
Here's the script.
use strict;
use warnings;
use Benchmark qw{ cmpthese };
use Test::More qw{ no_plan };
open my $inFH, q{<}, \ <<__EOD__ or die $!;
TATCGC.TGCCC.ATTAAGCCATACTTCAAGGATCCCCCCG.GA.GGGCA
GGAGTCGTC.ACACATCTTCACTAC.CATATCTTGCTACGGCCACTGACA
CGA.A.CAATTTTCGAATGGAGGGCGAATGCCGTTGTGCGCTGCGTGACG
TCT.AGCCT.CGAA.A.GCAGGCGTGGGGCGTACCACGGCTTGGCCTAT.
ATGCACTT.AA.CCCTCGTAT.CTCTACTCT.ACAACCTTGGGCAG.T.T
TAGGGC.CCGCG.TAAACTGCAGATAGTACTCCAAGAATCGCTCCGACCC
.CGTCTAAACAATTAGCGGAGGTCGTC.CTGCAA.CAATGATCTTAACAC
AG.AAGAAGT.CAAAGT.GTAGTGGCTGGGTACTTGATCA.TCAATTTCA
ACC.TTCCAG.AAATGAGGTCTC.AAAAG.AGTT.CCTG.GGCTGTGTAG
ACAT.A.TGTTAACACCACCTATAATAGAAGC.TTATATTCACC.TTAAC
GAATTGT.TTCAATGCCACATA.GCTGGGCCAC.GCCTTAGGCTATTT.A
AGCTTTACAGATGT.ACCAAA.CCAG.GACTAGATGGGGGGG.ATATGCC
AGAAAGCCC.TCCCTGCGAAT.TGGGGGAGTACT.ATAGACTA.GGCCAG
.CACTCCGG.CTAACACTACTTCCTGTAACAAA.CTGAAGAGA.CGTTTG
CAGTCCTGAGCGTGCTAC..GTTCT.CTGTCG.TTAGATGGGCGC..GTA
AGCAAGAAAACACGTACAGAAAAAGCCGACGGC.GGGGTTC.GTCAACC.
T.ACTTATCGACATTAGCATCTGG.TGCCGCCTTT.GAACCCATTACTCG
TAAACGA.CACCGCTTAGGGCTGACCTCCGAA.TTATCAGAGTACCGGGC
ACGGATATCCTA.ATAGA.TACCTATGTAGGAAG.TCGAACTC.TAAACT
ACCCCCGTCC.TATCCTC.CGA.ATTGCCCCCGGCCGTACTCCA.AAACC
T..TC.AGCCGGG.TTG.TGTGGATCAGAAGTGATTGAGACTGG.GCCTA
TCAACCTTGAAGTTGAAACTCCGAGAGT.CGCGGTCAACC.AGCCTGCGC
GATCAGTGAGGAGCTGT.C.AAAGC.TC.AG.GCC.G.GA.GTGAGATT.
G.C.AGTTTGCCACTT.CATCCAGAGTTCATCCAGCAGG.ATTGAAGTTA
GCGCTGCA.CTGT.CATTTTTTATTCCCACCCGGTCTCCCCAACCCCGAT
TGTATAGCCAGG.CGGAG.TTTCTT.TAGATCTAGTAAGACATT.CCCGA
AC.CCCCT.G.TAGCTAAATCGACGGG..GTCAATATACACGGAT.CTTT
AAGC.GGGCTGATGCTTATCTCCTAGCCCGC.CCTCGATGAT.ATTATTG
TCACCC.ACCGC..T.CGGAACGAAAACT..AT.CCTATTAGAACATCCT
.GCGTGAACC.TG.G.TACACCG.GATGGTCCCGAC.GGCTACCGAT.CA
.CGCTGCCCCCATCTCCAGCTATC.AAA.AGGGACGCGATATGCGGAAGC
CGTTCAGACAATCCTTTTGGGGTGTAAATGTCTCC.ACCTC.GAGA.CTG
AAGACATAGGAG.CCAGAAT.A.CGTCATACAGAGGCACTC.TAT..TCT
AGGCCTGCCTACTT.TTGGCTAA.C.AGACTTGG.AAG.ATGTAGAAC.G
GTCAACCCGTGCTAACTGGGGTGAGGAATCTTCCGAGCC.TGCTCGTCGC
TCG.TGGCAGCTC.AACTGGTGCGCGGCAG.CCTCCTGCCAAGTATTCAG
CGGGGGCTAC..GTT.TCATCGAACACGGCACACTAACAAACTCCTGTGT
TTCCGGGGTCGACCCTTTGGCCCAAGAGTGA.AGGGCTTCG.ACTGCG..
AA.CGTGCGGGTAGCCTAGACACGTAGC.TTGTGCGC.CG.CGCA.CAAG
AT.TATGTCA.ACTTCCGCCGGG.CTTCTGTGTACATT.AA.GAGAATAA
CGGCAAGGATTGCTCGACGTAGGAGTCCGTGGAGCTCGTGC.GACC.ACC
.GTTAA.CACGCCTTAACTTTTCGGAACAGAGTAAC.AATCCCGG.TA.C
CAGGTAATGTGTCACCAGGTTCGGGCCCT.CACCGTCCCAGCTAC.TGTT
TAGCCCCTCTTTT.GAT.GGCCC.AGCGACATCAA.TGATC..CTGTAGG
CGATCATATTTCATTGTTCCGC.TG.AGCGGT.A.TG.GCAAT.CAGCCG
ATCGATGTTCTGATATG..GTGTGAATAC.AGAAACCGGCTTTGTCGGGG
CCTTT.AGGAA.AC.TAGGT.TT.CTCAATGAAC.GACATCAAC.T.AGC
AAGGAGGTACACAGCGTTCAGCGGATCC.CT.AAGC.TAGCATCTGCTGA
GCAGATA.A.TCTGCTTCAACTGTGAAAGGGTTG.CTAATCAG.GCGTAG
GGTACGATC.GCAAC.AGTCCACAAGTACACGGTGGAATT.CC..C.TTG
__EOD__
my $fileStart = tell $inFH;
my $offset = 9; # Column 10 if numbering from 1
my $testOK = q{C.T.AGCTGTTG..GAGACCCAGGCAGTCCCAGTTGCCGATCTTTCAC..};
my %methods = (
rsubstr => sub { # BrowserUk's method
seek $inFH, 0, 0;
my $buffer;
my $rsWanted = \ substr $buffer, $offset, 1;
my $retStr;
while ( $buffer = <$inFH> )
{
$retStr .= ${ $rsWanted };
}
return \ $retStr;
},
brutish => sub { # Adapted from Anonymonk's brute force method
seek $inFH, 0, 0;
my $retStr;
while ( <$inFH> )
{
my @split = split m{};
$retStr .= $split[ $offset ];
}
return \ $retStr;
},
seek => sub { # Suggested by vr
seek $inFH, 0, 0;
my $retStr;
my $char;
my $len = length( <$inFH> ) - 1;
seek $inFH, $offset, 0;
while ( read $inFH, $char, 1 )
{
$retStr .= $char;
seek $inFH, $len, 1
}
return \ $retStr;
},
substr => sub { # Suggested by pryrt and Laurent_R
seek $inFH, 0, 0;
my $retStr;
while ( <$inFH> )
{
$retStr .= substr $_, $offset, 1;
}
return \ $retStr;
},
regex => sub { # Another of pryrt's solutions
seek $inFH, 0, 0;
my $retStr;
while ( <$inFH> )
{
$retStr .= $1 if m{^.{$offset}(.)};
}
return \ $retStr;
},
unpack => sub { # Suggested but not implemented by LanX & pryrt
seek $inFH, 0, 0;
my $retStr;
my $fmt = qq{x${offset}a};
while ( <$inFH> )
{
$retStr .= unpack $fmt, $_;
}
return \ $retStr;
},
unpackM => sub { # Multi-line unpack suggested by LanX
seek $inFH, 0, 0;
my $buffer = <$inFH>;
my $lineLen = length $buffer;
my $nLines = 500;
my $chunkSize = $lineLen * $nLines;
seek $inFH, 0, 0;
my $retStr;
my $fmt = qq{(x${offset}ax@{ [ $lineLen - $offset - 1 ] })*};
while ( my $bytesRead = read $inFH, $buffer, $chunkSize )
{
$retStr .= join q{}, unpack $fmt, $buffer;
}
return \ $retStr;
},
split => sub { # Simple split from thanos1983
seek $inFH, 0, 0;
my $retStr;
my $fmt = qq{x${offset}a};
while ( <$inFH> )
{
$retStr .= ( split m{} )[ $offset ];
}
return \ $retStr;
},
pushAoA => sub { # In memory solution from Discipulus
seek $inFH, 0, 0;
my @aoa;
while ( <$inFH> )
{
chomp;
push @aoa, [ split m{} ];
}
my $retStr = join q{}, map { $aoa[ $_ ]->[ $offset ] } 0 .. $#a
+oa;
return \ $retStr;
},
ANDmask => sub { # Multi-line AND mask by johngg
seek $inFH, 0, 0;
my $buffer = <$inFH>;
my $lineLen = length $buffer;
my $nLines = 500;
my $chunkSize = $lineLen * $nLines;
seek $inFH, 0, 0;
my $retStr;
my $mask
= qq{\x00} x ${offset}
. qq{\xff}
. qq{\x00} x ( $lineLen - $offset - 1 );
$mask x= $nLines;
while ( my $bytesRead = read $inFH, $buffer, $chunkSize )
{
( my $anded = $buffer & $mask ) =~ tr{\x00}{}d;
$retStr .= $anded;
}
return \ $retStr;
},
);
foreach my $method ( sort keys %methods )
{
ok( ${ $methods{ $method }->() } eq $testOK, $method );
}
close $inFH or die $!;
my $filename = q{spw1202693.txt};
open $inFH, q{<}, $filename
or die qq{open: < $filename: $!\n};
cmpthese(
-3,
{
map
{
my $codeStr
= q[sub { my $col = $methods{ ]
. $_
. q[ }->(); }];
$_ => eval $codeStr;
}
keys %methods
}
);
close $inFH
or die qq{close: < $filename: $!\n};
And the results.
ok 1 - ANDmask
ok 2 - brutish
ok 3 - pushAoA
ok 4 - regex
ok 5 - rsubstr
ok 6 - seek
ok 7 - split
ok 8 - substr
ok 9 - unpack
ok 10 - unpackM
Rate pushAoA brutish split seek regex unpack substr rsubstr
+ unpackM ANDmask
pushAoA 1.11/s -- -35% -61% -62% -91% -97% -98% -98%
+ -98% -99%
brutish 1.71/s 55% -- -39% -41% -86% -95% -96% -96%
+ -97% -98%
split 2.82/s 155% 65% -- -3% -77% -92% -94% -94%
+ -95% -97%
seek 2.91/s 163% 70% 3% -- -76% -92% -94% -94%
+ -95% -97%
regex 12.3/s 1010% 617% 336% 322% -- -65% -74% -75%
+ -79% -88%
unpack 35.0/s 3060% 1943% 1141% 1102% 185% -- -25% -27%
+ -40% -67%
substr 46.9/s 4137% 2638% 1564% 1512% 282% 34% -- -3%
+ -20% -55%
rsubstr 48.2/s 4254% 2714% 1610% 1556% 292% 38% 3% --
+ -18% -54%
unpackM 58.7/s 5194% 3321% 1979% 1914% 377% 68% 25% 22%
+ -- -44%
ANDmask 105/s 9407% 6045% 3634% 3517% 757% 201% 124% 118%
+ 80% --
1..10
The two substr solutions are neck and neck in the lead, unpack a distant third and everything else well behind. However, I have cocked up benchmarks before so take this with a pinch of salt!
Update: Corrected attribution of the "unpack" method and incorporated the two new methods and benchmark results from this post. Working with multi-line buffers using unpack or a mask to AND with the buffer seems to be the fastest approach.
Re^2: Faster and more efficient way to read a file vertically
by vr (Curate) on Nov 05, 2017 at 17:22 UTC
|
Interesting. I had similar partial synthetic benchmark yesterday, thought to publish it mainly to advice against my "seek" solution as too slow, then decided not to :), because maybe it's not worth readers' effort.
Nevertheless, somewhat different results for a 1 million lines file, and fast NVMe SSD storage. Below is the case for returning a hash with chars counts, but it's similar for returning string.
$ perl vert2.pl
ok 1 - same results
ok 2 - same results
ok 3 - same results
(warning: too few iterations for a reliable count)
(warning: too few iterations for a reliable count)
(warning: too few iterations for a reliable count)
(warning: too few iterations for a reliable count)
Rate seek buk substr slurp
seek 0.920/s -- -61% -84% -88%
buk 2.36/s 157% -- -58% -69%
substr 5.66/s 515% 140% -- -26%
slurp 7.69/s 736% 226% 36% --
1..3
| [reply] [Watch: Dir/Any] [d/l] [select] |
|
The following provides a parallel version for the slurp routine. I'm not sure why or where to look, running MCE via cmpthese reports inaccurately with MCE being 300x faster which is wrong. So, I needed to benchmark another way.
Regarding MCE, workers receive the next chunk and tally using a local hash. Then, update the shared hash.
use strict;
use warnings;
use MCE;
use MCE::Shared;
use String::Random 'random_regex';
use Time::HiRes 'time';
my $fn = 'dna.txt';
my $POS = 10;
my $shrcount = MCE::Shared->hash();
my $mce;
unless ( -e $fn ) {
open my $fh, '>', $fn;
print $fh random_regex( '[ACTG]{42}' ), "\n"
for 1 .. 1e6;
}
sub slurp {
open my $fh, '<', $fn;
my $s = do { local $/ = undef; <$fh> };
my $count;
$count-> { substr $s, $POS - 1 + 43 * $_, 1 }++
for 0 .. length( $s ) / 43 - 1;
return $count
}
sub mce {
unless ( defined $mce ) {
$mce = MCE->new(
max_workers => 4,
chunk_size => '300k',
use_slurpio => 1,
user_func => sub {
my ( $mce, $slurp_ref, $chunk_id ) = @_;
my ( $count, @todo );
$count-> { substr ${ $slurp_ref }, $POS - 1 + 43 * $_, 1 }++
for 0 .. length( ${ $slurp_ref } ) / 43 - 1;
# Each key involves one IPC trip to the shared-manager.
#
# $shrcount->incrby( $_, $count->{$_} )
# for ( keys %{ $count } );
# The following is faster for smaller chunk size.
# Basically, send multiple commands at once.
#
push @todo, [ "incrby", $_, $count->{$_} ]
for ( keys %{ $count } );
$shrcount->pipeline( @todo );
}
)->spawn();
}
$shrcount->clear();
$mce->process($fn);
return $shrcount->export();
}
for (qw/ slurp mce /) {
no strict 'refs';
my $start = time();
my $func = "main::$_";
$func->() for 1 .. 3;
printf "%5s: %0.03f secs.\n", $_, time() - $start;
}
__END__
slurp: 0.487 secs.
mce: 0.149 secs.
| [reply] [Watch: Dir/Any] [d/l] |
Re^2: Faster and more efficient way to read a file vertically
by LanX (Saint) on Nov 05, 2017 at 21:48 UTC
|
> unpack => sub { # Suggested but not implemented by pryrt
Actually unpack was suggested (and not implemented) by me first. ;)
FWIW: My idea was to unpack multiple lines simultaneously instead of going line by line.
If you are interested and all lines really have the same length (the OP never clarified)
- read a chunk of complete lines bigger 4 or 8kb (depending on the blocksize of the OS to optimize read operations)
- run a repeated unpack pattern
- get a list of 1 result for each chunk line
Please see if substr on single lines is still faster then.
$line_length += $newline_length; # OS dependend
$line_count = int(8 * 1024 / $line_length) +1;
$chunk_size = $line_count * line_length;
And yes I'm still reluctant to implement it, smells too much like an XY Problem :)
update
In hindsight... probably having a slightly smaller chunk is more efficient :
$line_count = int(8 * 1024 / $line_length) | [reply] [Watch: Dir/Any] [d/l] [select] |
|
| [reply] [Watch: Dir/Any] |
|
use strict;
use warnings;
my $line_count = 10;
my $line = join ("", 'a'..'z',"A".."Z") . "\n";
my $file= "$line" x $line_count;
my $offset = 2;
my $rest = length($line) - $offset -1;
my $fmt = qq{(x${offset}ax${rest})${line_count}};
my @results = unpack $fmt, $file;
print @results;
Will print cccccccccc because of offset 2
| [reply] [Watch: Dir/Any] [d/l] [select] |
|
|
|
|
|