perl -E '
my @alpha = ( qw{ A C G T } ) x 5;
push @alpha, qw{ . . };
say join q{}, map { $alpha[ rand @alpha ] } 1 .. 50
for 1 .. 50000;' > spw1202693.txt
####
use strict;
use warnings;
use Benchmark qw{ cmpthese };
use Test::More qw{ no_plan };
open my $inFH, q{<}, \ <<__EOD__ or die $!;
TATCGC.TGCCC.ATTAAGCCATACTTCAAGGATCCCCCCG.GA.GGGCA
GGAGTCGTC.ACACATCTTCACTAC.CATATCTTGCTACGGCCACTGACA
CGA.A.CAATTTTCGAATGGAGGGCGAATGCCGTTGTGCGCTGCGTGACG
TCT.AGCCT.CGAA.A.GCAGGCGTGGGGCGTACCACGGCTTGGCCTAT.
ATGCACTT.AA.CCCTCGTAT.CTCTACTCT.ACAACCTTGGGCAG.T.T
TAGGGC.CCGCG.TAAACTGCAGATAGTACTCCAAGAATCGCTCCGACCC
.CGTCTAAACAATTAGCGGAGGTCGTC.CTGCAA.CAATGATCTTAACAC
AG.AAGAAGT.CAAAGT.GTAGTGGCTGGGTACTTGATCA.TCAATTTCA
ACC.TTCCAG.AAATGAGGTCTC.AAAAG.AGTT.CCTG.GGCTGTGTAG
ACAT.A.TGTTAACACCACCTATAATAGAAGC.TTATATTCACC.TTAAC
GAATTGT.TTCAATGCCACATA.GCTGGGCCAC.GCCTTAGGCTATTT.A
AGCTTTACAGATGT.ACCAAA.CCAG.GACTAGATGGGGGGG.ATATGCC
AGAAAGCCC.TCCCTGCGAAT.TGGGGGAGTACT.ATAGACTA.GGCCAG
.CACTCCGG.CTAACACTACTTCCTGTAACAAA.CTGAAGAGA.CGTTTG
CAGTCCTGAGCGTGCTAC..GTTCT.CTGTCG.TTAGATGGGCGC..GTA
AGCAAGAAAACACGTACAGAAAAAGCCGACGGC.GGGGTTC.GTCAACC.
T.ACTTATCGACATTAGCATCTGG.TGCCGCCTTT.GAACCCATTACTCG
TAAACGA.CACCGCTTAGGGCTGACCTCCGAA.TTATCAGAGTACCGGGC
ACGGATATCCTA.ATAGA.TACCTATGTAGGAAG.TCGAACTC.TAAACT
ACCCCCGTCC.TATCCTC.CGA.ATTGCCCCCGGCCGTACTCCA.AAACC
T..TC.AGCCGGG.TTG.TGTGGATCAGAAGTGATTGAGACTGG.GCCTA
TCAACCTTGAAGTTGAAACTCCGAGAGT.CGCGGTCAACC.AGCCTGCGC
GATCAGTGAGGAGCTGT.C.AAAGC.TC.AG.GCC.G.GA.GTGAGATT.
G.C.AGTTTGCCACTT.CATCCAGAGTTCATCCAGCAGG.ATTGAAGTTA
GCGCTGCA.CTGT.CATTTTTTATTCCCACCCGGTCTCCCCAACCCCGAT
TGTATAGCCAGG.CGGAG.TTTCTT.TAGATCTAGTAAGACATT.CCCGA
AC.CCCCT.G.TAGCTAAATCGACGGG..GTCAATATACACGGAT.CTTT
AAGC.GGGCTGATGCTTATCTCCTAGCCCGC.CCTCGATGAT.ATTATTG
TCACCC.ACCGC..T.CGGAACGAAAACT..AT.CCTATTAGAACATCCT
.GCGTGAACC.TG.G.TACACCG.GATGGTCCCGAC.GGCTACCGAT.CA
.CGCTGCCCCCATCTCCAGCTATC.AAA.AGGGACGCGATATGCGGAAGC
CGTTCAGACAATCCTTTTGGGGTGTAAATGTCTCC.ACCTC.GAGA.CTG
AAGACATAGGAG.CCAGAAT.A.CGTCATACAGAGGCACTC.TAT..TCT
AGGCCTGCCTACTT.TTGGCTAA.C.AGACTTGG.AAG.ATGTAGAAC.G
GTCAACCCGTGCTAACTGGGGTGAGGAATCTTCCGAGCC.TGCTCGTCGC
TCG.TGGCAGCTC.AACTGGTGCGCGGCAG.CCTCCTGCCAAGTATTCAG
CGGGGGCTAC..GTT.TCATCGAACACGGCACACTAACAAACTCCTGTGT
TTCCGGGGTCGACCCTTTGGCCCAAGAGTGA.AGGGCTTCG.ACTGCG..
AA.CGTGCGGGTAGCCTAGACACGTAGC.TTGTGCGC.CG.CGCA.CAAG
AT.TATGTCA.ACTTCCGCCGGG.CTTCTGTGTACATT.AA.GAGAATAA
CGGCAAGGATTGCTCGACGTAGGAGTCCGTGGAGCTCGTGC.GACC.ACC
.GTTAA.CACGCCTTAACTTTTCGGAACAGAGTAAC.AATCCCGG.TA.C
CAGGTAATGTGTCACCAGGTTCGGGCCCT.CACCGTCCCAGCTAC.TGTT
TAGCCCCTCTTTT.GAT.GGCCC.AGCGACATCAA.TGATC..CTGTAGG
CGATCATATTTCATTGTTCCGC.TG.AGCGGT.A.TG.GCAAT.CAGCCG
ATCGATGTTCTGATATG..GTGTGAATAC.AGAAACCGGCTTTGTCGGGG
CCTTT.AGGAA.AC.TAGGT.TT.CTCAATGAAC.GACATCAAC.T.AGC
AAGGAGGTACACAGCGTTCAGCGGATCC.CT.AAGC.TAGCATCTGCTGA
GCAGATA.A.TCTGCTTCAACTGTGAAAGGGTTG.CTAATCAG.GCGTAG
GGTACGATC.GCAAC.AGTCCACAAGTACACGGTGGAATT.CC..C.TTG
__EOD__
my $fileStart = tell $inFH;
my $offset = 9; # Column 10 if numbering from 1
my $testOK = q{C.T.AGCTGTTG..GAGACCCAGGCAGTCCCAGTTGCCGATCTTTCAC..};
my %methods = (
rsubstr => sub { # BrowserUk's method
seek $inFH, 0, 0;
my $buffer;
my $rsWanted = \ substr $buffer, $offset, 1;
my $retStr;
while ( $buffer = <$inFH> )
{
$retStr .= ${ $rsWanted };
}
return \ $retStr;
},
brutish => sub { # Adapted from Anonymonk's brute force method
seek $inFH, 0, 0;
my $retStr;
while ( <$inFH> )
{
my @split = split m{};
$retStr .= $split[ $offset ];
}
return \ $retStr;
},
seek => sub { # Suggested by vr
seek $inFH, 0, 0;
my $retStr;
my $char;
my $len = length( <$inFH> ) - 1;
seek $inFH, $offset, 0;
while ( read $inFH, $char, 1 )
{
$retStr .= $char;
seek $inFH, $len, 1
}
return \ $retStr;
},
substr => sub { # Suggested by pryrt and Laurent_R
seek $inFH, 0, 0;
my $retStr;
while ( <$inFH> )
{
$retStr .= substr $_, $offset, 1;
}
return \ $retStr;
},
regex => sub { # Another of pryrt's solutions
seek $inFH, 0, 0;
my $retStr;
while ( <$inFH> )
{
$retStr .= $1 if m{^.{$offset}(.)};
}
return \ $retStr;
},
unpack => sub { # Suggested but not implemented by LanX & pryrt
seek $inFH, 0, 0;
my $retStr;
my $fmt = qq{x${offset}a};
while ( <$inFH> )
{
$retStr .= unpack $fmt, $_;
}
return \ $retStr;
},
unpackM => sub { # Multi-line unpack suggested by LanX
seek $inFH, 0, 0;
my $buffer = <$inFH>;
my $lineLen = length $buffer;
my $nLines = 500;
my $chunkSize = $lineLen * $nLines;
seek $inFH, 0, 0;
my $retStr;
my $fmt = qq{(x${offset}ax@{ [ $lineLen - $offset - 1 ] })*};
while ( my $bytesRead = read $inFH, $buffer, $chunkSize )
{
$retStr .= join q{}, unpack $fmt, $buffer;
}
return \ $retStr;
},
split => sub { # Simple split from thanos1983
seek $inFH, 0, 0;
my $retStr;
my $fmt = qq{x${offset}a};
while ( <$inFH> )
{
$retStr .= ( split m{} )[ $offset ];
}
return \ $retStr;
},
pushAoA => sub { # In memory solution from Discipulus
seek $inFH, 0, 0;
my @aoa;
while ( <$inFH> )
{
chomp;
push @aoa, [ split m{} ];
}
my $retStr = join q{}, map { $aoa[ $_ ]->[ $offset ] } 0 .. $#aoa;
return \ $retStr;
},
ANDmask => sub { # Multi-line AND mask by johngg
seek $inFH, 0, 0;
my $buffer = <$inFH>;
my $lineLen = length $buffer;
my $nLines = 500;
my $chunkSize = $lineLen * $nLines;
seek $inFH, 0, 0;
my $retStr;
my $mask
= qq{\x00} x ${offset}
. qq{\xff}
. qq{\x00} x ( $lineLen - $offset - 1 );
$mask x= $nLines;
while ( my $bytesRead = read $inFH, $buffer, $chunkSize )
{
( my $anded = $buffer & $mask ) =~ tr{\x00}{}d;
$retStr .= $anded;
}
return \ $retStr;
},
);
foreach my $method ( sort keys %methods )
{
ok( ${ $methods{ $method }->() } eq $testOK, $method );
}
close $inFH or die $!;
my $filename = q{spw1202693.txt};
open $inFH, q{<}, $filename
or die qq{open: < $filename: $!\n};
cmpthese(
-3,
{
map
{
my $codeStr
= q[sub { my $col = $methods{ ]
. $_
. q[ }->(); }];
$_ => eval $codeStr;
}
keys %methods
}
);
close $inFH
or die qq{close: < $filename: $!\n};
##
##
ok 1 - ANDmask
ok 2 - brutish
ok 3 - pushAoA
ok 4 - regex
ok 5 - rsubstr
ok 6 - seek
ok 7 - split
ok 8 - substr
ok 9 - unpack
ok 10 - unpackM
Rate pushAoA brutish split seek regex unpack substr rsubstr unpackM ANDmask
pushAoA 1.11/s -- -35% -61% -62% -91% -97% -98% -98% -98% -99%
brutish 1.71/s 55% -- -39% -41% -86% -95% -96% -96% -97% -98%
split 2.82/s 155% 65% -- -3% -77% -92% -94% -94% -95% -97%
seek 2.91/s 163% 70% 3% -- -76% -92% -94% -94% -95% -97%
regex 12.3/s 1010% 617% 336% 322% -- -65% -74% -75% -79% -88%
unpack 35.0/s 3060% 1943% 1141% 1102% 185% -- -25% -27% -40% -67%
substr 46.9/s 4137% 2638% 1564% 1512% 282% 34% -- -3% -20% -55%
rsubstr 48.2/s 4254% 2714% 1610% 1556% 292% 38% 3% -- -18% -54%
unpackM 58.7/s 5194% 3321% 1979% 1914% 377% 68% 25% 22% -- -44%
ANDmask 105/s 9407% 6045% 3634% 3517% 757% 201% 124% 118% 80% --
1..10