perl -E ' my @alpha = ( qw{ A C G T } ) x 5; push @alpha, qw{ . . }; say join q{}, map { $alpha[ rand @alpha ] } 1 .. 50 for 1 .. 50000;' > spw1202693.txt #### use strict; use warnings; use Benchmark qw{ cmpthese }; use Test::More qw{ no_plan }; open my $inFH, q{<}, \ <<__EOD__ or die $!; TATCGC.TGCCC.ATTAAGCCATACTTCAAGGATCCCCCCG.GA.GGGCA GGAGTCGTC.ACACATCTTCACTAC.CATATCTTGCTACGGCCACTGACA CGA.A.CAATTTTCGAATGGAGGGCGAATGCCGTTGTGCGCTGCGTGACG TCT.AGCCT.CGAA.A.GCAGGCGTGGGGCGTACCACGGCTTGGCCTAT. ATGCACTT.AA.CCCTCGTAT.CTCTACTCT.ACAACCTTGGGCAG.T.T TAGGGC.CCGCG.TAAACTGCAGATAGTACTCCAAGAATCGCTCCGACCC .CGTCTAAACAATTAGCGGAGGTCGTC.CTGCAA.CAATGATCTTAACAC AG.AAGAAGT.CAAAGT.GTAGTGGCTGGGTACTTGATCA.TCAATTTCA ACC.TTCCAG.AAATGAGGTCTC.AAAAG.AGTT.CCTG.GGCTGTGTAG ACAT.A.TGTTAACACCACCTATAATAGAAGC.TTATATTCACC.TTAAC GAATTGT.TTCAATGCCACATA.GCTGGGCCAC.GCCTTAGGCTATTT.A AGCTTTACAGATGT.ACCAAA.CCAG.GACTAGATGGGGGGG.ATATGCC AGAAAGCCC.TCCCTGCGAAT.TGGGGGAGTACT.ATAGACTA.GGCCAG .CACTCCGG.CTAACACTACTTCCTGTAACAAA.CTGAAGAGA.CGTTTG CAGTCCTGAGCGTGCTAC..GTTCT.CTGTCG.TTAGATGGGCGC..GTA AGCAAGAAAACACGTACAGAAAAAGCCGACGGC.GGGGTTC.GTCAACC. T.ACTTATCGACATTAGCATCTGG.TGCCGCCTTT.GAACCCATTACTCG TAAACGA.CACCGCTTAGGGCTGACCTCCGAA.TTATCAGAGTACCGGGC ACGGATATCCTA.ATAGA.TACCTATGTAGGAAG.TCGAACTC.TAAACT ACCCCCGTCC.TATCCTC.CGA.ATTGCCCCCGGCCGTACTCCA.AAACC T..TC.AGCCGGG.TTG.TGTGGATCAGAAGTGATTGAGACTGG.GCCTA TCAACCTTGAAGTTGAAACTCCGAGAGT.CGCGGTCAACC.AGCCTGCGC GATCAGTGAGGAGCTGT.C.AAAGC.TC.AG.GCC.G.GA.GTGAGATT. G.C.AGTTTGCCACTT.CATCCAGAGTTCATCCAGCAGG.ATTGAAGTTA GCGCTGCA.CTGT.CATTTTTTATTCCCACCCGGTCTCCCCAACCCCGAT TGTATAGCCAGG.CGGAG.TTTCTT.TAGATCTAGTAAGACATT.CCCGA AC.CCCCT.G.TAGCTAAATCGACGGG..GTCAATATACACGGAT.CTTT AAGC.GGGCTGATGCTTATCTCCTAGCCCGC.CCTCGATGAT.ATTATTG TCACCC.ACCGC..T.CGGAACGAAAACT..AT.CCTATTAGAACATCCT .GCGTGAACC.TG.G.TACACCG.GATGGTCCCGAC.GGCTACCGAT.CA .CGCTGCCCCCATCTCCAGCTATC.AAA.AGGGACGCGATATGCGGAAGC CGTTCAGACAATCCTTTTGGGGTGTAAATGTCTCC.ACCTC.GAGA.CTG AAGACATAGGAG.CCAGAAT.A.CGTCATACAGAGGCACTC.TAT..TCT AGGCCTGCCTACTT.TTGGCTAA.C.AGACTTGG.AAG.ATGTAGAAC.G GTCAACCCGTGCTAACTGGGGTGAGGAATCTTCCGAGCC.TGCTCGTCGC TCG.TGGCAGCTC.AACTGGTGCGCGGCAG.CCTCCTGCCAAGTATTCAG CGGGGGCTAC..GTT.TCATCGAACACGGCACACTAACAAACTCCTGTGT TTCCGGGGTCGACCCTTTGGCCCAAGAGTGA.AGGGCTTCG.ACTGCG.. AA.CGTGCGGGTAGCCTAGACACGTAGC.TTGTGCGC.CG.CGCA.CAAG AT.TATGTCA.ACTTCCGCCGGG.CTTCTGTGTACATT.AA.GAGAATAA CGGCAAGGATTGCTCGACGTAGGAGTCCGTGGAGCTCGTGC.GACC.ACC .GTTAA.CACGCCTTAACTTTTCGGAACAGAGTAAC.AATCCCGG.TA.C CAGGTAATGTGTCACCAGGTTCGGGCCCT.CACCGTCCCAGCTAC.TGTT TAGCCCCTCTTTT.GAT.GGCCC.AGCGACATCAA.TGATC..CTGTAGG CGATCATATTTCATTGTTCCGC.TG.AGCGGT.A.TG.GCAAT.CAGCCG ATCGATGTTCTGATATG..GTGTGAATAC.AGAAACCGGCTTTGTCGGGG CCTTT.AGGAA.AC.TAGGT.TT.CTCAATGAAC.GACATCAAC.T.AGC AAGGAGGTACACAGCGTTCAGCGGATCC.CT.AAGC.TAGCATCTGCTGA GCAGATA.A.TCTGCTTCAACTGTGAAAGGGTTG.CTAATCAG.GCGTAG GGTACGATC.GCAAC.AGTCCACAAGTACACGGTGGAATT.CC..C.TTG __EOD__ my $fileStart = tell $inFH; my $offset = 9; # Column 10 if numbering from 1 my $testOK = q{C.T.AGCTGTTG..GAGACCCAGGCAGTCCCAGTTGCCGATCTTTCAC..}; my %methods = ( rsubstr => sub { # BrowserUk's method seek $inFH, 0, 0; my $buffer; my $rsWanted = \ substr $buffer, $offset, 1; my $retStr; while ( $buffer = <$inFH> ) { $retStr .= ${ $rsWanted }; } return \ $retStr; }, brutish => sub { # Adapted from Anonymonk's brute force method seek $inFH, 0, 0; my $retStr; while ( <$inFH> ) { my @split = split m{}; $retStr .= $split[ $offset ]; } return \ $retStr; }, seek => sub { # Suggested by vr seek $inFH, 0, 0; my $retStr; my $char; my $len = length( <$inFH> ) - 1; seek $inFH, $offset, 0; while ( read $inFH, $char, 1 ) { $retStr .= $char; seek $inFH, $len, 1 } return \ $retStr; }, substr => sub { # Suggested by pryrt and Laurent_R seek $inFH, 0, 0; my $retStr; while ( <$inFH> ) { $retStr .= substr $_, $offset, 1; } return \ $retStr; }, regex => sub { # Another of pryrt's solutions seek $inFH, 0, 0; my $retStr; while ( <$inFH> ) { $retStr .= $1 if m{^.{$offset}(.)}; } return \ $retStr; }, unpack => sub { # Suggested but not implemented by LanX & pryrt seek $inFH, 0, 0; my $retStr; my $fmt = qq{x${offset}a}; while ( <$inFH> ) { $retStr .= unpack $fmt, $_; } return \ $retStr; }, unpackM => sub { # Multi-line unpack suggested by LanX seek $inFH, 0, 0; my $buffer = <$inFH>; my $lineLen = length $buffer; my $nLines = 500; my $chunkSize = $lineLen * $nLines; seek $inFH, 0, 0; my $retStr; my $fmt = qq{(x${offset}ax@{ [ $lineLen - $offset - 1 ] })*}; while ( my $bytesRead = read $inFH, $buffer, $chunkSize ) { $retStr .= join q{}, unpack $fmt, $buffer; } return \ $retStr; }, split => sub { # Simple split from thanos1983 seek $inFH, 0, 0; my $retStr; my $fmt = qq{x${offset}a}; while ( <$inFH> ) { $retStr .= ( split m{} )[ $offset ]; } return \ $retStr; }, pushAoA => sub { # In memory solution from Discipulus seek $inFH, 0, 0; my @aoa; while ( <$inFH> ) { chomp; push @aoa, [ split m{} ]; } my $retStr = join q{}, map { $aoa[ $_ ]->[ $offset ] } 0 .. $#aoa; return \ $retStr; }, ANDmask => sub { # Multi-line AND mask by johngg seek $inFH, 0, 0; my $buffer = <$inFH>; my $lineLen = length $buffer; my $nLines = 500; my $chunkSize = $lineLen * $nLines; seek $inFH, 0, 0; my $retStr; my $mask = qq{\x00} x ${offset} . qq{\xff} . qq{\x00} x ( $lineLen - $offset - 1 ); $mask x= $nLines; while ( my $bytesRead = read $inFH, $buffer, $chunkSize ) { ( my $anded = $buffer & $mask ) =~ tr{\x00}{}d; $retStr .= $anded; } return \ $retStr; }, ); foreach my $method ( sort keys %methods ) { ok( ${ $methods{ $method }->() } eq $testOK, $method ); } close $inFH or die $!; my $filename = q{spw1202693.txt}; open $inFH, q{<}, $filename or die qq{open: < $filename: $!\n}; cmpthese( -3, { map { my $codeStr = q[sub { my $col = $methods{ ] . $_ . q[ }->(); }]; $_ => eval $codeStr; } keys %methods } ); close $inFH or die qq{close: < $filename: $!\n}; #### ok 1 - ANDmask ok 2 - brutish ok 3 - pushAoA ok 4 - regex ok 5 - rsubstr ok 6 - seek ok 7 - split ok 8 - substr ok 9 - unpack ok 10 - unpackM Rate pushAoA brutish split seek regex unpack substr rsubstr unpackM ANDmask pushAoA 1.11/s -- -35% -61% -62% -91% -97% -98% -98% -98% -99% brutish 1.71/s 55% -- -39% -41% -86% -95% -96% -96% -97% -98% split 2.82/s 155% 65% -- -3% -77% -92% -94% -94% -95% -97% seek 2.91/s 163% 70% 3% -- -76% -92% -94% -94% -95% -97% regex 12.3/s 1010% 617% 336% 322% -- -65% -74% -75% -79% -88% unpack 35.0/s 3060% 1943% 1141% 1102% 185% -- -25% -27% -40% -67% substr 46.9/s 4137% 2638% 1564% 1512% 282% 34% -- -3% -20% -55% rsubstr 48.2/s 4254% 2714% 1610% 1556% 292% 38% 3% -- -18% -54% unpackM 58.7/s 5194% 3321% 1979% 1914% 377% 68% 25% 22% -- -44% ANDmask 105/s 9407% 6045% 3634% 3517% 757% 201% 124% 118% 80% -- 1..10