#! perl -slw
use strict;
use threads;
use threads::shared;
## threadsafe output routines.
$|++; ## Doesn't work without this!
my $semStdout :shared;
sub tprint{ lock $semStdout; print @_; }
my $semStderr :shared;
sub twarn{ lock $semStderr; print STDERR @_; }
sub findNextRecStart {
## filehandle, calculated start byte, thread id (for tracing
my( $fh, $start, $tid ) = @_;
# twarn "[$tid] $start";
## seek to the start byte -1; Just incase the calculated posn hits bang on
seek $fh, $start-1, 0;
## Read a buffer full; we'd be really unluck to not find a record start in 4k
## But you could increase this to say 64k if it fails.
read( $fh, my $buffer, 4096 );
## Search for a full record that doesn't have @/+ as the first char in the 2nd line
$buffer =~ m[\n(\@)(?:[^@+][^\n]+\n){2}\+] or die "Couldn't locate record start";
## Remember the offset into the buffer where we found it.
my $startFound = $-[1];
## Now count the lines between the start of the buffer and that point.
my $previousLines = substr( $buffer, 0, $startFound ) =~ tr[\n][\n];
## And calulate our way back to the first full record after the calculated start posn.
my $skipLines = ( $previousLines - 1) % 4 +1;
# twarn "[$tid] $skipLines";
## Seek bck to that calculated start posn.
seek $fh, $start, 0;
## Then skip forward th calculate dnumber of lines.
scalar <$fh> for 1 .. $skipLines;
# twarn "[$tid] ", tell $fh;
return;
}
sub worker {
my $tid = threads->tid;
## the name of the file, the byte offsets for the thread
## to start and end processing at
my( $file, $start, $end ) = @_;
open my $FASTQ, '<', $file or die $!;
## If a no-zero start posns, find the start of the next full record.
findNextRecStart( $FASTQ, $start, $tid ) if $start;
## process records until the end of this threads section.
while( tell( $FASTQ ) < $end ) {
my @lines = map scalar( <$FASTQ> ), 1 .. 4;
chomp @lines;
## process this record ( in @lines[ 0 .. 3 ] ) here...
tprint "[$tid] $lines[0]";
}
}
## Grab the size of the file
my $size = -s $ARGV[0] or die "$! : $ARGV[ 0 ]";
## Calculate each threads start posn
my $quarter = int( $size / 4 );
my @starts = map $quarter * $_, 0 .. 3;
push @starts, $size;
## Start 4 threads and wait for them to end.
$_->join for map{
async( \&worker, $ARGV[ 0 ], @starts[ $_, $_ +1 ] )
} 0 .. 3;
####
perl -E"say qq[\@record ${\($_-1)}\n\@pqrs\n+record ${\($_-1)}\n+pqrs\n\@record $_\nabcd\n+record $_\nefgh] for map{ $_*2-1 } 1 .. 25" > test.fastq
C:\test>head test.fastq
@record 0
@pqrs
+record 0
+pqrs
@record 1
abcd
+record 1
efgh
@record 2
@pqrs
####
C:\test>988536 test.fastq
[1] @record 0
[1] @record 1
[1] @record 2
[1] @record 3
[1] @record 4
[2] @record 13
[2] @record 14
[1] @record 5
[1] @record 6
[1] @record 7
[1] @record 8
[2] @record 15
[2] @record 16
[1] @record 9
[4] @record 38
[3] @record 26
[2] @record 17
[1] @record 10
[4] @record 39
[3] @record 27
[2] @record 18
[1] @record 11
[4] @record 40
[3] @record 28
[2] @record 19
[1] @record 12
[4] @record 41
[3] @record 29
[2] @record 20
[4] @record 42
[3] @record 30
[2] @record 21
[4] @record 43
[3] @record 31
[2] @record 22
[4] @record 44
[3] @record 32
[2] @record 23
[4] @record 45
[3] @record 33
[2] @record 24
[4] @record 46
[3] @record 34
[2] @record 25
[4] @record 47
[3] @record 35
[4] @record 48
[4] @record 49
[3] @record 36
[3] @record 37
####
[ 8:52:25.06] C:\test>988536 sample.fastq | wc -l
2500000
[ 8:52:57.92] C:\test>