Here is a threaded version. It should be readily adaptable to forking if that is your thing (there is a sketch of that at the end of this post):
#! perl -slw
use strict;
use threads;
use threads::shared;

## Threadsafe output routines.
$|++; ## Doesn't work without this!
my $semStdout :shared;
sub tprint{ lock $semStdout; print @_; }
my $semStderr :shared;
sub twarn{ lock $semStderr; print STDERR @_; }

sub findNextRecStart {
    ## filehandle, calculated start byte, thread id (for tracing)
    my( $fh, $start, $tid ) = @_;
#    twarn "[$tid] $start";

    ## Seek to the start byte -1, just in case the calculated posn hits bang on.
    seek $fh, $start-1, 0;

    ## Read a buffer full; we'd be really unlucky not to find a record start in 4k,
    ## but you could increase this to say 64k if it fails.
    read( $fh, my $buffer, 4096 );

    ## Search for a full record that doesn't have @/+ as the first char in the 2nd line.
    $buffer =~ m[\n(\@)(?:[^@+][^\n]+\n){2}\+] or die "Couldn't locate record start";

    ## Remember the offset into the buffer where we found it.
    my $startFound = $-[1];

    ## Now count the lines between the start of the buffer and that point.
    my $previousLines = substr( $buffer, 0, $startFound ) =~ tr[\n][\n];

    ## And calculate our way back to the first full record after the calculated start posn.
    my $skipLines = ( $previousLines - 1 ) % 4 + 1;
#    twarn "[$tid] $skipLines";

    ## Seek back to that calculated start posn.
    seek $fh, $start, 0;

    ## Then skip forward the calculated number of lines.
    scalar <$fh> for 1 .. $skipLines;
#    twarn "[$tid] ", tell $fh;
    return;
}

sub worker {
    my $tid = threads->tid;

    ## The name of the file; the byte offsets for the thread
    ## to start and end processing at.
    my( $file, $start, $end ) = @_;

    open my $FASTQ, '<', $file or die $!;

    ## If a non-zero start posn, find the start of the next full record.
    findNextRecStart( $FASTQ, $start, $tid ) if $start;

    ## Process records until the end of this thread's section.
    while( tell( $FASTQ ) < $end ) {
        my @lines = map scalar( <$FASTQ> ), 1 .. 4;
        chomp @lines;

        ## Process this record ( in @lines[ 0 .. 3 ] ) here...
        tprint "[$tid] $lines[0]";
    }
}

## Grab the size of the file.
my $size = -s $ARGV[0] or die "$! : $ARGV[ 0 ]";

## Calculate each thread's start posn.
my $quarter = int( $size / 4 );
my @starts = map $quarter * $_, 0 .. 3;
push @starts, $size;

## Start 4 threads and wait for them to end.
$_->join for map{
    async( \&worker, $ARGV[ 0 ], @starts[ $_, $_+1 ] )
} 0 .. 3;
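The heart of that is the regex in findNextRecStart(): a true record start is an @-line whose next two lines are the rest of the header plus a sequence line (which never begins with @ or +), followed by the +-line; a quality line that merely happens to start with @ can't satisfy that. Here is a standalone demo of just that test -- the sample buffer is made up for illustration, but the regex is the one from the code above:

#! perl -slw
use strict;

## Demo only: the record-start test from findNextRecStart() applied to a
## made-up buffer. '@pqrs' is a quality line masquerading as a header; the
## regex refuses it because the line after it starts with '@', and locks on
## to the real record start '@record 1' instead.
my $buffer = join "\n",
    '',             ## leading newline, as if we seeked into mid-file
    '@pqrs',        ## quality line of the previous record
    '@record 1',    ## the true record start
    'abcd',         ## sequence: never begins with '@' or '+'
    '+record 1',    ## separator
    'efgh',         ## quality
    '';

$buffer =~ m[\n(\@)(?:[^@+][^\n]+\n){2}\+] or die "Couldn't locate record start";
print "record start at buffer offset $-[1]";   ## prints offset 7: the '@' of '@record 1'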
And a test. Generate a fastq file with a mix of good and dodgy records (every second record is clean!):
perl -E"say qq[\@record ${\($_-1)}\n\@pqrs\n+record ${\($_-1)}\n+pqrs\ +n\@record $_\nabcd\n+record $_\nefgh] for map{ $_*2-1 } 1 .. 25" > te +st.fastq C:\test>head test.fastq @record 0 @pqrs +record 0 +pqrs @record 1 abcd +record 1 efgh @record 2 @pqrs
And process that file using 4 threads, outputting just the first line of each record:
C:\test>988536 test.fastq
[1] @record 0
[1] @record 1
[1] @record 2
[1] @record 3
[1] @record 4
[2] @record 13
[2] @record 14
[1] @record 5
[1] @record 6
[1] @record 7
[1] @record 8
[2] @record 15
[2] @record 16
[1] @record 9
[4] @record 38
[3] @record 26
[2] @record 17
[1] @record 10
[4] @record 39
[3] @record 27
[2] @record 18
[1] @record 11
[4] @record 40
[3] @record 28
[2] @record 19
[1] @record 12
[4] @record 41
[3] @record 29
[2] @record 20
[4] @record 42
[3] @record 30
[2] @record 21
[4] @record 43
[3] @record 31
[2] @record 22
[4] @record 44
[3] @record 32
[2] @record 23
[4] @record 45
[3] @record 33
[2] @record 24
[4] @record 46
[3] @record 34
[2] @record 25
[4] @record 47
[3] @record 35
[4] @record 48
[4] @record 49
[3] @record 36
[3] @record 37
If you check the record numbers, all 50 records (0 through 49) are processed correctly.
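If eyeballing 50 interleaved lines is too tedious, a throwaway checker along these lines does it mechanically. (check.pl here is a hypothetical helper, not part of the program; it only assumes the "[tid] @record N" output format shown above.)

#! perl -slw
use strict;

## check.pl -- read the program's output on STDIN; report how many distinct
## record numbers were seen, and flag any gaps in the expected range 0 .. 49.
my %seen;
while( <STDIN> ) {
    $seen{ $1 } = 1 if /record (\d+)/;
}
print scalar( keys %seen ), ' distinct records seen';
print "missing: $_" for grep !$seen{ $_ }, 0 .. 49;

Run it as: 988536 test.fastq | perl check.pl -- you should see "50 distinct records seen" and no "missing:" lines.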
Now run the program on a 400 MB fastq file:
[ 8:52:25.06] C:\test>988536 sample.fastq | wc -l
2500000

[ 8:52:57.92] C:\test>
2.5 million records in 32 seconds.
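As for the "readily adaptable to forking" remark at the top: here is a minimal sketch of what that might look like, assuming a platform with a real fork() (on Windows, fork is emulated with threads anyway, so you may as well use the threaded version there). findNextRecStart() carries over verbatim:

#! perl -slw
use strict;

## Sketch only: the same byte-range splitting driven by fork() instead of
## threads. findNextRecStart() is unchanged from the threaded version and is
## assumed to be pasted in here (omitted for brevity). Each child opens its
## own filehandle, so there is no shared seek position to manage.
## Caveat: single-line prints from separate processes usually arrive intact
## on a pipe, but a production version might write per-child output files.
$|++;

sub worker {
    my( $file, $start, $end ) = @_;

    open my $FASTQ, '<', $file or die $!;

    findNextRecStart( $FASTQ, $start, $$ ) if $start;

    while( tell( $FASTQ ) < $end ) {
        my @lines = map scalar( <$FASTQ> ), 1 .. 4;
        chomp @lines;
        print "[$$] $lines[0]";    ## $$ (the pid) stands in for the thread id
    }
}

my $size = -s $ARGV[0] or die "$! : $ARGV[ 0 ]";

my $quarter = int( $size / 4 );
my @starts  = map $quarter * $_, 0 .. 3;
push @starts, $size;

for my $n ( 0 .. 3 ) {
    defined( my $pid = fork ) or die "fork failed: $!";
    if( $pid == 0 ) {
        worker( $ARGV[ 0 ], @starts[ $n, $n + 1 ] );
        exit 0;
    }
}
wait for 1 .. 4;    ## reap all four children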