This is untested (so may need minor tweaks), and the error messages and checking are prefunctory, but it should be close:
Update: Corrected a few typos and tested on a small (500,000) sequence dataset (It took 3.5 minutes, so yours should be done in around an hour. It uses < 2 MB ram)
#! perl -slw use strict; use Time::HiRes qw[ time ]; sub readFASTA { my( $fh ) = @_; local $/ = "\n"; my $header = <$fh>; $header = substr $header, 1 if ord $header == ord '>'; ## First li +ne chomp $header; local $/ = '>'; my $seq = <$fh>; chomp $seq; ## get rid of trailing '>'; $seq =~ tr[\n][]d; return $header, \$seq; } sub writeFASTA { local $\ = "\n"; my( $fh, $header, $refSeq, $lineLength ) = @_; $lineLength ||= 50; my $length = length( $$refSeq ); my $chunks = int( $length / $lineLength ); --$chunks unless $length % $lineLength; print $fh '>', $header; print $fh substr $$refSeq, $_*$lineLength, $lineLength for 0 .. $c +hunks; return; } open SMALL, '<', 'theSmallFile' or die $!; open BIG, '<', 'theBigFile' or die $!; until( eof( SMALL ) ) { my( $bigHdr, $bigRef ) = readFASTA( \*BIG ); my( $hdrLeft, $leftRef ) = readFASTA( \*SMALL ); my( $leftId, $leftLen ) = $hdrLeft =~ m[([^_]+)_left\s+length=(\d ++)]; die "Bad small left header '$hdrLeft'" unless defined $leftId and defined $leftLen; my( $hdrRight, $rightRef ) = readFASTA( \*SMALL ); my( $rightId, $rightLen ) = $hdrRight =~ m[([^_]+)_right\s+length= +(\d+)]; die "Bad small right header '$hdrRight'" unless defined $rightId and defined $rightLen; die "Out of sequence data at record $.: $bigHdr/$leftId/$rightId" unless $bigHdr eq $leftId and $bigHdr eq $rightId; my @bits = split ' ', $$bigRef; writeFASTA( \*STDOUT, $hdrLeft, \join(' ', @bits[ 0 .. $leftLen -1 ] ), 1e6 ); writeFASTA( \*STDOUT, $hdrRight, \join( ' ', @bits[ -$rightLen .. -1 ] ), 1e6 ); }
In reply to Re^3: When the input file is huge !!!
by BrowserUk
in thread When the input file is huge !!!
by sugar
| For: | Use: | ||
| & | & | ||
| < | < | ||
| > | > | ||
| [ | [ | ||
| ] | ] |