in reply to Re^2: When the input file is huge !!!
in thread When the input file is huge !!!
This is untested (so may need minor tweaks), and the error messages and checking are prefunctory, but it should be close:
Update: Corrected a few typos and tested on a small (500,000) sequence dataset (It took 3.5 minutes, so yours should be done in around an hour. It uses < 2 MB ram)
#! perl -slw use strict; use Time::HiRes qw[ time ]; sub readFASTA { my( $fh ) = @_; local $/ = "\n"; my $header = <$fh>; $header = substr $header, 1 if ord $header == ord '>'; ## First li +ne chomp $header; local $/ = '>'; my $seq = <$fh>; chomp $seq; ## get rid of trailing '>'; $seq =~ tr[\n][]d; return $header, \$seq; } sub writeFASTA { local $\ = "\n"; my( $fh, $header, $refSeq, $lineLength ) = @_; $lineLength ||= 50; my $length = length( $$refSeq ); my $chunks = int( $length / $lineLength ); --$chunks unless $length % $lineLength; print $fh '>', $header; print $fh substr $$refSeq, $_*$lineLength, $lineLength for 0 .. $c +hunks; return; } open SMALL, '<', 'theSmallFile' or die $!; open BIG, '<', 'theBigFile' or die $!; until( eof( SMALL ) ) { my( $bigHdr, $bigRef ) = readFASTA( \*BIG ); my( $hdrLeft, $leftRef ) = readFASTA( \*SMALL ); my( $leftId, $leftLen ) = $hdrLeft =~ m[([^_]+)_left\s+length=(\d ++)]; die "Bad small left header '$hdrLeft'" unless defined $leftId and defined $leftLen; my( $hdrRight, $rightRef ) = readFASTA( \*SMALL ); my( $rightId, $rightLen ) = $hdrRight =~ m[([^_]+)_right\s+length= +(\d+)]; die "Bad small right header '$hdrRight'" unless defined $rightId and defined $rightLen; die "Out of sequence data at record $.: $bigHdr/$leftId/$rightId" unless $bigHdr eq $leftId and $bigHdr eq $rightId; my @bits = split ' ', $$bigRef; writeFASTA( \*STDOUT, $hdrLeft, \join(' ', @bits[ 0 .. $leftLen -1 ] ), 1e6 ); writeFASTA( \*STDOUT, $hdrRight, \join( ' ', @bits[ -$rightLen .. -1 ] ), 1e6 ); }
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^4: When the input file is huge !!!
by sugar (Beadle) on Jan 06, 2009 at 09:32 UTC | |
|
Re^4: When the input file is huge !!!
by sugar (Beadle) on Jan 06, 2009 at 05:21 UTC | |
by BrowserUk (Patriarch) on Jan 06, 2009 at 06:03 UTC |