#!/usr/bin/perl use Modern::Perl '2012'; use autodie; use Parallel::ForkManager; use File::Path qw/ make_path remove_tree /; use File::Find; use File::Basename; use Getopt::Std; use Cwd qw/ getcwd abs_path /; # 28-04-2014 gsa =head Wrapper to batch QC process PE files. Output stored in 'trimmed' folder, capture STDOUT to file. Run within directory containing input files TO DO: 1. Arguments are hardcoded. Use Getopt::Std to take args from command line. 2. Only PE files right now. Allow either PE or SE files. =cut my $start = time; # dirty measure of execution time # change if harvesting compressed/archived fastq or some other suffix my $cwd = getcwd(); my @inFiles; find( # get all fastq files from subfolders sub { push @inFiles, $File::Find::name if -f and $File::Find::name =~ /\.fastq\z/; }, $cwd ); @inFiles = sort @inFiles; die "Usage:$0" unless @inFiles; my %rdCountHoH; # how many reads left after each step for my $infile ( @inFiles ) { my @wc = split ' ', `wc -l $infile`; $rdCountHoH{ RAW }{ basename $infile } = $wc[0] / 4; } # create output directories, tmp is for BMTagger intermediate files make_path( "decontaminated/tmp" ), 0755 or die "cannot make 'trimmed' directory: $!"; mkdir 'FastQC_of_raw_data', 0755 or die "cannot make 'FastQC_of_raw_data' directory: $!"; mkdir 'trimmed', 0755 or die "cannot make 'FastQC_of_raw_data' directory: $!"; my $prefix = 'trimmed/'; # for trimmed files (Trimmomatic output files) my $prefix2 = 'decontaminated/'; # for decontaminated files (BMTagger output files) #my $path2trimmomatic = $ENV{"TRIMMOMATIC"}; # environmental variable for path on Odyssey # declare sub ahead because it's prototype sub natatime($@); my $iter = natatime( 2, @inFiles ); # fork pipeline my $max_processes = 4; # max number of processes to run my $pm = Parallel::ForkManager->new( $max_processes ); while ( my @temp = $iter->() ) { my $pid = $pm->start and next; # start child process # there are 2 output files per input file for Trimmomatic my ( $out1, $out2 , @outfiles, $trimlog ); # Trimmomatic trim log file ( $trimlog = basename $temp[0] ) =~ s/\_1\.fastq\z/_TRIMLOG.txt/; $trimlog = $prefix . $trimlog; for my $infile ( @temp ) { # launch FastQC system( "fastqc $infile -t 2 -o FastQC_of_raw_data" ) == 0 or die "Cannot FastQC $infile: $!"; # create Trimmomatic output file names my $out = $prefix . ( basename $infile ); $out1 = $out2 = $out; $out1 =~ s/(.*)\.fastq\z/$1_pair.fastq/; $out2 =~ s/(.*)\.fastq\z/$1_orphan.fastq/; push @outfiles, ($out1, $out2); } my @trimmed = @outfiles; # trimmed files are input for BMTagger # now push input file names to beginning of array for Trimmomatic unshift @outfiles, @temp; say "\nNow trimming @temp\n"; # arguments for Trimmomatic my @sysargs = ( "java", "-jar", "/home/gabuali/PROGRAMS/Trimmomatic-0.32/trimmomatic-0.32.jar", "PE", "-threads", "2", #"-trimlog", $trimlog, # uncomment if you want trimlog @outfiles, "SLIDINGWINDOW:4:20", "MINLEN:70", ); # launch Trimmomatic system ( @sysargs ) == 0 or die "The system has failed you: $!, $?"; # BMTagger - separate input files into paired and unpaired, # and generate output names my ( @paired, @unpaired, $unpairDecont, $pairDecont ); for my $trim ( @trimmed ) { my @wc = split ' ', `wc -l $trim`; # read count of file $rdCountHoH{ TRIM }{ basename $wc[1] } = $wc[0] / 4; if ( $trim =~ /orphan\./ ) { push @unpaired, $trim; $unpairDecont = $prefix2 . ( basename $trim ); $unpairDecont =~ s/\.fastq//; push @unpaired, $unpairDecont; } else { push @paired, $trim; unless ( $pairDecont ) { $pairDecont = $prefix2 . ( basename $trim ); $pairDecont =~ s/\_\d\_pair\.fastq//; push @paired, $pairDecont; } } } say "@paired"; # paired files my @sysargsBMTpair = ( '/home/gabuali/PROGRAMS/bmtagger/bmtools/bmtagger/bmtagger.sh', '-b', '/home/gabuali/PROGRAMS/bmtagger/hg38.bitmask', '-x', '/home/gabuali/PROGRAMS/bmtagger/hg38.srprism', '-q1', '-1', "/home/gabuali/data/efranzosa/hpfs/test4QC/$paired[0]", '-2', "/home/gabuali/data/efranzosa/hpfs/test4QC/$paired[2]", '-o', "/home/gabuali/data/efranzosa/hpfs/test4QC/$paired[1]", '-X', ); # launch BMTagger for paired read files system ( @sysargsBMTpair ) == 0 or die "Cannot decontaminate @paired[0,2]: $!"; # delete input (trimmed) files unlink ( $paired[0], $paired[2] ); # get read count of decontaminated file for ( $pairDecont . '_1.fastq', $pairDecont . '_2.fastq' ) { sleep(1); my @wc = split ' ', `wc -l $_`; $rdCountHoH{ DECONTAMINATE }{ basename $wc[1] } = $wc[0] / 4; } # unpaired files my @sysargsBMTunpair; my $iter2 = natatime( 2, @unpaired ); while ( my @temp = $iter2->() ) { @sysargsBMTunpair = ( '/home/gabuali/PROGRAMS/bmtagger/bmtools/bmtagger/bmtagger.sh', '-b', '/home/gabuali/PROGRAMS/bmtagger/hg38.bitmask', '-x', '/home/gabuali/PROGRAMS/bmtagger/hg38.srprism', '-q1', '-1', "/home/gabuali/data/efranzosa/hpfs/test4QC/$temp[0]", '-o', "/home/gabuali/data/efranzosa/hpfs/test4QC/$temp[1]", '-X', ); # launch BMTagger for orphan read files system ( @sysargsBMTunpair ) == 0 or die "Cannot decontaminate $temp[0]: $!"; # delete input (trimmed) file unlink $temp[0]; my $out = $temp[1] . '.fastq'; my @wc = split ' ', `wc -l $out`; $rdCountHoH{ DECONTAMINATE }{ basename $wc[1] } = $wc[0] / 4; } $pm->finish; # terminate child process } $pm->wait_all_children; # get read count from decontaminated files #chomp( my @decontaminatedRdCount = `wc -l decontaminated/*` ); #for my $rdCount ( @decontaminatedRdCount ) { # my @temp = split ' ', $rdCount; # $rdCountHoH{ DECONTAMINATE }{ basename $temp[1] } = $temp[0] / 4; #} # get read count from trimmed files #chomp( my @trimmed = `wc -l trimmed/*` ); #for my $rdCount ( @trimmed ) { #my @temp = split ' ', $rdCount; #$rdCountHoH{ TRIM }{ basename $temp[1] } = $temp[0] / 4; # delete (intermediate) trimmed file #unless( $temp[1] eq 'total' ) { # unlink $temp[1] or warn "Cannot delete trimmed file $temp[1]"; #} #} rmdir 'decontaminated/tmp'; rmdir 'trimmed'; # iterate over sample names and print out file read count my %samples; for my $infile ( @inFiles ) { $infile = basename $infile; $infile =~ s/\_\d\.fastq//; $samples{ $infile }++; } open my $fh, '>', 'readCount.txt'; say {$fh} "Sample\tRaw\tRaw\tTrimmed\tTrimmed\tTrimmed\tTrimmed\tDecontaminated\tDecontaminated\tDecontaminated\tDecontaminated"; say {$fh} "\tpair1\tpair2\tpair1\tpair2\torphan1\torphan2\tpair1\tpair2\torphan1\torphan2"; for my $sample ( sort keys %samples ) { print {$fh} "$sample\t"; print {$fh} $rdCountHoH{ RAW }{ $sample . '_1.fastq' } // 'not done', "\t"; print {$fh} $rdCountHoH{ RAW }{ $sample . '_2.fastq' } // 'not done', "\t"; print {$fh} $rdCountHoH{ TRIM }{ $sample . '_1_pair.fastq' } // 'not done', "\t"; print {$fh} $rdCountHoH{ TRIM }{ $sample . '_2_pair.fastq' } // 'not done', "\t"; print {$fh} $rdCountHoH{ TRIM }{ $sample . '_1_orphan.fastq' } // 'not done', "\t"; print {$fh} $rdCountHoH{ TRIM }{ $sample . '_2_orphan.fastq' } // 'not done', "\t"; print {$fh} $rdCountHoH{ DECONTAMINATE }{ $sample . '_1.fastq' } // 'not done', "\t"; print {$fh} $rdCountHoH{ DECONTAMINATE }{ $sample . '_2.fastq' } // 'not done', "\t"; print {$fh} $rdCountHoH{ DECONTAMINATE }{ $sample . '_1_orphan.fastq' } // 'not done', "\t"; print {$fh} $rdCountHoH{ DECONTAMINATE }{ $sample . '_2_orphan.fastq' } // 'not done', "\n"; # compress and archive each sample chdir 'decontaminated/'; system( "tar -cvzf ../$sample-QC.tar.gz $sample*" ) == 0 or die "Cannot archive $sample: $!"; chdir '../'; } # delete uncompressed decontaminated/files remove_tree( './decontaminated' ); #print Dumper( \ %rdCountHoH ); use Data::Dumper; my $duration = time - $start; printf "%s %.2f\n", "Execution time (minutes): ", $duration / 60; # execution time in minutes # lifted from List::MoreUtils for portability to Odyssey # makes copy of array and destroys it. sub natatime ($@) { my $n = shift; my @list = @_; return sub { return splice @list, 0, $n; } }