comment on

Dear All, I am putting together a pipeline for processing DNA seqeuencing files, and it works peachy except for a small detail. As input files go through filtering steps, I ask what is the line count of intermediate files, using backticks and storing output in a variable. Now, when I use backticks before forking child processes, the output of backticks is stored into a variable (line 37 in below code). When I use backticks in child process, its output is not stored in variable (lines 95 and 132). At your convenience, please let me know what I'm doing wrong. many thanks!

#!/usr/bin/perl
use Modern::Perl '2012';
use autodie;
use Parallel::ForkManager;
use File::Path qw/ make_path remove_tree /;
use File::Find;
use File::Basename;
use Getopt::Std;
use Cwd qw/ getcwd abs_path /;

# 28-04-2014    gsa

=head
Wrapper to batch QC process PE files.
Output stored in 'trimmed' folder, capture STDOUT to file.
Run within directory containing input files
TO DO:
1. Arguments are hardcoded. Use Getopt::Std to take args from command 
+line.
2. Only PE files right now. Allow either PE or SE files.
=cut


my $start = time; # dirty measure of execution time
# change if harvesting compressed/archived fastq or some other suffix
my $cwd = getcwd();
my @inFiles;
find( # get all fastq files from subfolders
    sub {
        push @inFiles, $File::Find::name if -f and $File::Find::name =
+~ /\.fastq\z/;
    }, $cwd
);
@inFiles = sort @inFiles;
die "Usage:$0" unless @inFiles;

my %rdCountHoH; # how many reads left after each step
for my $infile ( @inFiles ) {
    my @wc = split ' ', `wc -l $infile`;
    $rdCountHoH{ RAW }{ basename $infile } = $wc[0] / 4;
}

# create output directories, tmp is for BMTagger intermediate files
make_path( "decontaminated/tmp" ), 0755 or die "cannot make 'trimmed' 
+directory: $!";
mkdir 'FastQC_of_raw_data', 0755 or die "cannot make 'FastQC_of_raw_da
+ta' directory: $!";
mkdir 'trimmed', 0755 or die "cannot make 'FastQC_of_raw_data' directo
+ry: $!";
my $prefix = 'trimmed/'; # for trimmed files (Trimmomatic output files
+)
my $prefix2 = 'decontaminated/'; # for decontaminated files (BMTagger 
+output files)
#my $path2trimmomatic = $ENV{"TRIMMOMATIC"}; # environmental variable 
+for path on Odyssey

# declare sub ahead because it's prototype
sub natatime($@);
my $iter = natatime( 2, @inFiles );

# fork pipeline
my $max_processes = 4; # max number of processes to run
my $pm = Parallel::ForkManager->new( $max_processes );
while ( my @temp = $iter->() ) {
    my $pid = $pm->start and next; # start child process
    # there are 2 output files per input file for Trimmomatic
    my ( $out1, $out2 , @outfiles, $trimlog );     
    # Trimmomatic trim log file
    ( $trimlog = basename $temp[0] ) =~ s/\_1\.fastq\z/_TRIMLOG.txt/;
    $trimlog = $prefix . $trimlog;
    for my $infile ( @temp ) {
        # launch FastQC
        system( "fastqc $infile -t 2 -o FastQC_of_raw_data" ) == 0 
            or die "Cannot FastQC $infile: $!";
        # create Trimmomatic output file names
        my $out = $prefix . ( basename $infile );
        $out1 = $out2 = $out;
        $out1 =~ s/(.*)\.fastq\z/$1_pair.fastq/;
        $out2 =~ s/(.*)\.fastq\z/$1_orphan.fastq/;
        push @outfiles, ($out1, $out2);
    }

    my @trimmed = @outfiles; # trimmed files are input for BMTagger

    # now push input file names to beginning of array for Trimmomatic
    unshift @outfiles, @temp; 
    say "\nNow trimming @temp\n";
    # arguments for Trimmomatic
    my @sysargs = ( 
        "java", "-jar", 
        "/home/gabuali/PROGRAMS/Trimmomatic-0.32/trimmomatic-0.32.jar"
+, 
        "PE", "-threads", "2", #"-trimlog", $trimlog, # uncomment if y
+ou want trimlog
        @outfiles,
        "SLIDINGWINDOW:4:20", "MINLEN:70", 
    );
    # launch Trimmomatic
    system ( @sysargs ) == 0 or die "The system has failed you: $!, $?
+";
    
    # BMTagger - separate input files into paired and unpaired,
    # and generate output names
    my ( @paired, @unpaired, $unpairDecont, $pairDecont );
    for my $trim ( @trimmed ) {
        my @wc = split ' ', `wc -l $trim`; # read count of file
        $rdCountHoH{ TRIM }{ basename $wc[1] } = $wc[0] / 4;

        if ( $trim =~ /orphan\./ ) {
            push @unpaired, $trim;
            $unpairDecont = $prefix2 . ( basename $trim ); 
            $unpairDecont =~ s/\.fastq//;
            push @unpaired, $unpairDecont;
        }
        else {
            push @paired, $trim;
            unless ( $pairDecont ) {
                $pairDecont = $prefix2 . ( basename $trim );   
                $pairDecont =~ s/\_\d\_pair\.fastq//;
                push @paired, $pairDecont;
            }
        }
    }
    say "@paired";
    # paired files
    my @sysargsBMTpair = (
        '/home/gabuali/PROGRAMS/bmtagger/bmtools/bmtagger/bmtagger.sh'
+,
        '-b', '/home/gabuali/PROGRAMS/bmtagger/hg38.bitmask', 
        '-x', '/home/gabuali/PROGRAMS/bmtagger/hg38.srprism', 
        '-q1', 
        '-1', "/home/gabuali/data/efranzosa/hpfs/test4QC/$paired[0]", 
        '-2', "/home/gabuali/data/efranzosa/hpfs/test4QC/$paired[2]", 
        '-o', "/home/gabuali/data/efranzosa/hpfs/test4QC/$paired[1]", 
        '-X',
    );
    # launch BMTagger for paired read files
    system ( @sysargsBMTpair ) == 0 or die "Cannot decontaminate @pair
+ed[0,2]: $!";
    # delete input (trimmed) files
    unlink ( $paired[0], $paired[2] );
    # get read count of decontaminated file
    for ( $pairDecont . '_1.fastq', $pairDecont . '_2.fastq' ) {
        sleep(1);
        my @wc = split ' ', `wc -l $_`;
        $rdCountHoH{ DECONTAMINATE }{ basename $wc[1] } = $wc[0] / 4;
    }

    # unpaired files
    my @sysargsBMTunpair;
    my $iter2 = natatime( 2, @unpaired );
    while ( my @temp = $iter2->() ) {
        @sysargsBMTunpair = (
            '/home/gabuali/PROGRAMS/bmtagger/bmtools/bmtagger/bmtagger
+.sh',
            '-b', '/home/gabuali/PROGRAMS/bmtagger/hg38.bitmask', 
            '-x', '/home/gabuali/PROGRAMS/bmtagger/hg38.srprism', 
            '-q1', 
            '-1', "/home/gabuali/data/efranzosa/hpfs/test4QC/$temp[0]"
+, 
            '-o', "/home/gabuali/data/efranzosa/hpfs/test4QC/$temp[1]"
+, 
            '-X',
        );
        # launch BMTagger for orphan read files
        system ( @sysargsBMTunpair ) == 0 or die "Cannot decontaminate
+ $temp[0]: $!";
        # delete input (trimmed) file
        unlink $temp[0];
        my $out = $temp[1] . '.fastq';
        my @wc = split ' ', `wc -l $out`;
        $rdCountHoH{ DECONTAMINATE }{ basename $wc[1] } = $wc[0] / 4;
    }

    $pm->finish; # terminate child process

}

$pm->wait_all_children;


# get read count from decontaminated files
#chomp( my @decontaminatedRdCount = `wc -l decontaminated/*` );
#for my $rdCount ( @decontaminatedRdCount ) {
#    my @temp = split ' ', $rdCount;
#    $rdCountHoH{ DECONTAMINATE }{ basename $temp[1] } = $temp[0] / 4;
#}
# get read count from trimmed files
#chomp( my @trimmed = `wc -l trimmed/*` );
#for my $rdCount ( @trimmed ) {
    #my @temp = split ' ', $rdCount;
    #$rdCountHoH{ TRIM }{ basename $temp[1] } = $temp[0] / 4;
    # delete (intermediate) trimmed file
    #unless( $temp[1] eq 'total' ) {
    #    unlink $temp[1] or warn "Cannot delete trimmed file $temp[1]"
+;
    #}
#}

rmdir 'decontaminated/tmp';
rmdir 'trimmed';


# iterate over sample names and print out file read count
my %samples;
for my $infile ( @inFiles ) {
    $infile = basename $infile;
    $infile =~ s/\_\d\.fastq//;
    $samples{ $infile }++;
}

open my $fh, '>', 'readCount.txt';
say {$fh} "Sample\tRaw\tRaw\tTrimmed\tTrimmed\tTrimmed\tTrimmed\tDecon
+taminated\tDecontaminated\tDecontaminated\tDecontaminated";
say {$fh} "\tpair1\tpair2\tpair1\tpair2\torphan1\torphan2\tpair1\tpair
+2\torphan1\torphan2";
for my $sample ( sort keys %samples ) {
    print {$fh} "$sample\t";
    print {$fh} $rdCountHoH{ RAW }{ $sample . '_1.fastq' } // 'not don
+e', "\t";
    print {$fh} $rdCountHoH{ RAW }{ $sample . '_2.fastq' } // 'not don
+e', "\t";
    print {$fh} $rdCountHoH{ TRIM }{ $sample . '_1_pair.fastq' } // 'n
+ot done', "\t";
    print {$fh} $rdCountHoH{ TRIM }{ $sample . '_2_pair.fastq' } // 'n
+ot done', "\t";
    print {$fh} $rdCountHoH{ TRIM }{ $sample . '_1_orphan.fastq' } // 
+'not done', "\t";
    print {$fh} $rdCountHoH{ TRIM }{ $sample . '_2_orphan.fastq' } // 
+'not done', "\t";
    print {$fh} $rdCountHoH{ DECONTAMINATE }{ $sample . '_1.fastq' } /
+/ 'not done', "\t";
    print {$fh} $rdCountHoH{ DECONTAMINATE }{ $sample . '_2.fastq' } /
+/ 'not done', "\t";
    print {$fh} $rdCountHoH{ DECONTAMINATE }{ $sample . '_1_orphan.fas
+tq' } // 'not done', "\t";
    print {$fh} $rdCountHoH{ DECONTAMINATE }{ $sample . '_2_orphan.fas
+tq' } // 'not done', "\n";

    # compress and archive each sample
    chdir 'decontaminated/';
    system( "tar -cvzf ../$sample-QC.tar.gz $sample*" ) == 0 or die "C
+annot archive $sample: $!";
    chdir '../';
}
# delete uncompressed decontaminated/files
remove_tree( './decontaminated' );


#print Dumper( \ %rdCountHoH ); use Data::Dumper;

my $duration = time - $start;
printf "%s %.2f\n", "Execution time (minutes): ", $duration / 60; # ex
+ecution time in minutes

# lifted from List::MoreUtils for portability to Odyssey
# makes copy of array and destroys it.
sub natatime ($@) {
    my $n = shift;
    my @list = @_;

    return sub {
        return splice @list, 0, $n;
    }
}
[download]

In reply to capturing output of backticks in child process... or something completely different by abualiga

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&`
	<		`<`
	>		`>`
	[		`[`
	]		`]`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.