use strict;
use warnings;

use MCE::Loop;
use MCE::Candy;

# Parallel keyword filter.
#
# Reads FILE, keeps its first (header) line plus every following line that
# contains KEYWORD as a literal, case-sensitive substring, and writes the
# result to "<FILE without its extension>_filter.txt".
#
# Usage: script file keyword

# ensure two arguments are provided to the script
die "usage: $0 file keyword\n" if @ARGV != 2;
my ($fileName, $keyword) = @ARGV;

# grab header line
# (three-arg open with "or": with "||" the die was bound to the file name
# string and could never fire)
open my $in_fh, '<', $fileName
    or die "cannot open file to read: $!";
my $header = <$in_fh>;
close $in_fh;
$header = '' unless defined $header;    # empty input file: nothing to echo

# utilize many core engine to filter file
# out_iter_array returns a closure for gathering orderly
my @rawData;

MCE::Loop::init {
    max_workers => 'auto',    # note: 'auto' is never higher than 8
    gather      => MCE::Candy::out_iter_array(\@rawData),
    use_slurpio => 1,
};

mce_loop_f {
    my ($mce, $slurped_ref, $chunk_id) = @_;

    my ($count, $foundData) = (0, '');

    # quickly determine if the keyword is found anywhere in the chunk;
    # think of this as short-circuiting unnecessary per-line work
    if (index($$slurped_ref, $keyword) >= 0) {
        open my $MEM_FH, '<', $slurped_ref
            or die "cannot open chunk $chunk_id in memory: $!";
        binmode $MEM_FH, ':raw';

        # The header is the first line of the first chunk only. Discarding
        # it up front keeps the per-line loop free of a line-number check.
        if ($chunk_id == 1) {
            my $skipped_header = <$MEM_FH>;
        }

        while (my $line = <$MEM_FH>) {
            # Literal substring test, consistent with the chunk pre-check
            # above (a regex match would treat metacharacters in the
            # keyword specially and disagree with index()).
            next if index($line, $keyword) < 0;
            $foundData .= $line;    # append line
            $count++;               # increment count
        }

        close $MEM_FH;
    }

    # gathers two elements; count and rawData in anonymous array
    # gather must be called regardless of whether anything was found:
    # the manager process needs to know that this chunk_id has completed
    # when gathering results orderly
    MCE->gather($chunk_id, [ $count, $foundData ]);

} $fileName;

MCE::Loop::finish;    # shutdown MCE workers

# each element in rawData is an array ref [ $count, $foundData ]

# output count
my $filterCount = 0;
$filterCount += $_->[0] for @rawData;    # $count

print "Completed filtering $keyword\n";
print "Found $filterCount elements\n";

# output found data
# Strip one trailing ".ext" of any length; names without an extension
# (or dot-files such as ".hidden") are used unchanged as the base name.
my $baseName = $fileName =~ /\A(.*[^\/\\])\.[^.\/\\]+\z/s ? $1 : $fileName;
my $outFileName = $baseName . "_filter.txt";
print "Filtering to output file: $outFileName\n";

open my $out_fh, '>', $outFileName
    or die "cannot open file to write: $!";
print {$out_fh} $header;
print {$out_fh} $_->[1] for @rawData;    # $foundData
# buffered write errors only surface at close, so check it
close $out_fh
    or die "cannot close $outFileName: $!";