Hello pmpmmpmp and welcome to the monastery.
Update: Updated the demonstration. The karlhash function requires MCE 1.828 to be released soon.
After responding to karlgoethebier's post, realized that I wasn't incrementing the $bc_pair_num field. Running parallel may become important when the operation is complex and requires extra CPU time which is not the case here. For learning reasons, made this demo simply to better understand parallelization for this use case. Thank you. Also credits to several monks whom have responded with elegant solutions, which I borrowed in making this demonstration.
# to run
perl demo.pl haukex
perl demo.pl karlary
perl demo.pl karlseq
Today, enjoyed writing Perl code.
#!/usr/bin/env perl
# Re^2: search for particular elements of hash with multiple values
# http://www.perlmonks.org/?node_id=1188488
use strict;
use warnings;
use feature qw ( say );
use MCE::Loop;
use constant {
_FOWPRIM => 0, _REVPRIM => 1, _BC_PAIR => 2, _BC_PNUM => 3
};
# Note: The karlhash function requires MCE 1.828 to be released soon.
sub usage {
warn "usage: $0 ( haukex | karlary | karlhash | karlseq )\n\n";
exit 1;
}
my $func_name = shift || usage();
usage() unless __PACKAGE__->can($func_name);
my $max = 1000000;
my $data =
[ 'AGCTCGTTGTTCGATCCA', 'GAGAGATAGATGATAGTG', 'TTTT_CCCC', 0 ];
our %barcode_hash = map { $_ => $data } 1 .. $max - 2;
$barcode_hash{ ($max - 1) } =
[ 'AGCTCGTTGTTCGATCCA', 'GAGAGATAGATGATAGTG', 'TTTT_AAAA', 0 ];
$barcode_hash{ ($max) } =
[ 'AGCTCGTTGTTCGATCCA', 'GAGAGATAGATGATAGTG', 'TTTT_AAAA', 0 ];
our $barcode_pair_35 = 'TTTT_AAAA';
$| = 1;
{
no strict qw ( refs );
my @found = $func_name->();
# increment the bc_pair_num field
$barcode_hash{$_}[_BC_PNUM]++ for @found;
# display summary and found keys
printf "Found %d keys\n", scalar @found;
say join ', ', sort { $a <=> $b } @found;
}
exit(0);
sub haukex {
# Serial code.
my @found;
for my $key (1 .. $max) {
if ($barcode_hash{$key}[_BC_PAIR] eq $barcode_pair_35) {
say "Found at $key";
push @found, $key;
}
}
return @found;
}
sub karlary {
# Workers receive next array chunk.
MCE::Loop::init {
max_workers => 4
};
my @found = mce_loop {
my ($mce, $chunk_ref, $chunk_id) = @_;
for my $key ( @$chunk_ref ) {
if ($barcode_hash{$key}[_BC_PAIR] eq $barcode_pair_35) {
MCE->say("Found at $key, chunk_id: $chunk_id");
MCE->gather($key);
}
}
}
1 .. $max;
MCE::Loop::finish();
return @found;
}
sub karlhash {
# Workers receive next array chunk.
MCE::Loop::init {
max_workers => 4
};
# Must have MCE 1.828 or later to run this demonstration.
# See http://www.perlmonks.org/?node_id=1188593 for a faster
# solution using hashref as input by spawning workers early.
# That will run better on the Windows platform.
my @found = mce_loop {
my ($mce, $chunk_ref, $chunk_id) = @_;
for my $key ( keys %{ $chunk_ref } ) {
if ($chunk_ref->{$key}[_BC_PAIR] eq $barcode_pair_35) {
MCE->say("Found at $key, chunk_id: $chunk_id");
MCE->gather($key);
}
}
}
\%barcode_hash;
MCE::Loop::finish();
return @found;
}
sub karlseq {
# Workers receive next sequence 'begin' and 'end' boundaries.
MCE::Loop::init {
max_workers => 4, bounds_only => 1
};
# The bounds_only option applies to sequence which means for
# workers to compute the begin and end boundaries only and
# not the items in between. The result is lesser overhead.
my @found = mce_loop_s {
my ($mce, $chunk_ref, $chunk_id) = @_;
for my $key ($chunk_ref->[0] .. $chunk_ref->[1]) {
if ($barcode_hash{$key}[_BC_PAIR] eq $barcode_pair_35) {
MCE->say("Found at $key, chunk_id: $chunk_id");
MCE->gather($key);
}
}
}
1, $max;
MCE::Loop::finish();
return @found;
}
Regards, Mario.