# Shared bookkeeping used by the main thread and the worker threads.
my %completed      :shared = ();    # Record completed datasets
my %results_global :shared = ();    # Global; holds intersections

# Worker threads that still have to be joined. Only the main thread
# touches this list, so it is deliberately NOT shared: a shared array can
# only hold plain scalars or references to shared data, which rules out
# thread objects.
my @jobs = ();

# Since I know what to call the different subsets up front, I'll pre-
# share them. In my experience, this is *much* faster than calling
# shared_clone({}) for hash values that are deeply branching HREFs.
#
# NOTE: this loop has to run after @data_entire has been populated,
# otherwise nothing is pre-shared.
#
# share() is called with a leading '&' on purpose: that bypasses its
# prototype so it accepts an anonymous hash/array reference.
foreach my $outer (@data_entire) {
    $results_global{$outer} = &share({});
    foreach my $inner (@data_entire) {
        $results_global{$outer}->{$inner} = &share([]);    # Hold AREFs
    }
} #OUTER
####
# Partition data across 3 threads (4 AREFs/ea) and spawn thread jobs.
# Each subset is an AREF of indices in @data_entire.
#
# NOTE(review): a :shared array can only hold plain scalars or references
# to shared data, so every $arefN must already be shared (e.g. built with
# &share([]) or shared_clone()) before this assignment -- confirm.
my @data_entire :shared = (
    $aref1, $aref2,  $aref3,  $aref4,
    $aref5, $aref6,  $aref7,  $aref8,
    $aref9, $aref10, $aref11, $aref12,
);

# Only the main thread walks the partitions, and each subset is handed to
# its worker as a thread argument, so this list does not need to be shared.
my @partitions = ($subset1, $subset2, $subset3);

foreach my $subset (@partitions) {
    my $thr = threads->new(\&XYZ, $subset);    # Create thread
    push @jobs, $thr->tid();                   # Store its numeric thread ID
}

# Wait for every worker. threads->object() maps a thread ID back to its
# thread object and returns undef for a thread that is no longer joinable.
foreach my $tid (@jobs) {
    my $thr = threads->object($tid) or next;
    $thr->join();
}
# ==================== In a thread-local sub() ======================
# Thread entry point: intersect every dataset in this thread's partition
# with every dataset in @data_entire, then publish the results.
#
# Args:    $_[0] - AREF of indices into @data_entire (this thread's chunk).
# Returns: nothing. Results are merged into the shared %results_global and
#          each finished dataset is recorded in the shared %completed.
#
# NOTE(review): datasets are identified here by numeric comparison (==)
# and by using the dataset value as a hash key. If the datasets are shared
# references, threads::shared documents address-based comparison as
# unreliable (it recommends is_shared()), and a stringified reference may
# not be a stable key across threads -- confirm what identifies a dataset.
sub XYZ {
    my ($subset_idx)  = @_;
    my @data_subset   = @data_entire[ @{ $subset_idx || [] } ];  # thread's chunk
    my %results_local = ();

    OUTER: foreach my $outer (@data_subset) {
        retrieve($outer);
        INNER: foreach my $inner (@data_entire) {
            next INNER if $inner == $outer;             # skip self
            next INNER if exists $completed{$inner};    # already done
            retrieve($inner);                           # EXPENSIVE
            # Return two AREFs
            my ($result_oi, $result_io)
                = set_intersection($outer, $inner);
            # Store to thread-local variable
            $results_local{$outer}->{$inner} = $result_oi;
            $results_local{$inner}->{$outer} = $result_io;
        }
        {
            lock(%completed);                           # Update global
            $completed{$outer} = 1;    # only the key's existence is tested
        }
    }

    # By updating the globally shared hash of the local results all
    # at once and just before this spawned thread returns to the main
    # thread, the write contention time to %results_global is reduced
    # significantly. In practice, this made a HUGE difference and
    # allowed me to use 80 hyperthreaded cores @ 100% (40 physical),
    # whereas updating %results_global after each set_intersection()
    # reduced it to maybe 13 cores @ 100% usage (this is on a single
    # memory machine).
    #
    # Update globally shared hash of results with local results
    {
        lock(%results_global);
        foreach my $outer (keys %results_local) {
            # A shared hash may only hold shared references, so create the
            # row explicitly instead of autovivifying an unshared hash.
            $results_global{$outer} = &share({})
                unless exists $results_global{$outer};
            foreach my $inner (keys %{ $results_local{$outer} }) {
                # shared_clone() copies unshared data into shared storage
                # and passes already-shared data through unchanged.
                $results_global{$outer}->{$inner}
                    = shared_clone($results_local{$outer}->{$inner});
            }
        } #OUTER
    }
    return;
}
# ===================================================================
####
# Walltime estimate:
#   (85,750,000 sec) / (80 cores)
#     = 1,071,875 sec/core
#     ~ 298 hrs/core     [x (1 hr / 3600 sec)]
#     ~ 12.4 days/core   [x (1 day / 24 hrs)]   <--- Walltime for this op.