use strict;
use warnings;
use threads;
use threads::shared;

# Partition data across 3 threads (4 AREFs each) and spawn thread jobs.
# Each subset is an AREF of indices into @data_entire.
#
# NOTE(review): refs stored in a :shared array must themselves be shared
# (threads::shared::shared_clone) — assumed done when $aref1..$aref12 and
# the subsets are built; confirm against the construction code.
my @data_entire :shared = ($aref1, $aref2, ... $aref12);   # placeholder list
my @partitions  :shared = ($subset1, $subset2, $subset3);  # placeholder list

# Shared cross-thread state. Declared here so the snippet compiles under
# strict; in the real file these may live elsewhere.
my %completed      :shared;   # set of $outer entries already fully processed
my %results_global :shared;   # merged results from all threads

my @jobs;
foreach my $subset (@partitions) {
    # threads->create is the documented spelling (threads->new is a legacy alias).
    push @jobs, threads->create(\&XYZ, $subset);
}
$_->join() foreach @jobs;

# ===================== In a thread-local sub() =======================

# XYZ — worker body for one thread.
# Takes one argument: an AREF of indices selecting this thread's chunk
# of @data_entire. Computes pairwise set intersections between each
# element of the chunk and every element of @data_entire, accumulating
# into a thread-LOCAL hash, then merges into %results_global exactly
# once at the end to minimise lock contention.
sub XYZ {
    # BUG FIX: thread arguments arrive in @_, not $_. The original read
    # @{ $_ }, so the subset AREF was never actually received.
    my ($indices) = @_;

    my @data_subset = @data_entire[ @{$indices} ];   # this thread's chunk

    # BUG FIX: originally declared as %results but written as
    # %results_local — unify on %results_local.
    my %results_local;

  OUTER: foreach my $outer (@data_subset) {
        retrieve($outer);

      INNER: foreach my $inner (@data_entire) {
            # Skip self. == on two refs compares their addresses, which
            # is identity here because both come from @data_entire.
            next INNER if $inner == $outer;          # fixed missing ';'

            # BUG FIX: unbalanced paren in the original; also take the
            # lock — concurrent reads of a shared hash being written by
            # another thread need the same lock as the writers.
            {
                lock(%completed);
                next INNER if exists $completed{$inner};   # already done
            }

            retrieve($inner);                        # EXPENSIVE

            # set_intersection() returns two AREFs.
            my ($result_oi, $result_io) = set_intersection($outer, $inner);

            # Store to the thread-local hash only; the shared hash is
            # touched once, after the OUTER loop.
            $results_local{$outer}{$inner} = $result_oi;
            $results_local{$inner}{$outer} = $result_io;
        }

        {
            lock(%completed);
            # BUG FIX: was "= ()" which stores undef; an explicit 1 makes
            # the "done" flag unambiguous (exists-checks still work).
            $completed{$outer} = 1;
        }
    }

    # By updating the globally shared hash of the local results all at
    # once, just before this spawned thread returns to the main thread,
    # the write-contention window on %results_global is reduced
    # significantly. In practice this made a HUGE difference: 80
    # hyperthreaded cores @ 100% (40 physical) versus ~13 cores when
    # updating %results_global after every set_intersection() call
    # (single-memory machine).
    #
    # NOTE(review): autovivifying a nested hash inside a :shared hash
    # dies unless the inner hash is itself shared — confirm the inner
    # level is created with shared_clone()/&share elsewhere.
    {
        lock(%results_global);
        foreach my $outer (keys %results_local) {
            $results_global{$outer}{$_} = $results_local{$outer}{$_}
                foreach keys %{ $results_local{$outer} };
        }
    }

    return;
}