use strict;
use warnings;
use threads;
use threads::shared;

# Partition data across 3 threads (4 AREFs each) and spawn thread jobs.
# Each subset is an AREF of indices into @data_entire.
#
# NOTE(review): refs stored in a :shared array must themselves be shared
# (threads::shared::shared_clone) — assumed done when $aref1..$aref12 and
# the subsets are built; confirm against the construction code.
my @data_entire :shared = ($aref1, $aref2, ... $aref12);   # placeholder list
my @partitions  :shared = ($subset1, $subset2, $subset3);  # placeholder list

# Shared cross-thread state. Declared here so the snippet compiles under
# strict; in the real file these may live elsewhere.
my %completed      :shared;   # set of $outer entries already fully processed
my %results_global :shared;   # merged results from all threads

my @jobs;
foreach my $subset (@partitions) {
    # threads->create is the documented spelling (threads->new is a legacy alias).
    push @jobs, threads->create(\&XYZ, $subset);
}
$_->join() foreach @jobs;

# ===================== In a thread-local sub() =======================

# XYZ — worker body for one thread.
# Takes one argument: an AREF of indices selecting this thread's chunk
# of @data_entire. Computes pairwise set intersections between each
# element of the chunk and every element of @data_entire, accumulating
# into a thread-LOCAL hash, then merges into %results_global exactly
# once at the end to minimise lock contention.
sub XYZ {
    # BUG FIX: thread arguments arrive in @_, not $_. The original read
    # @{ $_ }, so the subset AREF was never actually received.
    my ($indices) = @_;

    my @data_subset = @data_entire[ @{$indices} ];   # this thread's chunk

    # BUG FIX: originally declared as %results but written as
    # %results_local — unify on %results_local.
    my %results_local;

  OUTER: foreach my $outer (@data_subset) {
        retrieve($outer);

      INNER: foreach my $inner (@data_entire) {
            # Skip self. == on two refs compares their addresses, which
            # is identity here because both come from @data_entire.
            next INNER if $inner == $outer;          # fixed missing ';'

            # BUG FIX: unbalanced paren in the original; also take the
            # lock — concurrent reads of a shared hash being written by
            # another thread need the same lock as the writers.
            {
                lock(%completed);
                next INNER if exists $completed{$inner};   # already done
            }

            retrieve($inner);                        # EXPENSIVE

            # set_intersection() returns two AREFs.
            my ($result_oi, $result_io) = set_intersection($outer, $inner);

            # Store to the thread-local hash only; the shared hash is
            # touched once, after the OUTER loop.
            $results_local{$outer}{$inner} = $result_oi;
            $results_local{$inner}{$outer} = $result_io;
        }

        {
            lock(%completed);
            # BUG FIX: was "= ()" which stores undef; an explicit 1 makes
            # the "done" flag unambiguous (exists-checks still work).
            $completed{$outer} = 1;
        }
    }

    # By updating the globally shared hash of the local results all at
    # once, just before this spawned thread returns to the main thread,
    # the write-contention window on %results_global is reduced
    # significantly. In practice this made a HUGE difference: 80
    # hyperthreaded cores @ 100% (40 physical) versus ~13 cores when
    # updating %results_global after every set_intersection() call
    # (single-memory machine).
    #
    # NOTE(review): autovivifying a nested hash inside a :shared hash
    # dies unless the inner hash is itself shared — confirm the inner
    # level is created with shared_clone()/&share elsewhere.
    {
        lock(%results_global);
        foreach my $outer (keys %results_local) {
            $results_global{$outer}{$_} = $results_local{$outer}{$_}
                foreach keys %{ $results_local{$outer} };
        }
    }

    return;
}