use constant { POSTED => 0, VOTED => 1, RATIO => 2 }; ## Accumulate users only if they voted, and count their votes. my %users; open BIGFILE; while( ) { my( $user, $thread, $voted ) = split; ++$users{ $user } if $voted; } close BIGFILE; ## Discard all users who have voted less times than a sensible threshold. $users{ $_ } < MIN_VOTES_THRESHOLD and delete $users{ $_ } for keys %users; ## Re-scan the file, accumulating counts of posts and votes ## *for those userids remaining in %users only* ## Assumes file ordered by threadid. my %pairs; open BIGFILE; my( $user, $thread, $voted ) = split ' ', ; my $lastThread = $thread; MAINLOOP: while( 1 ) { my @users; while( $thread == $lastThread ) { ## Accumulate users/votes in each thread push @users, "$user:$voted"; ( $user, $thread, $voted ) = split ' ', ; last MAINLOOP if eof( BIGFILE ); }; $lastThread = $thread; ## Permute them to generate pairs for my $pair ( Cnr 2, @users ) { my( $user1, $voted1 ) = split ':', $pair->[ 0 ]; my( $user2, $voted2 ) = split ':', $pair->[ 1 ]; ## Skip if either is not in the 'high voters' list next unless exists $users{ $user1 } and exists $users{ $user2 }; ## Otherwise increment the coincident pair count (and vote if applicable). my $pair = pack 'LL', $user1,$user2; ++$pairs{ $pair }[ POSTED ]; ++$pairs{ $pair }[ VOTED ] if $voted1; } } ## Scan the pairs generating a ratio of votes to posts. my( $totalRatio, $maxRatio ) = ( 0 ) x 2; for ( keys %pairs ) { my $pair = $pairs{ $_ }; $pair->[ RATIO ] = ( $pair->[ VOTED ]||0 ) / $pair->[ POSTED ]; $totalRatio += $pair->[ RATIO ]; $maxRatio = $pair->[ RATIO ] if $maxRatio < $pair->[ RATIO ]; } ## The average ratio of pairwise votes to posts might form the basis for discrimination my $averageRatio = $totalRatio / ( keys %pairs||1 ); printf "The voted/posted ratios averaged to %f; with a maximum of %f\n", $averageRatio, $maxRatio; ## Display those pairs with a vote/post ratio above teh threshold. $pairs{ $_ }[ RATIO ] > POST_VOTE_THRESHOLD and print "Pair @{[ unpack 'LL', $_ ]} had a vote/post ratio of @{ $pairs{ $_ } }[ POSTED, VOTED, RATIO ]" for keys %pairs;