. Instead of a simple insertion sort on the top list, you could do a binary insertion sort. This starts to pay off as the value N increases.
I made a new benchmark with topN (by
, with a small fix I added to put back the first short-circuit test), topNbs (based on the same code but with a binary search to find the insert point),
#!/usr/bin/perl
use strict;
use warnings;
use List::Util qw( reduce );
use Benchmark qw( cmpthese );
use Algorithm::Numerical::Shuffle qw( shuffle );
use Inline C => 'DATA';
my %code = (
topN => sub {
my ( $n, $list ) = @_;
return topN($n, $list);
},
topNbs => sub {
my ( $n, $list ) = @_;
return topNbs($n, $list);
},
baseline => sub {
my ( $n, $list ) = @_;
return ( sort { $a <=> $b } @$list )[ @$list - $n .. $#$list ]
+;
},
limbic => sub {
my ($x, $list) = @_;
$x--;
my @top;
$#top = $x;
for my $item ( @$list ) {
next if defined $top[ -1 ] && $item <= $top[ -1 ];
for my $id ( 0 .. $#top ) {
$top[ $id ] = $item and last if ! defined $top[ $id ];
if ( $item > $top[ $id ] ) {
@top[ $id .. $#top ] = ($item, @top[ $id .. $#top
+- 1]);
last;
}
}
}
return @top;
},
browseruk => sub {
my( $n, $aref ) = @_;
my @topN;
push @topN, reduce{
$a > $b && (!@topN || $a < $topN[ -1 ] )
? $a : ( !@topN || $b < $topN[ -1 ] )
? $b : $a;
} @$aref for 1 .. $n;
return @topN;
},
aristotle => sub {
my ( $n, $list ) = @_;
my @top = @$list[ 0 .. $n - 1 ];
@top = ( sort { $a <=> $b } $_, @top )[ 1 .. $n ] for @$list[
+$n .. $#$list ];
return @top;
},
);
my @bench = (
[ qw/ 10 5 / ],
[ qw/ 100 5 / ],
[ qw/ 1000 5 / ],
[ qw/ 10000 5 / ],
[ qw/ 100000 5 / ],
[ qw/ 100 50 / ],
[ qw/ 1000 50 / ],
[ qw/ 10000 50 / ],
[ qw/ 100000 50 / ],
[ qw/ 1000 500 / ],
[ qw/ 10000 500 / ],
[ qw/ 100000 500 / ],
);
$|++;
while( @bench ) {
my ( $max, $n ) = @{ shift @bench };
my $duration = sprintf "%.2g", ( log( $max ) / log( 10 ) ) ** 2;
print "\nLooking for top $n in $max (running for $duration CPU sec
+s)\n";
my @values = 1 .. $max;
my @values_mixed = shuffle(@values);
my @top = ( sort { $a <=> $b } @values )[ @values - $n .. $#values
+ ];
for( keys %code ) {
my @result = sort { $a <=> $b } $code{ $_ }->( $n, \@values_mi
+xed );
die "$_ not ok: [@result] ne [@top]\n" if "@result" ne "@top";
}
cmpthese -$duration => {
map { my $x = $code{ $_ }; $_ => sub { my @x = $x->( $n, \@val
+ues_mixed ) } } keys %code
};
}
__END__
__C__
void topN( int n, AV*data ) {
int *topN;
int len = av_len( data );
int i, j, k;
Inline_Stack_Vars;
Newz( 1, topN, n + 1, int );
for( i = 0; i <= len; i++ ) {
int val = SvIV( *av_fetch( data, i, 0 ) );
if (val <= topN[ n - 1]) continue;
for( j = 0; j < n; j++ ) {
if( topN[ j ] < val ) {
for( k = n; k > j; k-- ) topN[ k ] = topN[ k-1 ];
topN[ j ] = val;
break;
}
}
}
Inline_Stack_Reset;
for( i = 0; i < n; i++ )
Inline_Stack_Push( sv_2mortal( newSViv( topN[ i ] ) ) );
Safefree( topN );
Inline_Stack_Done;
}
void topNbs( int n, AV*data ) {
int *topN;
int len = av_len( data );
int i, j, k;
int left, right;
Inline_Stack_Vars;
Newz( 1, topN, n + 1, int );
for( i = 0; i <= len; i++ ) {
int val = SvIV( *av_fetch( data, i, 0 ) );
if (val <= topN[ n - 1]) continue;
left = 0;
right = n - 1;
while (left < right) {
int middle = (left + right) >> 1;
if (val <= topN[middle]) {
left = middle + 1;
} else {
right = middle;
}
}
for( k = n; k > left; k-- ) topN[ k ] = topN[ k-1 ];
topN[ left ] = val;
}
Inline_Stack_Reset;
for( i = 0; i < n; i++ )
Inline_Stack_Push( sv_2mortal( newSViv( topN[ i ] ) ) );
Safefree( topN );
Inline_Stack_Done;
}
As you can see, the topNbs starts to pay off when we need the top 500 or so. For the top 5, topN is better.