# a few bits grouped in the LSBs of the unsigned long
Rate 1 4 3 2 4b 1b 4a 1a 7 6 5 7a
1 349/s -- -17% -47% -50% -52% -53% -53% -55% -75% -76% -81% -86%
4 420/s 20% -- -36% -40% -42% -43% -44% -46% -70% -71% -77% -83%
3 657/s 88% 56% -- -6% -9% -11% -12% -15% -53% -55% -64% -74%
2 696/s 99% 66% 6% -- -4% -6% -6% -10% -50% -52% -62% -72%
4b 724/s 108% 73% 10% 4% -- -2% -3% -6% -48% -50% -61% -71%
1b 740/s 112% 76% 13% 6% 2% -- -1% -4% -47% -49% -60% -71%
4a 745/s 113% 77% 13% 7% 3% 1% -- -4% -46% -49% -59% -71%
1a 772/s 121% 84% 18% 11% 7% 4% 4% -- -44% -47% -58% -70%
7 1386/s 297% 230% 111% 99% 91% 87% 86% 80% -- -4% -24% -45%
6 1449/s 315% 245% 121% 108% 100% 96% 95% 88% 5% -- -21% -43%
5 1834/s 425% 337% 179% 163% 153% 148% 146% 138% 32% 27% -- -28%
7a 2532/s 625% 503% 285% 264% 249% 242% 240% 228% 83% 75% 38% --
# a few bits grouped in the MSBs of the unsigned long
Rate 1b 1a 4b 4a 2 1 4 5 3 6 7 7a
1b 256/s -- -2% -4% -7% -9% -24% -39% -60% -61% -81% -81% -90%
1a 260/s 2% -- -2% -5% -8% -23% -38% -59% -61% -81% -81% -90%
4b 266/s 4% 2% -- -3% -5% -21% -36% -58% -60% -80% -80% -90%
4a 274/s 7% 5% 3% -- -3% -19% -34% -57% -59% -80% -80% -89%
2 282/s 10% 8% 6% 3% -- -17% -33% -56% -57% -79% -79% -89%
1 338/s 32% 30% 27% 23% 20% -- -19% -47% -49% -75% -75% -87%
4 417/s 63% 60% 57% 52% 48% 23% -- -35% -37% -69% -69% -84%
5 638/s 149% 146% 140% 133% 127% 89% 53% -- -4% -53% -53% -75%
3 662/s 159% 155% 149% 142% 135% 96% 59% 4% -- -51% -51% -74%
6 1353/s 429% 421% 408% 394% 381% 300% 224% 112% 104% -- -1% -47%
7 1361/s 432% 424% 411% 397% 383% 303% 226% 113% 105% 1% -- -47%
7a 2576/s 906% 891% 867% 841% 815% 662% 517% 304% 289% 90% 89% --
- is the original grep solution I posted.
1a) & 1b) are minor variations.
- is a Perl implementation of your C algorithm.
- is 1 with the loop unrolled.
- (& 4a & 4b) use grep with pack and vec.
- is a custom evaled Perl subroutine that unrolls the grep loop and uses log2( value ) to avoid testing the higher bits.
This performs surprisingly well for small numbers of bits in the LSBs of the scalar.
- is your Inline::C implementation.
- is an Inline::C version that uses (a custom C) log2( value ) to skip over the high bits.
7a is the above, but pushing the results on the XS stack rather than creating and filling a AV* which has to be dereferenced in the calling code.
The affect of that 6 to 9 times improvement in the performance of this relatively simple code is that 14% has dropped to 1% to 2%, and so cut 2 1/2 to 3 hours off the overall runtime. Many thanks.
My chosen solution:
use Inline C => <<'__C_CODE__';
static const char LogTable256[] = {
0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
};
unsigned log2i( unsigned int v ) { // 32-bit word to find the log of
register unsigned int t, tt; // temporaries
if (tt = v >> 16) {
return( ( t = tt >> 8 )
? 24 + LogTable256[ t ]
: 16 + LogTable256[ tt ] );
}
else {
return( ( t = v >> 8 )
? 8 + LogTable256[ t ]
: LogTable256[ v ] );
}
}
void b2i7a( unsigned int value ) {
register int i = log2i( (value-1) ^ value );
Inline_Stack_Vars;
Inline_Stack_Reset;
for( value >>= i; value; i++, value >>= 1 )
if( value & 1 )
Inline_Stack_Push( sv_2mortal( newSVuv( i ) ) );
Inline_Stack_Done;
return;
}
__C_CODE__
Comments, improvements & warnings welcomed. |