Thanks for this. It prompted me to investigate using Inline::C and was the starting point for my chosen solution.
The results of benchmarking the various implementations in Perl and Inline::C are:
# a few bits grouped in the LSBs of the unsigned long
       Rate    1    4    3    2   4b   1b   4a   1a    7    6    5   7a
1     349/s   -- -17% -47% -50% -52% -53% -53% -55% -75% -76% -81% -86%
4     420/s  20%   -- -36% -40% -42% -43% -44% -46% -70% -71% -77% -83%
3     657/s  88%  56%   --  -6%  -9% -11% -12% -15% -53% -55% -64% -74%
2     696/s  99%  66%   6%   --  -4%  -6%  -6% -10% -50% -52% -62% -72%
4b    724/s 108%  73%  10%   4%   --  -2%  -3%  -6% -48% -50% -61% -71%
1b    740/s 112%  76%  13%   6%   2%   --  -1%  -4% -47% -49% -60% -71%
4a    745/s 113%  77%  13%   7%   3%   1%   --  -4% -46% -49% -59% -71%
1a    772/s 121%  84%  18%  11%   7%   4%   4%   -- -44% -47% -58% -70%
7    1386/s 297% 230% 111%  99%  91%  87%  86%  80%   --  -4% -24% -45%
6    1449/s 315% 245% 121% 108% 100%  96%  95%  88%   5%   -- -21% -43%
5    1834/s 425% 337% 179% 163% 153% 148% 146% 138%  32%  27%   -- -28%
7a   2532/s 625% 503% 285% 264% 249% 242% 240% 228%  83%  75%  38%   --

# a few bits grouped in the MSBs of the unsigned long
       Rate   1b   1a   4b   4a    2    1    4    5    3    6    7   7a
1b    256/s   --  -2%  -4%  -7%  -9% -24% -39% -60% -61% -81% -81% -90%
1a    260/s   2%   --  -2%  -5%  -8% -23% -38% -59% -61% -81% -81% -90%
4b    266/s   4%   2%   --  -3%  -5% -21% -36% -58% -60% -80% -80% -90%
4a    274/s   7%   5%   3%   --  -3% -19% -34% -57% -59% -80% -80% -89%
2     282/s  10%   8%   6%   3%   -- -17% -33% -56% -57% -79% -79% -89%
1     338/s  32%  30%  27%  23%  20%   -- -19% -47% -49% -75% -75% -87%
4     417/s  63%  60%  57%  52%  48%  23%   -- -35% -37% -69% -69% -84%
5     638/s 149% 146% 140% 133% 127%  89%  53%   --  -4% -53% -53% -75%
3     662/s 159% 155% 149% 142% 135%  96%  59%   4%   -- -51% -51% -74%
6    1353/s 429% 421% 408% 394% 381% 300% 224% 112% 104%   --  -1% -47%
7    1361/s 432% 424% 411% 397% 383% 303% 226% 113% 105%   1%   -- -47%
7a   2576/s 906% 891% 867% 841% 815% 662% 517% 304% 289%  90%  89%   --
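(Tables in this form are the standard output of the Benchmark module's cmpthese. For anyone wanting to reproduce the comparison, a minimal sketch of the kind of harness that produces them; apart from b2i7a, the sub names and the test value here are illustrative stand-ins, not the code that was actually timed:)

    use strict; use warnings;
    use Benchmark qw( cmpthese );

    my $value = 0x0000_00ff;        # a few bits grouped in the LSBs
    cmpthese( -3, {                 # run each variant for ~3 CPU seconds
        '1'  => sub { my @p = b2i1(  $value ) },
        '5'  => sub { my @p = b2i5(  $value ) },
        '7a' => sub { my @p = b2i7a( $value ) },
        # ... one entry per implementation ...
    } );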
1a) & 1b) are minor variations.
This performs surprisingly well for small numbers of bits in the LSBs of the scalar.
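(For readers without the earlier code in the thread: a straight pure-Perl scan of the kind being benchmarked, my reconstruction rather than any of the numbered variants, looks something like this:)

    # Bit-by-bit scan (reconstruction, not the original code):
    # returns the positions of the set bits in a 32-bit value.
    sub b2i_pp {
        my $v = shift;
        return grep { $v & ( 1 << $_ ) } 0 .. 31;
    }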
7a (my chosen solution, below) pushes the results onto the XS stack rather than creating and filling an AV*, which then has to be dereferenced in the calling code, as variant 7 does.
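(For contrast, a sketch of the AV*-returning shape; this is an assumption on my part, not the poster's actual variant 7:)

    /* Sketch (assumed, not the original variant 7): build an AV and
       return a reference to it; the Perl caller must then dereference
       the result, e.g.  my @pos = @{ b2i7( $v ) };                   */
    SV* b2i7( unsigned int value ) {
        AV* av = newAV();
        int i  = 0;
        for( ; value; i++, value >>= 1 )
            if( value & 1 )
                av_push( av, newSVuv( i ) );
        return newRV_noinc( (SV*)av );
    }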
The effect of that 6- to 9-times improvement in the performance of this relatively simple code is that its share of the overall runtime has dropped from 14% to 1% or 2%, cutting 2 1/2 to 3 hours off the total run. Many thanks.
My chosen solution:
use Inline C => <<'__C_CODE__';

static const char LogTable256[] = {
    0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
};

// Integer log2 of a 32-bit word, via the lookup table.
unsigned log2i( unsigned int v ) {
    register unsigned int t, tt;    // temporaries
    if( tt = v >> 16 ) {
        return ( t = tt >> 8 ) ? 24 + LogTable256[ t ] : 16 + LogTable256[ tt ];
    }
    else {
        return ( t = v >> 8 ) ? 8 + LogTable256[ t ] : LogTable256[ v ];
    }
}

// Push the positions of the set bits in value onto the Perl stack.
// log2i( (value-1) ^ value ) gives the index of the lowest set bit,
// so the loop skips past the trailing zeros in a single step.
void b2i7a( unsigned int value ) {
    register int i = log2i( (value-1) ^ value );
    Inline_Stack_Vars;
    Inline_Stack_Reset;

    for( value >>= i; value; i++, value >>= 1 )
        if( value & 1 )
            Inline_Stack_Push( sv_2mortal( newSVuv( i ) ) );

    Inline_Stack_Done;
    return;
}

__C_CODE__
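(A quick illustrative check of the interface, with values of my own rather than from the benchmark: because b2i7a pushes its results onto the Perl stack, the bit positions come back as an ordinary list:)

    my @positions = b2i7a( 0b0010_1100 );   # bits 2, 3 and 5 are set
    print "@positions\n";                   # prints: 2 3 5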
Comments, improvements & warnings welcomed.
In reply to Re^4: bit pattern to bit position mapping
by Anonymous Monk
in thread bit pattern to bit position mapping
by Anonymous Monk