in reply to Re^2: Weird performance issue with Strawberries and Inline::C
in thread Weird performance issue with Strawberries and Inline::C
(OP here again, with long-forgotten PWC 342-2, exploring depths in muddy waters around 4-5 lines of trivial C)
What a pity, I only have 4 cores and can't achieve more than 4x increase, would be fun to watch
Fun should never be denied:
v5.42.0 / MSWin32-x64-multi-thread / 13.2.0 String length: 10,000 Rate/s % c 87663 100 va 622005 710 String length: 100,000 Rate/s % c 8875 100 c_omp 15439 174 va 71646 807 String length: 1,000,000 Rate/s % c 891 100 c_omp 2307 259 va 12770 1434 String length: 10,000,000 Rate/s % c 88.3 100 c_omp 361.2 409 va 1615.6 1829 String length: 100,000,000 Rate/s % c 8.8 100 c_omp 36.2 410 va 167.7 1900
'c' and 'c_omp' are for 'mscore_c' (straightforward i.e. "dumb") and 'mscore_c_omp' found in this thread. The 'va' stands for 'vectors, assembly'; where OMP (I have 4 cores) is "automatically" switched on for strings longer than 4e5. CPU is supposed to support AVX2 i.e. to be 2017+.
There's a "cheat": sequential runs of either '0' or '1's, in target string, shouldn't be longer than 2**31. Because current (running) scores are maintained (dragged along) as 8 per 256-bit registers. Perhaps there is more optimal way to find (horizontal) cumulative sums along 16 bytes, other than 4 shifts/shuffles and 4 additions. What's really funny is that to find (rather, maintain) running maximum over 32 numbers I'm only doing 3 comparisons. But maybe all of the above (i.e. below) looks not funny but ugly to "people who know how".
use Inline C => Config => ccflagsex => '-fopenmp', libs => '-lgomp -ldl'; use Inline C => << 'END_OF_C'; #include <omp.h> #define MANY 400000 #define MAX_CORES 4 void _helper_a( char* buf, size_t len, int32_t* acc, int32_t* cum, int32_t* max ); void _helper_c( char* buf, size_t len, int32_t* acc, int32_t* cum, int32_t* max ); int mscore_va( SV* str ) { STRLEN len; char* buf = SvPVbyte( str, len ); int ncores = 1; if ( len >= MANY ) { int n = omp_get_num_procs(); ncores = n > MAX_CORES ? MAX_CORES : n; } int nparts = 2 + ncores; int32_t acc_a[ nparts ]; int32_t cum_a[ nparts ]; int32_t max_a[ nparts ]; memset( acc_a, 0x00, sizeof( acc_a )); memset( cum_a, 0x00, sizeof( cum_a )); memset( max_a, 0xFF, sizeof( max_a )); len --; int32_t prefix_len = ( size_t ) buf % 32; if ( len < prefix_len ) prefix_len = len; int32_t suffix_len = ( len - prefix_len ) % 32; int32_t aligned_len = len - prefix_len - suffix_len; char* prefix_start = buf; char* aligned_start = buf + prefix_len; char* suffix_start = aligned_start + aligned_len; if ( prefix_len > 0 ) { _helper_c( prefix_start, prefix_len, acc_a, cum_a, max_a ); } if ( suffix_len > 0 ) { _helper_c( suffix_start, suffix_len, &( acc_a[ nparts - 1 ]), &( cum_a[ nparts - 1 ]), &( max_a[ nparts - 1 ])); } if ( aligned_len > 0 ) { if ( ncores == 1 ) { _helper_a( aligned_start, aligned_len, &( acc_a[ 1 ]), &( cum_a[ 1 ]), &( max_a[ 1 ])); } else { size_t psize = len / 32 / ncores * 32; #pragma omp parallel for schedule( static, 1 ) \ num_threads( ncores ) for ( int j = 0; j < ncores; j ++ ) { char* start = aligned_start + j * psize; size_t len_ = ( j < ncores - 1 ) ? psize : aligned_len + - j * psize; _helper_a( start, len_, &( acc_a[ j + 1 ]), &( cum_a[ j + 1 ]), &( max_a[ j + 1 ])); } } } int32_t acc = acc_a[ 0 ]; int32_t max = max_a[ 0 ]; for ( int j = 1; j < nparts; j ++ ) { acc += acc_a[ j ]; cum_a[ j ] += cum_a[ j - 1 ]; max_a[ j ] += cum_a[ j - 1 ]; if ( max_a[ j ] > max ) max = max_a[ j ]; } return acc + max + ( buf[ len ] == '1' ); } void _helper_c( char* buf, size_t len, // in int32_t* acc, int32_t* cum, int32_t* max ) { // out for( int i = 0; i < len; i ++ ) { int one = buf[ i ] - '0'; *acc += one; *cum += one ? -1 : 1; if ( *cum > *max ) *max = *cum; } } void _helper_a( char* buf, size_t len, // in int32_t* acc, int32_t* cum, int32_t* max ) { // out void* fin = buf + len; int32_t acc_, cum_, max_; __asm__ ( // " IDDQD \n" " XOR %[acc], %[acc] \n" " MOV $0x00, %%r11 \n" " MOVQ %%r11, %%xmm0 \n" " VPBROADCASTB %%xmm0, %%ymm0 \n" // ymm0 = cum " MOV $0xFF, %%r11 \n" " MOVQ %%r11, %%xmm1 \n" " VPBROADCASTB %%xmm1, %%ymm1 \n" // ymm1 = max " MOV $0x61, %%r11 \n" " MOVQ %%r11, %%xmm2 \n" " VPBROADCASTB %%xmm2, %%ymm2 \n" // ymm2 = 97 x 32 " MOV $0x0F, %%r11 \n" " MOVQ %%r11, %%xmm6 \n" " VPBROADCASTB %%xmm6, %%ymm6 \n" // ymm6 = 15 x 32 " MOV %[buf], %%r8 \n" " MOV %[fin], %%r9 \n" " MOV $32, %%r10 \n" ".L2_: \n" " VMOVDQA (%%r8), %%ymm3 \n" " VPADDB %%ymm3, %%ymm3, %%ymm3 \n" " VPSUBB %%ymm3, %%ymm2, %%ymm3 \n" " VPMOVMSKB %%ymm3, %%ecx \n" " POPCNT %%ecx, %%ecx \n" " ADD %%ecx, %[acc] \n" " VPSLLDQ $1, %%ymm3, %%ymm4 \n" " VPADDSB %%ymm3, %%ymm4, %%ymm3 \n" " VPSLLDQ $2, %%ymm3, %%ymm4 \n" " VPADDSB %%ymm3, %%ymm4, %%ymm3 \n" " VPSLLDQ $4, %%ymm3, %%ymm4 \n" " VPADDSB %%ymm3, %%ymm4, %%ymm3 \n" " VPSLLDQ $8, %%ymm3, %%ymm4 \n" " VPADDSB %%ymm3, %%ymm4, %%ymm3 \n" " VPSHUFB %%ymm6, %%ymm3, %%ymm4 \n" " VPSHUFD $0b01001110, %%ymm3, %%ymm5 \n" " VPMAXSB %%ymm3, %%ymm5, %%ymm3 \n" " VPMOVSXBD %%xmm3, %%ymm5 \n" " VPADDD %%ymm0, %%ymm5, %%ymm5 \n" " VPMAXSD %%ymm1, %%ymm5, %%ymm1 \n" " VPMOVSXBD %%xmm4, %%ymm5 \n" " VPADDD %%ymm0, %%ymm5, %%ymm0 \n" " VPERMQ $0b01001110, %%ymm3, %%ymm3 \n" " VPERMQ $0b01001110, %%ymm4, %%ymm4 \n" " VPMOVSXBD %%xmm3, %%ymm5 \n" " VPADDD %%ymm0, %%ymm5, %%ymm5 \n" " VPMAXSD %%ymm1, %%ymm5, %%ymm1 \n" " VPMOVSXBD %%xmm4, %%ymm5 \n" " VPADDD %%ymm0, %%ymm5, %%ymm0 \n" " ADD %%r10, %%r8 \n" " CMP %%r9, %%r8 \n" " JL .L2_ \n" " VPERMQ $0b00111001, %%ymm1, %%ymm4 \n" " VPMAXSD %%ymm1, %%ymm4, %%ymm1 \n" " VPERMQ $0b01001110, %%ymm1, %%ymm4 \n" " VPMAXSD %%ymm1, %%ymm4, %%ymm1 \n" " PSHUFD $0b00111001, %%xmm1, %%xmm4 \n" " PMAXSD %%xmm4, %%xmm1 \n" " MOVD %%xmm1, %[max] \n" " MOVD %%xmm0, %[cum] \n" : [acc] "=r" ( acc_ ), [cum] "=rm" ( cum_ ), [max] "=rm" ( max_ ) : [buf] "m" ( buf ), [fin] "m" ( fin ) : "cc", "rcx", "r8", "r9", "r10", "r11", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6" ); *acc = acc_; *cum = cum_; *max = max_; } END_OF_C
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^4: Weird performance issue with Strawberries and Inline::C
by Anonymous Monk on Nov 09, 2025 at 17:41 UTC | |
by Anonymous Monk on Nov 12, 2025 at 13:53 UTC | |
by Anonymous Monk on Nov 17, 2025 at 14:54 UTC |