use strict; use warnings; use feature 'say'; use Config; use Benchmark qw/ timeit :hireswallclock /; use lib '.'; use Max_score_subs ':all'; say "$^V / $Config{ archname } / $Config{ gccversion }"; my $str; my %tests = ( # C: c => sub { mscore_c( $str )}, # - dumb c_better => sub { mscore_c_better( $str )}, # - tweaked c_omp => sub { mscore_c_omp( $str )}, # - "c", parallel # # Fortran: c2f => sub { mscore_c2f( $str )}, # - "c_better" (in F) f => sub { mscore_f( $str )}, # - "PDL" (in F) f_omp => sub { mscore_f_omp( $str )}, # - "f", parallel ); my $iters = 2e5; my $fmt = '%8.0f'; for my $L ( 1e4, 1e5, 1e6, 1e7, 1e8 ) { say "\nString length: " . $L =~ s/(\d)(?=(\d{3})+$)/$1,/gr; $str = '1' x $L; substr $str, rand $L, 1 , '0' for 1 .. $L; my $any; my %ret; for my $key ( keys %tests ) { my $result = $tests{ $key }-> (); die "<<< $key!!! >>>" unless $result == ( $any //= $result ); my $t = timeit( $iters, $tests{ $key }); $ret{ $key } = $t-> iters / $t-> real } print " Rate/s %\n"; $fmt = '%8.1f' if $L > 1e6; for my $key ( sort { $ret{ $a } <=> $ret{ $b }} keys %ret ) { printf " %-9s $fmt %5.0f\n", $key, $ret{ $key }, 100 * $ret{ $key } / $ret{ c } } $iters /= 10 } __END__ v5.42.0 / MSWin32-x64-multi-thread / 13.2.0 String length: 10,000 Rate/s % f_omp 23804 28 c_omp 25362 29 c 86100 100 c_better 91470 106 c2f 105239 122 f 120134 140 String length: 100,000 Rate/s % c 8724 100 c_better 9324 107 c2f 10686 122 f 12139 139 c_omp 15369 176 f_omp 15964 183 String length: 1,000,000 Rate/s % c 875 100 c_better 931 106 c2f 1068 122 f 1213 139 c_omp 2306 264 f_omp 2560 293 String length: 10,000,000 Rate/s % c 86.8 100 c_better 92.2 106 c2f 104.5 120 f 115.6 133 c_omp 349.6 403 f_omp 426.9 492 String length: 100,000,000 Rate/s % c 8.6 100 c_better 9.2 106 c2f 10.4 120 f 11.4 131 c_omp 34.7 401 f_omp 42.4 491 #### int mscore_c_omp( SV* str ) { STRLEN len; char* buf = SvPVbyte( str, len ); len --; int nparts = omp_get_num_procs(); if ( nparts > 4 ) nparts = 4; int psize = len / nparts; int acc_a[ nparts ]; int cum_a[ nparts ]; int max_a[ nparts ]; #pragma omp parallel for schedule( static, 1 ) \ num_threads( nparts ) for ( int j = 0; j < nparts; j ++ ) { int lo = j * psize; int hi = ( j == nparts - 1 ) ? len : lo + psize; int acc = 0; int cum = 0; int max = -1; for ( int i = lo; i < hi; i ++ ) { int one = buf[ i ] - '0'; acc += one; cum += one ? -1 : 1; if ( cum > max ) max = cum; } acc_a[ j ] = acc; cum_a[ j ] = cum; max_a[ j ] = max; } int acc = acc_a[ 0 ]; int max = max_a[ 0 ]; for ( int j = 1; j < nparts; j ++ ) { acc += acc_a[ j ]; cum_a[ j ] += cum_a[ j - 1 ]; max_a[ j ] += cum_a[ j - 1 ]; if ( max_a[ j ] > max ) max = max_a[ j ]; } return acc + max + ( buf[ len ] == '1' ); } #### subroutine f_omp( n, s, ret ) integer, intent( in ) :: n integer * 1, intent( in ) :: s( n ) integer, intent( out ) :: ret integer :: j, nparts, psize integer, allocatable :: acc_a ( : ), cum_a( : ), min_a( : ) nparts = omp_get_num_procs() if ( nparts > 4 ) nparts = 4 psize = n / nparts; allocate( acc_a( nparts )) allocate( cum_a( nparts )) allocate( min_a( nparts )) !$omp parallel do schedule( static, 1 ) num_threads( nparts ) do j = 1, nparts ; block integer * 1, allocatable :: a( : ) integer, allocatable :: cum( : ) integer :: lo, hi, d, nchunks integer :: pre, L, i, k integer :: acc_, cum_, min_ lo = ( j - 1 ) * psize if ( j == nparts ) then hi = n - 1 else hi = j * psize end if d = hi - lo nchunks = d / CHUNK if ( mod( d, CHUNK ) .ne. 0 ) nchunks = nchunks + 1 acc_ = 0; cum_ = 0; min_ = 2; allocate( a( CHUNK )) allocate( cum( CHUNK )) do k = 1, nchunks pre = lo + ( k - 1 ) * CHUNK L = min( CHUNK, hi - pre ) a( 1 : L ) = 2 * s( pre + 1 : pre + L ) - A97 cum( 1 ) = cum_ + a( 1 ) do i = 2, L cum( i ) = cum( i - 1 ) + a( i ) end do acc_ = acc_ + count( s( pre + 1 : pre + L ) & == iachar( '1', 1 )) cum_ = cum( L ) min_ = min( min_, minval( cum( 1 : L ))) end do acc_a( j ) = acc_ cum_a( j ) = cum_ min_a( j ) = min_ end block ; end do !$omp end parallel do do j = 2, nparts cum_a( j ) = cum_a( j ) + cum_a( j - 1 ) min_a( j ) = min_a( j ) + cum_a( j - 1 ) end do ret = sum( acc_a ) + ( s( n ) - iachar( '0', 1 )) & - minval( min_a ) end subroutine f_omp