use strict;
use warnings;
use feature 'say';
use Config;
use Benchmark qw/ timeit :hireswallclock /;

use lib '.';
use Max_score_subs ':all';

say "$^V / $Config{ archname } / $Config{ gccversion }";

my $str;
my %tests = (
                                                #   C:
    c        => sub { mscore_c( $str )},        # - dumb
    c_better => sub { mscore_c_better( $str )}, # - tweaked
    c_omp    => sub { mscore_c_omp( $str )},    # - "c", parallel
                                                #
                                                #   Fortran:
    c2f      => sub { mscore_c2f( $str )},      # - "c_better" (in F)
    f        => sub { mscore_f( $str )},        # - "PDL" (in F)
    f_omp    => sub { mscore_f_omp( $str )},    # - "f", parallel
);

my $iters = 2e5;
my $fmt = '%8.0f';

for my $L ( 1e4, 1e5, 1e6, 1e7, 1e8 ) {
    say "\nString length: " . $L =~ s/(\d)(?=(\d{3})+$)/$1,/gr;
    
    $str = '1' x $L;
    substr $str, rand $L, 1 , '0' for 1 .. $L;

    my $any;
    my %ret;
    for my $key ( keys %tests ) {
        my $result = $tests{ $key }-> ();
        die "<<< $key!!! >>>" unless $result == ( $any //= $result );
        
        my $t = timeit( $iters, $tests{ $key });
        $ret{ $key } = $t-> iters / $t-> real
    }
    
    print "             Rate/s    %\n";
    $fmt = '%8.1f' if $L > 1e6;
    for my $key ( sort { $ret{ $a } <=> $ret{ $b }} keys %ret ) {
        printf " %-9s $fmt %5.0f\n", 
            $key, $ret{ $key }, 100 * $ret{ $key } / $ret{ c }
    }
    $iters /= 10
}

__END__

v5.42.0 / MSWin32-x64-multi-thread / 13.2.0

String length: 10,000
             Rate/s    %
 f_omp        23804    28
 c_omp        25362    29
 c            86100   100
 c_better     91470   106
 c2f         105239   122
 f           120134   140

String length: 100,000
             Rate/s    %
 c             8724   100
 c_better      9324   107
 c2f          10686   122
 f            12139   139
 c_omp        15369   176
 f_omp        15964   183

String length: 1,000,000
             Rate/s    %
 c              875   100
 c_better       931   106
 c2f           1068   122
 f             1213   139
 c_omp         2306   264
 f_omp         2560   293

String length: 10,000,000
             Rate/s    %
 c             86.8   100
 c_better      92.2   106
 c2f          104.5   120
 f            115.6   133
 c_omp        349.6   403
 f_omp        426.9   492

String length: 100,000,000
             Rate/s    %
 c              8.6   100
 c_better       9.2   106
 c2f           10.4   120
 f             11.4   131
 c_omp         34.7   401
 f_omp         42.4   491


##</code><code>##

int mscore_c_omp( SV* str ) {
    STRLEN len;
    char* buf = SvPVbyte( str, len );
    len --;
    
    int nparts = omp_get_num_procs();
    if ( nparts > 4 ) nparts = 4;
    int psize = len / nparts;
    
    int acc_a[ nparts ];
    int cum_a[ nparts ];
    int max_a[ nparts ];
    
    #pragma omp parallel for schedule( static, 1 ) \
                             num_threads( nparts )
    for ( int j = 0; j < nparts; j ++ ) {
        int lo = j * psize;
        int hi = ( j == nparts - 1 ) ? len : lo + psize;
        
        int acc = 0;
        int cum = 0;
        int max = -1;
        
        for ( int i = lo; i < hi; i ++ ) {
            int one = buf[ i ] - '0';
            acc += one;
            cum += one ? -1 : 1;
            if ( cum > max ) max = cum;
        }
        acc_a[ j ] = acc;
        cum_a[ j ] = cum;
        max_a[ j ] = max;
    }
    
    int acc = acc_a[ 0 ]; 
    int max = max_a[ 0 ];
    
    for ( int j = 1; j < nparts; j ++ ) {
        acc += acc_a[ j ];
        
        cum_a[ j ] += cum_a[ j - 1 ];
        max_a[ j ] += cum_a[ j - 1 ];
        
        if ( max_a[ j ] > max ) max = max_a[ j ];
    }
    
    return acc + max + ( buf[ len ] == '1' );
}


##</code><code>##

subroutine f_omp( n, s, ret )
    integer,     intent( in )  :: n
    integer * 1, intent( in )  :: s( n )
    integer,     intent( out ) :: ret
    
    integer              :: j, nparts, psize
    integer, allocatable :: acc_a ( : ), cum_a( : ), min_a( : )
    
    nparts = omp_get_num_procs()
    if ( nparts > 4 ) nparts = 4
    psize = n / nparts;
    
    allocate( acc_a( nparts ))
    allocate( cum_a( nparts ))
    allocate( min_a( nparts ))
    
    !$omp parallel do schedule( static, 1 ) num_threads( nparts )
    do j = 1, nparts ; block
        integer * 1, allocatable :: a( : )
        integer, allocatable     :: cum( : )
        
        integer :: lo, hi, d, nchunks
        integer :: pre, L, i, k
        integer :: acc_, cum_, min_
        
        lo = ( j - 1 ) * psize
        if ( j == nparts ) then
            hi = n - 1
        else
            hi = j * psize
        end if
        d = hi - lo
        nchunks = d / CHUNK
        if ( mod( d, CHUNK ) .ne. 0 ) nchunks = nchunks + 1
        
        acc_ = 0;
        cum_ = 0;
        min_ = 2;
        
        allocate( a( CHUNK ))
        allocate( cum( CHUNK ))
        
        do k = 1, nchunks
            pre = lo + ( k - 1 ) * CHUNK
            L = min( CHUNK, hi - pre )
            
            a( 1 : L ) = 2 * s( pre + 1 : pre + L ) - A97
            cum( 1 ) = cum_ + a( 1 )
            do i = 2, L
                cum( i ) = cum( i - 1 ) + a( i )
            end do
            
            acc_ = acc_ + count( s( pre + 1 : pre + L ) &
                                == iachar( '1', 1 ))
            cum_ = cum( L )
            min_ = min( min_, minval( cum( 1 : L )))
        end do
        
        acc_a( j ) = acc_
        cum_a( j ) = cum_
        min_a( j ) = min_
        
    end block ; end do
    !$omp end parallel do
    
    do j = 2, nparts
        cum_a( j ) = cum_a( j ) + cum_a( j - 1 )
        min_a( j ) = min_a( j ) + cum_a( j - 1 )
    end do
    
    ret = sum( acc_a ) + ( s( n ) - iachar( '0', 1 )) &
          - minval( min_a )
end subroutine f_omp