Compile and possibly run cuda code on the GPU via Perl's Inline::C

Here is my attempt to run Nvidia's cuda code on the GPU via Perl. The motivation is from question Perl GPGPU Modules by kcott (interesting problem, Ken!). The main tool is Inline::C which opens so many doors.

CUDA is a programming language built on top of (or extending) C which deals with GPGPU. General-purpose computing on graphics processing units (GPGPU) tries to use graphics cards (GPUs) and their highly parallel architecture to run tasks, like (large) matrix multiplication, which a CPU, because of its architecture, runs much more slowly and less efficiently. The GPU is designed for matrix multiplications, and that is what it does, frame after frame of what we see on our monitor, without breaking a sweat. Matrix multiplication is the basis for a lot of numerical applications and can make social planning much easier.

First the problems:

Nvidia provides its own compiler for cuda code: nvcc which compiles only cuda-specific code and the rest is delegated to the system compiler (e.g. gcc). So, as I understand it, both nvcc and gcc will be used to compile a cuda program which does have cuda extensions, i.e. it is not just "plain" C code.
Nvidia is very picky about the version of the system compiler. It usually only supports older compilers, which must live in your system alongside the system/current compiler. That's a bit of a kerfuffle. For example, nvcc 11.4 supports up to gcc10, whereas my system compiler is at 11.1. My Linux system does not support installing other compilers via the package manager, or at least I did not find out how. Instead I resorted to building an older version from source with its own --prefix (e.g. gcc84) and using that for each nvcc call via nvcc --compiler-bindir /usr/local/gcc84/bin/gcc84. Linking using nvcc requires the same treatment. See this for how to compile and install a second compiler in Linux with its own name-prefix-postfix.
nvcc does not take all the flags and parameters gcc takes. Instead, any flag to be passed on to the system compiler must be preceded by -Xcompiler
nvcc needs its input files to have the extension .cu

Here is the general setup:

Use Inline::C, which is a great and powerful module (thanks!), with a specific compiler and linker by providing it (via use Inline C => Config => cc => '...', ld => '...') with two Perl scripts, namely nvcc-compile.pl and nvcc-link.pl. These will remove some incompatible compile/link flags which Inline::C and ExtUtils::MakeMaker use for compiling plain C code. They will also prefix others with -Xcompiler ... to pass them on to the system compiler. The first script will also copy one of the temporary files produced in the _Inline/build/ directory so that its extension is .cu and not .c. Then the compiler and linker scripts proceed to run the actual nvcc command appropriate for compiling or linking. These scripts just worked for me but will probably need tweaking for other compilers and other flags. At least a general setup is in place.

The two scripts to be provided to Inline::C as compiler and linker are given at the end. Edit: Save them at the same location as the demo script below without changing their names.

Here is a basic Perl script running a cuda program on the GPU:

#!/usr/bin/perl

# by bliako @ PerlMonks.org
# date: 01-Jul-2021
# see https://perlmonks.org/?node_id=11134582
# lame example for utilising GPGPU via Inline::C
# TODO: extend to taking params and returning back results

use strict;
use warnings;

use FindBin;

use Inline C => Config =>
    cc => $FindBin::Bin.'/nvcc-compile.pl',
    ld => $FindBin::Bin.'/nvcc-link.pl',
;

use Inline C => <<'EOC';
// from https://developer.nvidia.com/blog/easy-introduction-cuda-c-and-c/
#include <stdio.h>

// SAXPY kernel (single precision): y[i] = a*x[i] + y[i].
// Each thread derives one element index from its block and thread
// coordinates along x and updates that single element of y in place.
// Threads whose index is not below n return without touching memory,
// so the launch may cover more threads than there are elements.
__global__
void saxpy(int n, float a, float *x, float *y)
{
  const int idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (idx >= n) return;
  y[idx] = a*x[idx] + y[idx];
}

// Host-side driver for the SAXPY demo.
// Fills x with 1.0 and y with 2.0 (1<<20 elements each), copies both to
// the GPU, runs saxpy with a = 2.0 so every y[i] should become 4.0, copies
// y back and prints the largest absolute deviation from 4.0.
// Returns 0 on success and 1 if a host allocation, a CUDA runtime call or
// the kernel launch fails (a diagnostic is printed to stderr in that case).
// All buffers are released on every path.
int main()
{
  const int N = 1<<20;
  const size_t nbytes = (size_t)N * sizeof(float);
  float *x = NULL, *y = NULL, *d_x = NULL, *d_y = NULL;
  float maxError = 0.0f;
  cudaError_t err = cudaSuccess;
  const char *step = NULL; // name of the CUDA step being attempted
  int status = 1;          // pessimistic until everything has succeeded

  x = (float*)malloc(nbytes);
  y = (float*)malloc(nbytes);
  if (x == NULL || y == NULL) {
    fprintf(stderr, "main() : error, failed to allocate host memory.\n");
    goto cleanup;
  }

  step = "cudaMalloc(d_x)";
  if ((err = cudaMalloc(&d_x, nbytes)) != cudaSuccess) goto cuda_failed;
  step = "cudaMalloc(d_y)";
  if ((err = cudaMalloc(&d_y, nbytes)) != cudaSuccess) goto cuda_failed;

  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  step = "cudaMemcpy(x to device)";
  if ((err = cudaMemcpy(d_x, x, nbytes, cudaMemcpyHostToDevice)) != cudaSuccess) goto cuda_failed;
  step = "cudaMemcpy(y to device)";
  if ((err = cudaMemcpy(d_y, y, nbytes, cudaMemcpyHostToDevice)) != cudaSuccess) goto cuda_failed;

  // Perform SAXPY on 1M elements
  saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
  // a kernel launch returns nothing; a bad launch configuration only
  // shows up through cudaGetLastError()
  step = "saxpy kernel launch";
  if ((err = cudaGetLastError()) != cudaSuccess) goto cuda_failed;

  // this blocking copy also reports errors raised while the kernel ran
  step = "cudaMemcpy(y to host)";
  if ((err = cudaMemcpy(y, d_y, nbytes, cudaMemcpyDeviceToHost)) != cudaSuccess) goto cuda_failed;

  // plain comparisons instead of max()/abs() so the result does not depend
  // on which overloads the included headers happen to provide
  for (int i = 0; i < N; i++) {
    float diff = y[i] - 4.0f;
    if (diff < 0.0f) diff = -diff;
    if (diff > maxError) maxError = diff;
  }
  printf("Max error: %f\n", maxError);

  status = 0;
  goto cleanup;

cuda_failed:
  fprintf(stderr, "main() : error, %s failed: %s\n", step, cudaGetErrorString(err));

cleanup:
  // both cudaFree() and free() accept NULL, so partial setups are fine
  cudaFree(d_x);
  cudaFree(d_y);
  free(x);
  free(y);
  return status;
}
EOC
# call the main() function defined in the Inline C section above
main;
[download]

nvcc-compile.pl

#!/usr/bin/perl

# nvcc-compile.pl
# by bliako @ PerlMonks.org
# date: 01-Jul-2021
# see https://perlmonks.org/?node_id=11134582
# tools for running cuda code on the GPU via Perl and Inline::C
# script to be provided to Inline::C as its 'cc' parameter, like
#  use Inline C => Config =>
#    cc => $FindBin::Bin.'/nvcc-compile.pl',
#    ld => $FindBin::Bin.'/nvcc-link.pl',
#;
# Below, set $EXE and $CC to point to nvcc and gcc commands
# Note that nvcc requires specific gcc versions ONLY

# WARNING: if you make changes here, it is unlikely that Inline::C
# will notice. It's better to delete the temp _Inline directory and start afresh

use strict;
use warnings;

use Cwd;

# set to a true value to print a trace of how each argument is processed
my $verbose = 0;
# full path to the nvcc executable run by this wrapper
my $EXE = '/usr/local/cuda/bin/nvcc';
# full path to the host compiler (gcc) meant to be used together with nvcc
my $CC = '/usr/local/gcc84/bin/gcc84';

#########################
# nothing to change below
#########################
# working directory at start-up
# NOTE(review): not read anywhere else in this script
my $PWD = Cwd::cwd;

# Patterns for options that are deleted from each incoming argument before
# the command line is handed to nvcc. Every pattern ends with a lookahead
# for whitespace or end of string, so an option only matches as a whole
# word and never as the prefix of a longer option.
my @remove = (
    qr/\-Werror=format\-security(?=\s|$)/,
    qr/\-m64(?=\s|$)/,
    qr/\-mtune=generic(?=\s|$)/,
    qr/\-iquote[^ ]*(?=\s|$)/,
    qr/\-grecord\-gcc\-switches(?=\s|$)/,
    qr/\-pipe(?=\s|$)/,
    qr/\-Wall(?=\s|$)/,
    qr/\-Wp,\-D_FORTIFY_SOURCE=[0-9]+(?=\s|$)/,
    qr/\-Wp,\-D_GLIBCXX_ASSERTIONS(?=\s|$)/,
    qr/\-specs=[^ ]+(?=\s|$)/,
    qr/\-DVERSION=[^ ]+(?=\s|$)/,
    qr/\-DXS_VERSION=[^ ]+(?=\s|$)/,
);

# Patterns for options that are kept but rewritten: the text captured by
# the single group is re-emitted behind "-Xcompiler" so that nvcc forwards
# it to the host compiler. The trailing lookahead makes each pattern match
# whole options only.
my @replace_compiler_options = (
    qr/(\-flto=auto)(?=\s|$)/,
    qr/(\-ffat\-lto\-objects)(?=\s|$)/,
    qr/(\-fexceptions)(?=\s|$)/,
    qr/(\-fstack\-protector\-strong)(?=\s|$)/,
    qr/(\-fasynchronous\-unwind\-tables)(?=\s|$)/,
    qr/(\-fstack\-clash\-protection)(?=\s|$)/,
    qr/(\-fcf\-protection)(?=\s|$)/,
    qr/(\-fwrapv)(?=\s|$)/,
    qr/(\-fPIC)(?=\s|$)/,
    qr/(\-fno\-strict\-aliasing)(?=\s|$)/,
    qr/(\-Wl,\-\-as-needed)(?=\s|$)/,
    qr/(\-Wl,\-z,now)(?=\s|$)/,
);

# Build @newarg, the argument list for nvcc, from this script's @ARGV:
#  * options matching a pattern in @remove are deleted,
#  * options matching a pattern in @replace_compiler_options are rewritten
#    as -Xcompiler \"<option>\" so nvcc passes them to the host compiler,
#  * arguments left empty (or whitespace only) after that are dropped.
# Each argument is edited on a private copy so that @ARGV stays intact.
my @newarg;
for my $original (@ARGV){
    my $anarg = $original;
    print "processing '$anarg'\n" if $verbose;
    for my $q (@remove){
        if( $anarg =~ s/$q//g ){
            print "removing $q...\n" if $verbose;
        }
    }
    for my $q (@replace_compiler_options){
        if( $anarg =~ s/$q/-Xcompiler \\"$1\\"/g ){
            print "replacing $q...\n" if $verbose;
        }
    }
    push @newarg, $anarg if $anarg !~ /^\s*$/;
}

# hack to change the file extension from .c to .cu
# nvcc needs its input to end in .cu, so the source file is copied under a
# new name and the copy is what gets compiled.
# assumes that the file to compile is the last in @ARGV(!!!)
die "$0 : error, no arguments left to pass on to nvcc" unless @newarg;
my $cfile = $newarg[-1];
my $cufile = $cfile;
# the dot is escaped so that only a real ".c" suffix is rewritten
# (an unescaped dot also matches e.g. a name ending in "xc")
if( $cufile =~ s/\.c$/.cu/ ){
    # list form of system(): the file names never go through a shell, so
    # quotes or spaces in the build path cannot break the copy
    die "failed" if mysystem('cp', $cfile, $cufile);
    $newarg[-1] = $cufile;
}

# use the configured host compiler ($CC) rather than a second hard-coded path
my $cmdstr = $EXE." --compiler-bindir ${CC} ".join(" ", @newarg);
print "$0 : executing:\n$cmdstr\n";
die "failed" if mysystem($cmdstr);

#system($EXE, @newarg);

# Runs an external command with system() and reports how it went.
# Arguments: passed to system() unchanged, so either one command string
#            (possibly run via the shell) or a program followed by its
#            arguments.
# Returns:   0 if the command ran and exited with code 0; 1 if it could
#            not be started, was killed by a signal, or exited with a
#            non-zero code. The reason is printed to STDERR on failure and
#            a short confirmation to STDOUT on success.
sub mysystem {
    my @args = @_;
    system(@args);

    # $? == -1 : the command could not be started at all
    if ($? == -1) {
        print STDERR "failed to execute: $!\n";
        return 1;
    }
    # low 7 bits: terminating signal, bit 128: core dump flag
    if ($? & 127) {
        printf STDERR "child died with signal %d, %s coredump\n",
            ($? & 127),  ($? & 128) ? 'with' : 'without';
        return 1;
    }

    # high byte: the exit code of the command
    my $ex = $? >> 8;
    if( $ex ){
        print STDERR "error, system command failed with exit code $ex\n";
        return 1;
    }
    print "success, system command executed.\n";
    return 0;
}
[download]

nvcc-link.pl

#!/usr/bin/perl

# nvcc-link.pl
# by bliako @ PerlMonks.org
# date: 01-Jul-2021
# see https://perlmonks.org/?node_id=11134582
# tools for running cuda code on the GPU via Perl and Inline::C
# script to be provided to Inline::C as its 'ld' parameter, like
#  use Inline C => Config =>
#       cc => $FindBin::Bin.'/nvcc-compile.pl',
#       ld => $FindBin::Bin.'/nvcc-link.pl',
#;
# Below, set $EXE and $CC to point to nvcc and gcc commands
# Note that nvcc requires specific gcc versions ONLY 

# WARNING: if you make changes here, it is unlikely that Inline::C
# will notice. It's better to delete the temp _Inline directory and start afresh

use strict;
use warnings;

use Cwd;

# set to a true value to print a trace of how each argument is processed
my $verbose = 0;
# full path to the nvcc executable run by this wrapper
my $EXE = '/usr/local/cuda/bin/nvcc';
# full path to the host compiler, handed to nvcc via --compiler-bindir
my $CC = '/usr/local/gcc84/bin/gcc84';

###########################
# nothing to change below
###########################

# working directory at start-up
# NOTE(review): not read anywhere else in this script
my $PWD = Cwd::cwd;

# Patterns for options that are deleted from each incoming argument before
# the command line is handed to nvcc. Every pattern ends with a lookahead
# for whitespace or end of string, so an option only matches as a whole
# word and never as the prefix of a longer option.
my @remove = (
    qr/\-Werror=format\-security(?=\s|$)/,
    qr/\-m64(?=\s|$)/,
    qr/\-mtune=generic(?=\s|$)/,
    qr/\-iquote[^ ]*(?=\s|$)/,
    qr/\-grecord\-gcc\-switches(?=\s|$)/,
    qr/\-pipe(?=\s|$)/,
    qr/\-Wall(?=\s|$)/,
    qr/\-Wp,\-D_FORTIFY_SOURCE=[0-9]+(?=\s|$)/,
    qr/\-Wp,\-D_GLIBCXX_ASSERTIONS(?=\s|$)/,
    qr/\-specs=[^ ]+(?=\s|$)/,
    qr/\-DVERSION=[^ ]+(?=\s|$)/,
    qr/\-DXS_VERSION=[^ ]+(?=\s|$)/,
);

# Patterns for options that are kept but rewritten: the text captured by
# the single group is re-emitted behind "-Xcompiler" so that nvcc forwards
# it to the host compiler. The trailing lookahead makes each pattern match
# whole options only.
my @replace_compiler_options = (
    qr/(\-flto=auto)(?=\s|$)/,
    qr/(\-ffat\-lto\-objects)(?=\s|$)/,
    qr/(\-fexceptions)(?=\s|$)/,
    qr/(\-fstack\-protector\-strong)(?=\s|$)/,
    qr/(\-fasynchronous\-unwind\-tables)(?=\s|$)/,
    qr/(\-fstack\-clash\-protection)(?=\s|$)/,
    qr/(\-fcf\-protection)(?=\s|$)/,
    qr/(\-fwrapv)(?=\s|$)/,
    qr/(\-fPIC)(?=\s|$)/,
    qr/(\-fno\-strict\-aliasing)(?=\s|$)/,
    qr/(\-Wl,\-z,relro)(?=\s|$)/,
    qr/(\-Wl,\-\-as-needed)(?=\s|$)/,
    qr/(\-Wl,\-z,now)(?=\s|$)/,
);

# Build @newarg, the argument list for nvcc, from this script's @ARGV:
#  * options matching a pattern in @remove are deleted,
#  * options matching a pattern in @replace_compiler_options are rewritten
#    as -Xcompiler \"<option>\" so nvcc passes them to the host compiler,
#  * arguments left empty (or whitespace only) after that are dropped.
# Each argument is edited on a private copy so that @ARGV stays intact.
my @newarg;
for my $original (@ARGV){
    my $anarg = $original;
    print "processing '$anarg'\n" if $verbose;
    for my $q (@remove){
        if( $anarg =~ s/$q//g ){
            print "removing $q...\n" if $verbose;
        }
    }
    for my $q (@replace_compiler_options){
        if( $anarg =~ s/$q/-Xcompiler \\"$1\\"/g ){
            print "replacing $q...\n" if $verbose;
        }
    }
    push @newarg, $anarg if $anarg !~ /^\s*$/;
}

# run nvcc as the linker, telling it which host compiler to drive
my $cmdstr = $EXE." --compiler-bindir ${CC} ".join(" ", @newarg);
print "$0 : executing:\n$cmdstr\n";
die "failed" if mysystem($cmdstr);

# Runs an external command with system() and reports how it went.
# Arguments: passed to system() unchanged, so either one command string
#            (possibly run via the shell) or a program followed by its
#            arguments.
# Returns:   0 if the command ran and exited with code 0; 1 if it could
#            not be started, was killed by a signal, or exited with a
#            non-zero code. The reason is printed to STDERR on failure and
#            a short confirmation to STDOUT on success.
sub mysystem {
    my @args = @_;
    system(@args);

    # $? == -1 : the command could not be started at all
    if ($? == -1) {
        print STDERR "failed to execute: $!\n";
        return 1;
    }
    # low 7 bits: terminating signal, bit 128: core dump flag
    if ($? & 127) {
        printf STDERR "child died with signal %d, %s coredump\n",
            ($? & 127),  ($? & 128) ? 'with' : 'without';
        return 1;
    }

    # high byte: the exit code of the command
    my $ex = $? >> 8;
    if( $ex ){
        print STDERR "error, system command failed with exit code $ex\n";
        return 1;
    }
    print "success, system command executed.\n";
    return 0;
}
[download]

At the moment, I have not implemented communicating parameters to and from the inlined cuda code. Feel free to extend.

Suggestions: Inline::C can be modified in order to avoid my ugly hacks, or a new Inline::Cuda can be built.

These are interesting times. This is a small step in making them fun-ner and lazy-er too. A big Thank You to the author of Inline::C and Nvidia.

Tested on Linux with (older) gcc version 8.4, Nvidia's Cuda compilation tools version 11.4.48, Nvidia graphics driver 470.42.01, Perl version 5.32.1, Inline::C version 0.81

Edits: main demo script added a return 0; at the end of main()

bw, bliako

Comment on Compile and possibly run cuda code on the GPU via Perl's Inline::C Select or Download Code

Replies are listed 'Best First'.

Re: Compile and run cuda code on the GPU via Perl's Inline::C - passing parameters
by bliako (Monsignor) on Jul 03, 2021 at 05:58 UTC

The following code demonstrates how to pass parameters to the function (do_saxpy()) living in Inline::C and how to get back the results. Both as arrayrefs. (see also: Inline::C::Cookbook)

Edit: caveat: return NULL does not translate back to undef in perl-space. Is there an AV-equivalent for &PL_sv_undef?

#!/usr/bin/perl

# by bliako @ PerlMonks.org
# date: 01-Jul-2021
# see https://perlmonks.org/?node_id=11134582
# lame example for utilising GPGPU via Inline::C
# this version takes parameters (as arrayrefs) and returns back results

use strict;
use warnings;

use FindBin;

use Inline C => Config =>
    cc => $FindBin::Bin.'/nvcc-compile.pl',
    ld => $FindBin::Bin.'/nvcc-link.pl',
;

use Inline C => <<'EOC';
// from https://developer.nvidia.com/blog/easy-introduction-cuda-c-and-c/
// NOTE: don't use main(void), use main()!!!
#include <stdio.h>

AV *do_saxpy(int N, SV *_x, SV *_y);
int array_numelts(SV *array);

// Kernel computing y[i] = a*x[i] + y[i] on doubles (the name saxpy is kept
// from the single-precision example this was derived from).
// Each thread derives one element index from its block and thread
// coordinates along x and updates that single element of y in place.
// Threads whose index is not below n return without touching memory.
__global__
void saxpy(int n, double a, double *x, double *y)
{
  const int idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (idx >= n) return;
  y[idx] = a*x[idx] + y[idx];
}

/* Checks that `array` is a reference to a Perl array and returns what
   av_len() reports for it.
   Returns -1 if `array` is not a reference, if it refers to something
   other than an array, or if av_len() is negative.
   NOTE(review): per perlapi av_len() is the highest index (element count
   minus one), so despite the name the value returned here would be one
   less than the number of elements, and an empty array would yield -1
   - confirm before relying on it as a count. */
int array_numelts(SV *array){
    if( !SvROK(array) ) return -1;

    SV *referent = SvRV(array);
    if( SvTYPE(referent) != SVt_PVAV ) return -1;

    int last = av_len((AV *)referent);
    if( last < 0 ) return -1;
    return last;
}

/* Computes 2.0*x[i] + y[i] on the GPU for the first N elements of the two
   Perl arrays referenced by _x and _y.

   Parameters:
     N  : number of elements to process, must be positive
     _x : reference to a Perl array holding at least N numbers
     _y : reference to a Perl array holding at least N numbers

   returns an arrayref of results (N numbers), the inputs are not modified.

   On any error (bad arguments, missing elements, failed allocation, failed
   CUDA call) all buffers are released and a Perl exception is raised with
   croak(). Returning NULL is not an option here: the return value of an
   AV* function is wrapped in a reference, it is never turned into undef.

   Also prints each input pair, and the largest absolute difference between
   the GPU results and the same formula evaluated on the CPU. */
AV* do_saxpy(
    int N,
    SV *_x,
    SV *_y
)
{
  const double a = 2.0;
  double *x = NULL, *y = NULL, *d_x = NULL, *d_y = NULL;
  double maxError = 0.0, diff;
  int nX, nY, i;
  size_t nbytes;
  cudaError_t err = cudaSuccess;
  const char *step = NULL;  /* name of the CUDA step being attempted */
  char errmsg[256];         /* non-empty once something has gone wrong */
  AV *deref_x, *deref_y;
  SV **elem_x, **elem_y;

  AV *ret = newAV();
  sv_2mortal((SV*)ret);

  errmsg[0] = '\0';

  /* nothing has been allocated yet, so it is safe to croak right away */
  if( N <= 0 ){ croak("do_saxpy() : error, N must be positive."); }

  if( ((nX=array_numelts(_x))<0)
    ||((nY=array_numelts(_y))<0)
  ){ croak("do_saxpy() : error, x and y must be references to non-empty arrays."); }

  /* array_numelts() yields the highest index, hence the N-1 */
  if( (nX < N-1) || (nY < N-1) ){
    croak("do_saxpy() : error, x and y must each hold at least N=%d elements.", N);
  }

  nbytes = (size_t)N * sizeof(double);
  x = (double*)malloc(nbytes);
  y = (double*)malloc(nbytes);
  if( (x == NULL) || (y == NULL) ){
    snprintf(errmsg, sizeof(errmsg), "do_saxpy() : error, failed to allocate host memory.");
    goto cleanup;
  }

  step = "cudaMalloc(d_x)";
  if( (err = cudaMalloc(&d_x, nbytes)) != cudaSuccess ) goto cuda_failed;
  step = "cudaMalloc(d_y)";
  if( (err = cudaMalloc(&d_y, nbytes)) != cudaSuccess ) goto cuda_failed;

  deref_x = (AV *)SvRV(_x);
  deref_y = (AV *)SvRV(_y);
  for(i=0;i<N;i++){
    elem_x = av_fetch(deref_x, i, 0);
    elem_y = av_fetch(deref_y, i, 0);
    /* av_fetch() gives NULL for a slot that does not exist */
    if( (elem_x == NULL) || (elem_y == NULL) ){
      snprintf(errmsg, sizeof(errmsg), "do_saxpy() : error, missing element at index %d.", i);
      goto cleanup;
    }
    x[i] = SvNV(*elem_x);
    y[i] = SvNV(*elem_y);
    printf("do_saxpy() : got in x[%d]=%lf and y[%d]=%lf\n", i, x[i], i, y[i]);
  }

  step = "cudaMemcpy(x to device)";
  if( (err = cudaMemcpy(d_x, x, nbytes, cudaMemcpyHostToDevice)) != cudaSuccess ) goto cuda_failed;
  step = "cudaMemcpy(y to device)";
  if( (err = cudaMemcpy(d_y, y, nbytes, cudaMemcpyHostToDevice)) != cudaSuccess ) goto cuda_failed;

  /* the host copy of x is no longer needed as input: reuse it to hold the
     CPU reference result the GPU output is compared against below */
  for(i=0;i<N;i++){
    x[i] = a*x[i] + y[i];
  }

  // Perform a*x + y on N elements
  saxpy<<<(N+255)/256, 256>>>(N, a, d_x, d_y);
  /* a launch returns nothing; a bad launch configuration only shows up
     through cudaGetLastError() */
  step = "saxpy kernel launch";
  if( (err = cudaGetLastError()) != cudaSuccess ) goto cuda_failed;

  // this copies data from GPU (dy) onto CPU memory, we use y because
  // it's just sitting there and no longer needed
  // (the blocking copy also reports errors raised while the kernel ran)
  step = "cudaMemcpy(y to host)";
  if( (err = cudaMemcpy(y, d_y, nbytes, cudaMemcpyDeviceToHost)) != cudaSuccess ) goto cuda_failed;

  /* hand the results back as the elements of the returned array */
  for(i=0;i<N;i++){
    av_push(ret, newSVnv(y[i]));
  }

  /* largest deviation of the GPU result from the CPU reference */
  for(i=0;i<N;i++){
    diff = y[i] - x[i];
    if( diff < 0.0 ) diff = -diff;
    if( diff > maxError ) maxError = diff;
  }
  printf("do_saxpy() : Max error: %f\n", maxError);

  goto cleanup;

cuda_failed:
  snprintf(errmsg, sizeof(errmsg), "do_saxpy() : error, %s failed: %s", step, cudaGetErrorString(err));

cleanup:
  /* both cudaFree() and free() accept NULL, so partial setups are fine */
  cudaFree(d_x);
  cudaFree(d_y);
  free(x);
  free(y);

  /* croak() does not return, so it must come after the buffers are freed */
  if( errmsg[0] != '\0' ){ croak("%s", errmsg); }

  return ret;
}
EOC

# number of elements in each input vector
my $N = 100; #1<<20;

# two input vectors of $N random numbers in [0,1)
my @x = map { rand() } 1..$N;
my @y = map { rand() } 1..$N;

# run the computation; the results come back as an arrayref
my $result = do_saxpy($N, \@x, \@y);
if( ! defined $result ){
    print STDERR "$0 : error, call to do_saxpy() has failed.\n";
    exit(1);
}
# print, not printf: $0 is interpolated into the string and a '%' in the
# script's path must not be taken as a format directive
print "$0 : back to perl-code ...\n";
print "$0 : (perl-code) : got back result :\n".join("\n", @$result)."\n";
[download]

bw, bliako

[reply]
[d/l]
[select]


Just another Perl shrine
	PerlMonks