Hello World

Setting the following environment variables is highly recommended to make life easier.

$ source <install_path>/setup.source # default install_path would be $HOME/.iris

The “Hello World” program is the first step towards learning IRIS. This program displays the message “HELLO WORLD” on the screen.

$ cd iris/apps/helloworld
$ make
$ ./helloworld
HELLO WORLD
$

Host Code

C

#include <iris/iris.h>
#include <stdio.h>

char a[12] = "hello world";
char b[12];
size_t size = 12;

int main(int argc, char** argv) {
  iris_init(&argc, &argv, 1);

  iris_mem mem_a;
  iris_mem mem_b;
  iris_mem_create(size, &mem_a);
  iris_mem_create(size, &mem_b);

  iris_task task;
  iris_task_create(&task);
  iris_task_h2d(task, mem_a, 0, size, a);
  void* params[2] = { &mem_b, &mem_a };
  int params_info[2] = { iris_w, iris_r };
  iris_task_kernel(task, "uppercase", 1, NULL, &size, NULL, 2, params, params_info);
  iris_task_d2h(task, mem_b, 0, size, b);
  iris_task_submit(task, iris_roundrobin, NULL, 1);

  printf("%s\n", b);

  iris_task_release(task);
  iris_mem_release(mem_a);
  iris_mem_release(mem_b);

  iris_finalize();

  return 0;
}

C++

#include <iris/iris.hpp>
#include <stdio.h>

char a[12] = "hello world";
char b[12];
size_t size = 12;

int main(int argc, char** argv) {
  iris::Platform platform;
  platform.init(&argc, &argv, true);

  iris::Mem mem_a(size);
  iris::Mem mem_b(size);

  iris::Task task;
  task.h2d(&mem_a, 0, size, a);
  void* params[2] = { &mem_b, &mem_a };
  int params_info[2] = { iris_w, iris_r };
  task.kernel("uppercase", 1, NULL, &size, NULL, 2, params, params_info);
  task.d2h(&mem_b, 0, size, b);
  task.submit(iris_roundrobin, NULL, true);

  printf("%s\n", b);

  platform.finalize();

  return 0;
}

Kernels

CUDA

extern "C" __global__ void uppercase(char* b, char* a) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (a[i] >= 'a' && a[i] <= 'z') b[i] = a[i] + 'A' - 'a';
  else b[i] = a[i];
}

HIP

#include <hip/hip_runtime.h>

extern "C" __global__ void saxpy(float* Z, float A, float* X, float* Y) {
  size_t id = blockIdx.x * blockDim.x + threadIdx.x;
  Z[id] = A * X[id] + Y[id];
}

extern "C" __global__ void saxpy_with_offsets(float* Z, float A, float* X, float* Y, size_t blockOff_x, size_t blockOff_y, size_t blockOff_z) {
  size_t id = (blockOff_x + blockIdx.x) * blockDim.x + threadIdx.x;
  Z[id] = A * X[id] + Y[id];
}

OpenCL

__kernel void uppercase(__global char* b, __global char* a) {
  int i = get_global_id(0);
  if (a[i] >= 'a' && a[i] <= 'z') b[i] = a[i] + 'A' - 'a';
  else b[i] = a[i];
}

OpenMP

#include <iris/iris_openmp.h>

static void uppercase(char* b, char* a, IRIS_OPENMP_KERNEL_ARGS) {
  int i;
#pragma omp parallel for shared(b, a) private(i)
  IRIS_OPENMP_KERNEL_BEGIN(i)
  if (a[i] >= 'a' && a[i] <= 'z') b[i] = a[i] + 'A' - 'a';
  else b[i] = a[i];
  IRIS_OPENMP_KERNEL_END
}

SAXPY

SAXPY stands for “Single-precision A * X Plus Y”. It is a combination of scalar multiplication and vector addition.

$ cd iris/apps/saxpy
$ make
$ ./saxpy-c
X [  0.  1.  2.  3.  4.  5.  6.  7.]
Y [  0.  1.  2.  3.  4.  5.  6.  7.]
S = 10.000000 * X + Y [   0.  11.  22.  33.  44.  55.  66.  77.]
$

Host Code

C

#include <iris/iris.h>
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>

int main(int argc, char** argv) {
  iris_init(&argc, &argv, 1);

  size_t SIZE;
  int TARGET;
  int VERBOSE;
  float *X, *Y, *Z;
  float A = 10;
  int ERROR = 0;

  SIZE = argc > 1 ? atol(argv[1]) : 8;
  TARGET = argc > 2 ? atol(argv[2]) : 0;
  VERBOSE = argc > 3 ? atol(argv[3]) : 1;

  printf("[%s:%d] SIZE[%zu] TARGET[%d] VERBOSE[%d]\n", __FILE__, __LINE__, SIZE, TARGET, VERBOSE);

  X = (float*) malloc(SIZE * sizeof(float));
  Y = (float*) malloc(SIZE * sizeof(float));
  Z = (float*) malloc(SIZE * sizeof(float));

  if (VERBOSE) {

  for (int i = 0; i < SIZE; i++) {
    X[i] = i;
    Y[i] = i;
  }

  printf("X [");
  for (int i = 0; i < SIZE; i++) printf(" %2.0f.", X[i]);
  printf("]\n");
  printf("Y [");
  for (int i = 0; i < SIZE; i++) printf(" %2.0f.", Y[i]);
  printf("]\n");

  }

  iris_mem mem_X;
  iris_mem mem_Y;
  iris_mem mem_Z;
  iris_mem_create(SIZE * sizeof(float), &mem_X);
  iris_mem_create(SIZE * sizeof(float), &mem_Y);
  iris_mem_create(SIZE * sizeof(float), &mem_Z);

  iris_task task0;
  iris_task_create(&task0);
  iris_task_h2d_full(task0, mem_X, X);
  iris_task_h2d_full(task0, mem_Y, Y);
  void* saxpy_params[4] = { &mem_Z, &A, &mem_X, &mem_Y };
  int saxpy_params_info[4] = { iris_w, sizeof(A), iris_r, iris_r };
  iris_task_kernel(task0, "saxpy", 1, NULL, &SIZE, NULL, 4, saxpy_params, saxpy_params_info);
  iris_task_d2h_full(task0, mem_Z, Z);
  iris_task_submit(task0, TARGET, NULL, 1);

  if (VERBOSE) {

  for (int i = 0; i < SIZE; i++) {
    //printf("[%8d] %8.1f = %4.0f * %8.1f + %8.1f\n", i, Z[i], A, X[i], Y[i]);
    if (Z[i] != A * X[i] + Y[i]) ERROR++;
  }

  printf("S = %f * X + Y [", A);
  for (int i = 0; i < SIZE; i++) printf(" %3.0f.", Z[i]);
  printf("]\n");

  }

  iris_mem_release(mem_X);
  iris_mem_release(mem_Y);
  iris_mem_release(mem_Z);

  free(X);
  free(Y);
  free(Z);

  //iris_task_release(task0);

  iris_finalize();

  return 0;
}

C++

#include <iris/iris.hpp>
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>

int main(int argc, char** argv) {
  iris::Platform platform;
  platform.init(&argc, &argv, 1);

  size_t SIZE;
  float *X, *Y, *Z;
  float A = 10;
  int ERROR = 0;

  //int nteams = 8;

  SIZE = argc > 1 ? atol(argv[1]) : 8;
  //size_t chunk_size = SIZE / nteams;

  X = (float*) malloc(SIZE * sizeof(float));
  Y = (float*) malloc(SIZE * sizeof(float));
  Z = (float*) malloc(SIZE * sizeof(float));

  for (int i = 0; i < SIZE; i++) {
    X[i] = i;
    Y[i] = i;
  }

  printf("X [");
  for (int i = 0; i < SIZE; i++) printf(" %2.0f.", X[i]);
  printf("]\n");
  printf("Y [");
  for (int i = 0; i < SIZE; i++) printf(" %2.0f.", Y[i]);
  printf("]\n");

#if 0
  iris::Mem mem_X(SIZE * sizeof(float));
  iris::Mem mem_Y(SIZE * sizeof(float));
  iris::Mem mem_Z(SIZE * sizeof(float));

  iris::Task task;
  task.h2d_full(&mem_X, X);
  task.h2d_full(&mem_Y, Y);
  void* params0[4] = { &mem_Z, &A, &mem_X, &mem_Y };
  int pinfo0[4] = { iris_w, sizeof(A), iris_r, iris_r };
  task.kernel("saxpy", 1, NULL, &SIZE, NULL, 4, params0, pinfo0);
  task.d2h_full(&mem_Z, Z);
  task.submit(1, NULL, 1);
#else
  iris::DMem mem_X(X, SIZE * sizeof(float));
  iris::DMem mem_Y(Y, SIZE * sizeof(float));
  iris::DMem mem_Z(Z, SIZE * sizeof(float));

  iris::Task task;
  void* params0[4] = { &mem_Z, &A, &mem_X, &mem_Y };
  int pinfo0[4] = { iris_w, sizeof(A), iris_r, iris_r };
  task.kernel("saxpy", 1, NULL, &SIZE, NULL, 4, params0, pinfo0);
  task.flush_out(mem_Z);
  task.submit(1, NULL, 1);
#endif
  for (int i = 0; i < SIZE; i++) {
    //printf("[%8d] %8.1f = %4.0f * %8.1f + %8.1f\n", i, Z[i], A, X[i], Y[i]);
    if (Z[i] != A * X[i] + Y[i]) ERROR++;
  }

  printf("S = %f * X + Y [", A);
  for (int i = 0; i < SIZE; i++) printf(" %3.0f.", Z[i]);
  printf("]\n");

  free(X);
  free(Y);
  free(Z);

  platform.finalize();

  return 0;
}

Fortran

PROGRAM SAXPY
    USE IRIS
    IMPLICIT NONE

    INTEGER :: I, IERROR
    INTEGER(8) :: SIZE

    REAL(4),DIMENSION(:),ALLOCATABLE :: Z
    REAL(4),DIMENSION(:),ALLOCATABLE :: X
    REAL(4),DIMENSION(:),ALLOCATABLE :: Y
    REAL(4) :: A

    INTEGER(8) :: MEM_Z
    INTEGER(8) :: MEM_X
    INTEGER(8) :: MEM_Y

    INTEGER(8),DIMENSION(3) :: OFF
    INTEGER(8),DIMENSION(3) :: GWS
    INTEGER(8),DIMENSION(3) :: LWS
    INTEGER :: NPARAMS
    INTEGER(8),DIMENSION(4) :: PARAMS
    INTEGER,DIMENSION(4) :: PARAMS_INFO

    INTEGER(8) :: TASK

    CALL IRIS_INIT(.TRUE., IERROR)
    IF (IERROR /= IRIS_SUCCESS) THEN
        PRINT*, 'FAILED AT INIT'
    ENDIF

    SIZE = 8

    ALLOCATE(Z(SIZE))
    ALLOCATE(X(SIZE))
    ALLOCATE(Y(SIZE))

    A = 10.0

    DO I = 1, SIZE
        X(I) = I
        Y(I) = I
    ENDDO

    DO I = 1, SIZE
        PRINT*, 'X[', I, '] ', X(I)
    ENDDO

    PRINT*, '==='

    DO I = 1, SIZE
        PRINT*, 'Y[', I, '] ', Y(I)
    ENDDO

    CALL IRIS_MEM_CREATE(4 * SIZE, MEM_X, IERROR)
    CALL IRIS_MEM_CREATE(4 * SIZE, MEM_Y, IERROR)
    CALL IRIS_MEM_CREATE(4 * SIZE, MEM_Z, IERROR)

    CALL IRIS_TASK_CREATE(TASK, IERROR)

    OFF(1) = 0
    GWS(1) = SIZE
    LWS(1) = SIZE
    NPARAMS = 4
    PARAMS = (/ MEM_Z, TRANSFER(A, TASK), MEM_X, MEM_Y /)
    PARAMS_INFO = (/ IRIS_RW, 4, IRIS_R, IRIS_R /)

    CALL IRIS_TASK_H2D_FULL(TASK, MEM_X, X, IERROR)
    CALL IRIS_TASK_H2D_FULL(TASK, MEM_Y, Y, IERROR)
    CALL IRIS_TASK_KERNEL(TASK, "saxpy", 1, OFF, GWS, LWS, &
      NPARAMS, PARAMS, PARAMS_INFO, IERROR)
    CALL IRIS_TASK_D2H_FULL(TASK, MEM_Z, Z, IERROR)
    CALL IRIS_TASK_SUBMIT(TASK, IRIS_GPU, .TRUE., IERROR)

    DO I = 1, SIZE
        PRINT*, 'Z[', I, '] ', Z(I)
    ENDDO

    DEALLOCATE(X)
    DEALLOCATE(Y)
    DEALLOCATE(Z)

    CALL IRIS_FINALIZE(IERROR)

END PROGRAM SAXPY

Python

#!/usr/bin/env python3

import iris
import numpy as np
import sys

iris.init()

SIZE = 8 if len(sys.argv) == 1 else int(sys.argv[1])
A = 10.0

x = np.arange(SIZE, dtype=np.float32)
y = np.arange(SIZE, dtype=np.float32)
s = np.arange(SIZE, dtype=np.float32)

print('X', x)
print('Y', y)

old_way = True
disable = True
if not disable:
    if old_way:
        mem_x = iris.mem(x.nbytes)
        mem_y = iris.mem(y.nbytes)
        mem_s = iris.mem(s.nbytes)

        task = iris.task()
        task.h2d_full(mem_x, x)
        task.h2d_full(mem_y, y)
        task.kernel("saxpy", 1, [], [SIZE], [], [mem_s, A, mem_x, mem_y] , [iris.iris_w, 4, iris.iris_r, iris.iris_r] )
        task.d2h_full(mem_s, s)
        task.submit(iris.iris_gpu)
    else:
        # New DMEM way
        task = iris.task("saxpy", 1, [], [SIZE], [], [
                (s, iris.iris_w, iris.iris_flush),
                A, 
                (x, iris.iris_r), 
                (y, iris.iris_r)
                ])
        task.submit(iris.iris_gpu)
    
print('S =', A, '* X + Y', s)

s0 = np.arange(SIZE, dtype=np.float32)
s1 = np.arange(SIZE, dtype=np.float32)

mem_x = iris.dmem(x)
mem_y = iris.dmem(y)
#mem_s0 = iris.dmem(s0)
mem_s0 = iris.dmem_null(s0.nbytes)
mem_s1 = iris.dmem(s1)
task0 = iris.task("saxpy", 1, [], [SIZE], [], [
        (mem_s0, iris.iris_w),
        A, 
        (mem_x, iris.iris_r), 
        (mem_y, iris.iris_r)
        ])
task1 = iris.task("saxpy", 1, [], [SIZE], [], [
        (mem_s1, iris.iris_w, iris.iris_flush),
        A, 
        (mem_s0, iris.iris_r), 
        (mem_s0, iris.iris_r)
        ])
task1.depends(task0)
#task0.submit(iris.iris_gpu)
#task1.submit(iris.iris_gpu)

#print('S =', A, '* X + Y', x, y, s0)
#print('S =', A, '* X + Y', s0, s0, s1)

x = x*100
mem_x.update(x)

graph = iris.graph([task0, task1])
graph.submit()
graph.wait()

print('S =', A, '* X + Y', x, y, s0)
print('S =', A, '* X + Y', s0, s0, s1)

x = x*100
mem_x.update(x)
graph.submit()
graph.wait()

ntasks, tasks = graph.get_tasks()
print('S =', A, '* X + Y', x, y, s0)
print('S =', A, '* X + Y', s0, s0, s1)



iris.finalize()

Kernels

CUDA

extern "C" __global__ void saxpy(float* Z, float A, float* X, float* Y) {
  size_t id = blockIdx.x * blockDim.x + threadIdx.x;
  Z[id] = A * X[id] + Y[id];
}

HIP

#include <hip/hip_runtime.h>

extern "C" __global__ void saxpy(float* Z, float A, float* X, float* Y) {
  size_t id = blockIdx.x * blockDim.x + threadIdx.x;
  Z[id] = A * X[id] + Y[id];
}

OpenCL

__kernel void saxpy(__global float* restrict Z, float A, __global float* restrict X, __global float* restrict Y) {
  size_t id = get_global_id(0);
  Z[id] = A * X[id] + Y[id];
}

OpenMP

#include <iris/iris_openmp.h>

static void saxpy(float* Z, float A, float* X, float* Y, IRIS_OPENMP_KERNEL_ARGS) {
  size_t i;
#pragma omp parallel for shared(Z, A, X, Y) private(i)
  IRIS_OPENMP_KERNEL_BEGIN(i)
  Z[i] = A * X[i] + Y[i];
  IRIS_OPENMP_KERNEL_END
}

Data Memory

One of the major benefits of using IRIS is its “data memory” feature, which automatically manage data movement independent of the device scheduling. Here is an example of the use of data memory during a vector addition code. Note how the:

iris_data_mem_create(&mem_A, A, SIZE * sizeof(int));
...
iris_task_dmem_flush_out(task0,mem_C);

call differs from the SAXPY example above. We no longer need iris_task_h2d_full and iris_task_d2h_full calls, instead, we only need to know when to flush the final memory transfer required by the host. This is a simpler work-flow that the conventional explicit memory movement approach.

Running

$ cd iris/apps/vecadd
$ make
$ ./vecadd-iris

Host Code

C++

#include <iris/iris.h>
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>

int main(int argc, char** argv) {
  size_t SIZE;
  int *A, *B, *C;
  int ERROR = 0;

  iris_init(&argc, &argv, true);

  SIZE = argc > 1 ? atol(argv[1]) : 16;
  printf("SIZE[%lu]\n", SIZE);

  A = (int*) valloc(SIZE * sizeof(int));
  B = (int*) valloc(SIZE * sizeof(int));
  C = (int*) valloc(SIZE * sizeof(int));

  for (int i = 0; i < SIZE; i++) {
    A[i] = i;
    B[i] = i;
    C[i] = 0;
  }

  iris_mem mem_A;
  iris_mem mem_B;
  iris_mem mem_C;
  iris_data_mem_create(&mem_A, A, SIZE * sizeof(int));
  iris_data_mem_create(&mem_B, B, SIZE * sizeof(int));
  iris_data_mem_create(&mem_C, C, SIZE * sizeof(int));

  iris_task task0;
  iris_task_create(&task0);
  void* params0[3] = { &mem_A, &mem_B, &mem_C };
  int pinfo0[3] = { iris_r, iris_r, iris_w };
  iris_task_kernel(task0, "vecadd", 1, NULL, &SIZE, NULL, 3, params0, pinfo0);
  iris_task_dmem_flush_out(task0,mem_C);
  iris_task_submit(task0, iris_any,nullptr, true);
  iris_synchronize();

  for (int i = 0; i < SIZE; i++) {
    printf("C[%d] = %d\n", i, C[i]);
    if (C[i] != (A[i] + B[i])) ERROR++;
  }
  iris_finalize();
  printf("ERROR[%d]\n", ERROR+iris_error_count());
  return ERROR+iris_error_count();
}

Kernels

CUDA

extern "C" __global__ void vecadd(int* A, int* B, int* C) {
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  C[id] = A[id] + B[id];
}

HIP

#include <hip/hip_runtime.h>

extern "C" __global__ void vecadd(int* A, int* B, int* C) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  C[i] = A[i] + B[i];
}

OpenCL

__kernel void vecadd(__global int* restrict A, __global int* restrict B, __global int* restrict C) {
  size_t id = get_global_id(0);
  C[id] = A[id] + B[id];
}

OpenMP

#include <iris/iris_openmp.h>

static void vecadd(int* A, int* B, int* C, IRIS_OPENMP_KERNEL_ARGS) {
  int i;
#pragma omp parallel for shared(C, A, B) private(i)
  IRIS_OPENMP_KERNEL_BEGIN(i)
  C[i] = A[i] + B[i];
  IRIS_OPENMP_KERNEL_END
}

Device Selection

IRIS opportunistically attempts to use all available devices and backends, it resolves task names to function names in the corresponding kernel binaries. It allows device selection to be set both at compile and at runtime.

Compile Time

The user can submit the device target(s) for when the task is submitted:

iris_task_submit(iris_task task, int device, const char* opt, int sync);

This task submission includes information about the task, such as a hint, target device parameter, synchronization mode (blocking or non-blocking), and policy selector that indicates where the task should be executed. The device is the device submission policy. The complete list of available targets are:

Device Policy

About

iris_cpu

Submit the task to a CPU device

iris_gpu

Submit the task to any GPU device

iris_fpga

Submit the task to any FPGA (currently Intel and Xilinx)

iris_dsp

Submit the task to any DSP device (currently Hexagon)

iris_nvidia

Submit the task to an NVIDIA GPU device

iris_amd

Submit the task to an AMD GPU device

iris_gpu_intel

Submit the task to an Intel GPU device

iris_phi

Submit the task to an Intel Xeon Phi device

We can also submit tasks according to a scheduling policy:

Scheduling Policy

About

iris_default

Use the first device

iris_roundrobin

Submit this task in a round-robin (cyclic) way, for equal work sharing

iris_depend

Submit this task to a device that has been assigned its dependent

iris_data

Submit task to device to minimize data movement

iris_profile

Submit the task to the device based on execution time history

iris_random

Randomly assign this task to any of the available devices

iris_pending

Delay submitting the task until the memory it depends on has been assigned, then use that device

iris_any

Submit task to the device with the fewest assigned tasks

iris_all

Submit the task to all device queues, the device that accesses it first has exclusive execution (it is removed from the other device queues)

iris_custom

Submit the task based on a used provided, custom policy

The opt parameter is for iris_custom policies.

Runtime

We can also filter out devices at runtime by setting the IRIS_ARCHS environment variable. Modifying the selection of backends to instantiate allows dynamic device targets—without requiring recompilation. All current options are:hip,``cuda``,``opencl``, and openmp. An example of only allowing execution on openmp and hip devices would then be:

$ export IRIS_ARCHS=hip,openmp
$ ./helloworld
HELLO WORLD
$