Hello World

Setting the following environment variables is highly recommended to make life easier.

$ export IRIS=<install_path> # <install_path> is typically $HOME/.local
$ export CPATH=$CPATH:$IRIS/include
$ export LIBRARY_PATH=$LIBRARY_PATH:$IRIS/lib:$IRIS/lib64
$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$IRIS/lib:$IRIS/lib64
$ export PYTHONPATH=$PYTHONPATH:$IRIS/include

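The “Hello World” program is the first step towards learning IRIS. It copies the string “hello world” into IRIS memory, runs the “uppercase” kernel on it, copies the result back, and displays the message “HELLO WORLD” on the screen.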

$ cd iris/apps/helloworld
$ make
$ ./helloworld
HELLO WORLD
$

Host Code

C

#include <iris/iris.h>
#include <stdio.h>

/* Host input: the lowercase text, 11 characters plus the terminating NUL. */
char a[12] = "hello world";
/* Host output buffer; filled by the device-to-host copy in main(). */
char b[12];
/* Byte count of each buffer; also passed as the kernel's global work size. */
size_t size = 12;

/*
 * Hello World host program.
 *
 * Builds one IRIS task that copies `a` into IRIS memory, launches the
 * "uppercase" kernel over `size` work items, and copies the result back
 * into `b`, then prints `b`.
 *
 * Always returns 0; the status codes of the IRIS calls are not inspected.
 */
int main(int argc, char** argv) {
  iris_init(&argc, &argv, 1);

  /* One IRIS memory object per host buffer. */
  iris_mem input_mem;
  iris_mem output_mem;
  iris_mem_create(size, &input_mem);
  iris_mem_create(size, &output_mem);

  iris_task job;
  iris_task_create(&job);

  /* Kernel arguments, output first, with their access modes. */
  void* kernel_args[2] = { output_mem, input_mem };
  int kernel_arg_modes[2] = { iris_w, iris_r };

  /* Commands are added in the order: upload, kernel, download. */
  iris_task_h2d(job, input_mem, 0, size, a);
  iris_task_kernel(job, "uppercase", 1, NULL, &size, NULL, 2, kernel_args, kernel_arg_modes);
  iris_task_d2h(job, output_mem, 0, size, b);

  /* NOTE(review): the trailing 1 presumably requests a blocking submit,
   * since `b` is printed immediately afterwards -- confirm against the API. */
  iris_task_submit(job, iris_roundrobin, NULL, 1);

  printf("%s\n", b);

  iris_task_release(job);
  iris_mem_release(input_mem);
  iris_mem_release(output_mem);

  iris_finalize();

  return 0;
}

C++

#include <iris/iris.hpp>
#include <stdio.h>

char a[12] = "hello world";
char b[12];
size_t size = 12;

int main(int argc, char** argv) {
  iris::Platform platform;
  platform.init(&argc, &argv, true);

  iris::Mem mem_a(size);
  iris::Mem mem_b(size);

  iris::Task task;
  task.h2d(&mem_a, 0, size, a);
  void* params[2] = { &mem_b, &mem_a };
  int params_info[2] = { iris_w, iris_r };
  task.kernel("uppercase", 1, NULL, &size, NULL, 2, params, params_info);
  task.d2h(&mem_b, 0, size, b);
  task.submit(iris_roundrobin, NULL, true);

  printf("%s\n", b);

  platform.finalize();

  return 0;
}

Kernels

CUDA

// Writes to b[idx] the upper-case form of a[idx] for this thread's global
// index; characters outside 'a'..'z' are copied unchanged. There is no bounds
// check, so the launch must not cover more threads than the buffers hold.
extern "C" __global__ void uppercase(char* b, char* a) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const char ch = a[idx];
  const bool is_lower = (ch >= 'a') && (ch <= 'z');
  b[idx] = is_lower ? ch + 'A' - 'a' : ch;
}

HIP

#include <hip/hip_runtime.h>

// Writes to b[idx] the upper-case form of a[idx] for this thread's global
// index; characters outside 'a'..'z' are copied unchanged. There is no bounds
// check, so the launch must not cover more threads than the buffers hold.
extern "C" __global__ void uppercase(char* b, char* a) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const char ch = a[idx];
  if (ch < 'a' || ch > 'z') {
    b[idx] = ch;
  } else {
    b[idx] = ch + 'A' - 'a';
  }
}

OpenCL

// Writes to b[gid] the upper-case form of a[gid] for this work item's global
// id in dimension 0; characters outside 'a'..'z' are copied unchanged.
__kernel void uppercase(__global char* b, __global char* a) {
  const int gid = get_global_id(0);
  const char ch = a[gid];
  b[gid] = (ch >= 'a' && ch <= 'z') ? ch + 'A' - 'a' : ch;
}

OpenMP

#include <iris/iris_openmp.h>

/*
 * OpenMP version of the "uppercase" kernel: for each index i, writes to b[i]
 * the upper-case form of a[i]; characters outside 'a'..'z' are copied
 * unchanged.
 *
 * IRIS_OPENMP_KERNEL_ARGS, IRIS_OPENMP_KERNEL_BEGIN and IRIS_OPENMP_KERNEL_END
 * come from <iris/iris_openmp.h> and are not visible here.
 * NOTE(review): presumably BEGIN/END expand to the loop over this kernel's
 * index range, which the `omp parallel for` pragma then parallelises --
 * confirm against the header.
 */
static void uppercase(char* b, char* a, IRIS_OPENMP_KERNEL_ARGS) {
  int i = 0;  /* index variable handed to the BEGIN macro; private per thread */
#pragma omp parallel for shared(b, a) private(i)
  IRIS_OPENMP_KERNEL_BEGIN(i)
  if (a[i] >= 'a' && a[i] <= 'z') b[i] = a[i] + 'A' - 'a';
  else b[i] = a[i];
  IRIS_OPENMP_KERNEL_END
}

Hexagon

#include <iris/iris_hexagon_imp.h>

/*
 * Hexagon version of the "uppercase" kernel: for each index i, writes to b[i]
 * the upper-case form of a[i]; characters outside 'a'..'z' are copied
 * unchanged. Always returns AEE_SUCCESS.
 *
 * blen and alen are not used in the body.
 * NOTE(review): presumably they carry the lengths of b and a -- confirm.
 *
 * IRIS_HEXAGON_KERNEL_ARGS, IRIS_HEXAGON_KERNEL_BEGIN and
 * IRIS_HEXAGON_KERNEL_END come from <iris/iris_hexagon_imp.h> and are not
 * visible here.
 * NOTE(review): presumably BEGIN/END expand to the loop over this kernel's
 * index range -- confirm against the header.
 */
AEEResult irishexagon_uppercase(char* b, int blen, char* a, int alen, IRIS_HEXAGON_KERNEL_ARGS) {
  int32 i = 0;  /* index variable handed to the BEGIN macro */
  IRIS_HEXAGON_KERNEL_BEGIN(i)
  if (a[i] >= 'a' && a[i] <= 'z') b[i] = a[i] + 'A' - 'a';
  else b[i] = a[i];
  IRIS_HEXAGON_KERNEL_END
  return AEE_SUCCESS;
}

SAXPY

SAXPY stands for “Single-precision A * X Plus Y”. It is a combination of scalar multiplication and vector addition.

$ cd iris/apps/saxpy
$ make
$ ./saxpy-c
X [  0.  1.  2.  3.  4.  5.  6.  7.]
Y [  0.  1.  2.  3.  4.  5.  6.  7.]
S = 10.000000 * X + Y [   0.  11.  22.  33.  44.  55.  66.  77.]
$

Host Code

C

#include <iris/iris.h>
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>

/*
 * SAXPY host program: computes Z = A * X + Y through one IRIS task and, in
 * verbose mode, prints the vectors and checks the result on the host.
 *
 * Command line (all optional):
 *   argv[1]  SIZE     number of elements (default 8)
 *   argv[2]  TARGET   value passed to iris_task_submit as the target (default 0)
 *   argv[3]  VERBOSE  non-zero to print and verify the vectors (default 1)
 *
 * Returns 0 normally, or 1 if the host buffers cannot be allocated.
 */
int main(int argc, char** argv) {
  iris_init(&argc, &argv, 1);

  size_t SIZE;
  int TARGET;
  int VERBOSE;
  float *X, *Y, *Z;
  float A = 10;
  int ERROR = 0;

  SIZE = argc > 1 ? atol(argv[1]) : 8;
  TARGET = argc > 2 ? atol(argv[2]) : 0;
  VERBOSE = argc > 3 ? atol(argv[3]) : 1;

  printf("[%s:%d] SIZE[%zu] TARGET[%d] VERBOSE[%d]\n", __FILE__, __LINE__, SIZE, TARGET, VERBOSE);

  X = (float*) malloc(SIZE * sizeof(float));
  Y = (float*) malloc(SIZE * sizeof(float));
  Z = (float*) malloc(SIZE * sizeof(float));

  /* malloc(0) may legitimately return NULL, so only treat NULL as a failure
   * when a non-empty buffer was requested. */
  if (SIZE > 0 && (X == NULL || Y == NULL || Z == NULL)) {
    fprintf(stderr, "[%s:%d] failed to allocate %zu floats\n", __FILE__, __LINE__, SIZE);
    free(X);
    free(Y);
    free(Z);
    iris_finalize();
    return 1;
  }

  /* The inputs are initialised unconditionally: they are uploaded to the
   * device whether or not VERBOSE is set, so they must never be left
   * uninitialised. */
  for (size_t i = 0; i < SIZE; i++) {
    X[i] = (float) i;
    Y[i] = (float) i;
  }

  if (VERBOSE) {
    printf("X [");
    for (size_t i = 0; i < SIZE; i++) printf(" %2.0f.", X[i]);
    printf("]\n");
    printf("Y [");
    for (size_t i = 0; i < SIZE; i++) printf(" %2.0f.", Y[i]);
    printf("]\n");
  }

  iris_mem mem_X;
  iris_mem mem_Y;
  iris_mem mem_Z;
  iris_mem_create(SIZE * sizeof(float), &mem_X);
  iris_mem_create(SIZE * sizeof(float), &mem_Y);
  iris_mem_create(SIZE * sizeof(float), &mem_Z);

  /* One task: upload X and Y, run the kernel, download Z. */
  iris_task task0;
  iris_task_create(&task0);
  iris_task_h2d_full(task0, mem_X, X);
  iris_task_h2d_full(task0, mem_Y, Y);
  /* Memory arguments carry an access mode in the info array; the scalar A is
   * passed by address with its byte size in the matching info slot. */
  void* saxpy_params[4] = { mem_Z, &A, mem_X, mem_Y };
  int saxpy_params_info[4] = { iris_w, sizeof(A), iris_r, iris_r };
  iris_task_kernel(task0, "saxpy", 1, NULL, &SIZE, NULL, 4, saxpy_params, saxpy_params_info);
  iris_task_d2h_full(task0, mem_Z, Z);
  iris_task_submit(task0, TARGET, NULL, 1);

  if (VERBOSE) {
    /* Host-side check of the device result. */
    for (size_t i = 0; i < SIZE; i++) {
      if (Z[i] != A * X[i] + Y[i]) ERROR++;
    }

    printf("S = %f * X + Y [", A);
    for (size_t i = 0; i < SIZE; i++) printf(" %3.0f.", Z[i]);
    printf("]\n");

    /* Report mismatches on stderr so the normal output is unchanged. */
    if (ERROR != 0) {
      fprintf(stderr, "[%s:%d] %d of %zu results are wrong\n", __FILE__, __LINE__, ERROR, SIZE);
    }
  }

  iris_mem_release(mem_X);
  iris_mem_release(mem_Y);
  iris_mem_release(mem_Z);

  free(X);
  free(Y);
  free(Z);

  iris_task_release(task0);

  iris_finalize();

  return 0;
}

C++

#include <iris/iris.hpp>
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>

// SAXPY host program (C++ API): computes Z = A * X + Y through one iris::Task,
// prints the vectors, and checks the result on the host.
//
// Command line: argv[1] is the optional number of elements (default 8).
//
// Returns 0 normally, or 1 if the host buffers cannot be allocated.
int main(int argc, char** argv) {
  iris::Platform platform;
  platform.init(&argc, &argv, 1);

  size_t SIZE;
  float *X, *Y, *Z;
  float A = 10;
  int ERROR = 0;

  // The former `nteams` / `chunk_size` locals were removed: `chunk_size` was
  // computed from SIZE before SIZE was assigned, and neither was ever used.
  SIZE = argc > 1 ? atol(argv[1]) : 8;

  X = (float*) malloc(SIZE * sizeof(float));
  Y = (float*) malloc(SIZE * sizeof(float));
  Z = (float*) malloc(SIZE * sizeof(float));

  // malloc(0) may legitimately return NULL, so only treat NULL as a failure
  // when a non-empty buffer was requested.
  if (SIZE > 0 && (X == NULL || Y == NULL || Z == NULL)) {
    fprintf(stderr, "[%s:%d] failed to allocate %zu floats\n", __FILE__, __LINE__, SIZE);
    free(X);
    free(Y);
    free(Z);
    platform.finalize();
    return 1;
  }

  for (size_t i = 0; i < SIZE; i++) {
    X[i] = static_cast<float>(i);
    Y[i] = static_cast<float>(i);
  }

  printf("X [");
  for (size_t i = 0; i < SIZE; i++) printf(" %2.0f.", X[i]);
  printf("]\n");
  printf("Y [");
  for (size_t i = 0; i < SIZE; i++) printf(" %2.0f.", Y[i]);
  printf("]\n");

  iris::Mem mem_X(SIZE * sizeof(float));
  iris::Mem mem_Y(SIZE * sizeof(float));
  iris::Mem mem_Z(SIZE * sizeof(float));

  // One task: upload X and Y, run the kernel, download Z.
  iris::Task task;
  task.h2d_full(&mem_X, X);
  task.h2d_full(&mem_Y, Y);
  // Memory arguments carry an access mode in the info array; the scalar A is
  // passed by address with its byte size in the matching info slot.
  void* params0[4] = { &mem_Z, &A, &mem_X, &mem_Y };
  int pinfo0[4] = { iris_w, sizeof(A), iris_r, iris_r };
  task.kernel("saxpy", 1, NULL, &SIZE, NULL, 4, params0, pinfo0);
  task.d2h_full(&mem_Z, Z);
  // NOTE(review): the first argument is a literal 1 rather than a named
  // policy constant -- confirm which target it selects.
  task.submit(1, NULL, 1);

  // Host-side check of the device result.
  for (size_t i = 0; i < SIZE; i++) {
    if (Z[i] != A * X[i] + Y[i]) ERROR++;
  }

  printf("S = %f * X + Y [", A);
  for (size_t i = 0; i < SIZE; i++) printf(" %3.0f.", Z[i]);
  printf("]\n");

  // Report mismatches on stderr so the normal output is unchanged.
  if (ERROR != 0) {
    fprintf(stderr, "[%s:%d] %d of %zu results are wrong\n", __FILE__, __LINE__, ERROR, SIZE);
  }

  free(X);
  free(Y);
  free(Z);

  platform.finalize();

  return 0;
}

Fortran

! SAXPY host program (Fortran API): fills X and Y with 1..SIZE, builds one
! IRIS task that uploads X and Y, runs the "saxpy" kernel with A = 10.0, and
! downloads Z, then prints X, Y and Z element by element.
!
! IERROR is checked only after IRIS_INIT; the status returned by the other
! IRIS calls is not inspected.
PROGRAM SAXPY
    USE IRIS
    IMPLICIT NONE

    INTEGER :: I, IERROR
    INTEGER(8) :: SIZE

    ! Host vectors (Z receives the result) and the scalar multiplier.
    REAL(4),DIMENSION(:),ALLOCATABLE :: Z
    REAL(4),DIMENSION(:),ALLOCATABLE :: X
    REAL(4),DIMENSION(:),ALLOCATABLE :: Y
    REAL(4) :: A

    ! IRIS memory handles, one per host vector.
    INTEGER(8) :: MEM_Z
    INTEGER(8) :: MEM_X
    INTEGER(8) :: MEM_Y

    ! Kernel launch geometry; only element 1 of each array is assigned below
    ! (the kernel is launched with dimension 1).
    INTEGER(8),DIMENSION(3) :: OFF
    INTEGER(8),DIMENSION(3) :: GWS
    INTEGER(8),DIMENSION(3) :: LWS
    ! Kernel argument list and the per-argument info array.
    INTEGER :: NPARAMS
    INTEGER(8),DIMENSION(4) :: PARAMS
    INTEGER,DIMENSION(4) :: PARAMS_INFO

    ! IRIS task handle.
    INTEGER(8) :: TASK

    ! A failed init is reported, but execution continues regardless.
    CALL IRIS_INIT(.TRUE., IERROR)
    IF (IERROR /= IRIS_SUCCESS) THEN
        PRINT*, 'FAILED AT INIT'
    ENDIF

    SIZE = 8

    ALLOCATE(Z(SIZE))
    ALLOCATE(X(SIZE))
    ALLOCATE(Y(SIZE))

    A = 10.0

    ! Inputs hold the values 1..SIZE.
    DO I = 1, SIZE
        X(I) = I
        Y(I) = I
    ENDDO

    DO I = 1, SIZE
        PRINT*, 'X[', I, '] ', X(I)
    ENDDO

    PRINT*, '==='

    DO I = 1, SIZE
        PRINT*, 'Y[', I, '] ', Y(I)
    ENDDO

    ! Sizes are in bytes: 4 bytes per REAL(4) element.
    CALL IRIS_MEM_CREATE(4 * SIZE, MEM_X, IERROR)
    CALL IRIS_MEM_CREATE(4 * SIZE, MEM_Y, IERROR)
    CALL IRIS_MEM_CREATE(4 * SIZE, MEM_Z, IERROR)

    CALL IRIS_TASK_CREATE(TASK, IERROR)

    OFF(1) = 0
    GWS(1) = SIZE
    LWS(1) = SIZE
    NPARAMS = 4
    ! TRANSFER(A, TASK) reinterprets the bits of the REAL(4) scalar A as an
    ! INTEGER(8) (TASK serves only as the type mold) so that the scalar fits
    ! in the INTEGER(8) PARAMS array next to the memory handles.
    ! NOTE(review): the matching PARAMS_INFO entry 4 is presumably the byte
    ! size of A -- confirm against the IRIS Fortran binding.
    PARAMS = (/ MEM_Z, TRANSFER(A, TASK), MEM_X, MEM_Y /)
    PARAMS_INFO = (/ IRIS_RW, 4, IRIS_R, IRIS_R /)

    ! Task commands are added in the order: upload X, upload Y, kernel, download Z.
    CALL IRIS_TASK_H2D_FULL(TASK, MEM_X, X, IERROR)
    CALL IRIS_TASK_H2D_FULL(TASK, MEM_Y, Y, IERROR)
    CALL IRIS_TASK_KERNEL(TASK, "saxpy", 1, OFF, GWS, LWS, &
      NPARAMS, PARAMS, PARAMS_INFO, IERROR)
    CALL IRIS_TASK_D2H_FULL(TASK, MEM_Z, Z, IERROR)
    ! NOTE(review): .TRUE. presumably requests a blocking submit, since Z is
    ! printed immediately afterwards -- confirm.
    CALL IRIS_TASK_SUBMIT(TASK, IRIS_GPU, .TRUE., IERROR)

    DO I = 1, SIZE
        PRINT*, 'Z[', I, '] ', Z(I)
    ENDDO

    DEALLOCATE(X)
    DEALLOCATE(Y)
    DEALLOCATE(Z)

    CALL IRIS_FINALIZE(IERROR)

END PROGRAM SAXPY

Python

#!/usr/bin/env python3

import iris
import numpy as np
import sys

iris.init()

# Vector length: the first command-line argument, or 8 when none is given.
SIZE = int(sys.argv[1]) if len(sys.argv) != 1 else 8
A = 10.0

# Host arrays; `s` receives the result and is overwritten by the download.
x = np.arange(SIZE, dtype=np.float32)
y = np.arange(SIZE, dtype=np.float32)
s = np.arange(SIZE, dtype=np.float32)

print('X', x)
print('Y', y)

# One IRIS memory object per host array, sized in bytes.
mem_x = iris.mem(x.nbytes)
mem_y = iris.mem(y.nbytes)
mem_s = iris.mem(s.nbytes)

# Kernel arguments with their per-argument info: an access mode for each
# memory object and the literal 4 for the scalar A.
kernel_args = [mem_s, A, mem_x, mem_y]
kernel_arg_info = [iris.iris_w, 4, iris.iris_r, iris.iris_r]

# One task: upload x and y, run the kernel, download s.
saxpy_task = iris.task()
saxpy_task.h2d_full(mem_x, x)
saxpy_task.h2d_full(mem_y, y)
saxpy_task.kernel("saxpy", 1, [], [SIZE], [], kernel_args, kernel_arg_info)
saxpy_task.d2h_full(mem_s, s)
saxpy_task.submit(iris.iris_gpu)

print('S =', A, '* X + Y', s)

iris.finalize()

Kernels

CUDA

// Computes S[idx] = A * X[idx] + Y[idx] for this thread's global index.
// There is no bounds check, so the launch must not cover more threads than
// the vectors hold.
extern "C" __global__ void saxpy(float* S, float A, float* X, float* Y) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const float x = X[idx];
  const float y = Y[idx];
  S[idx] = A * x + y;
}

HIP

#include <hip/hip_runtime.h>

// Computes S[idx] = A * X[idx] + Y[idx] for this thread's global index.
// There is no bounds check, so the launch must not cover more threads than
// the vectors hold.
extern "C" __global__ void saxpy(float* S, float A, float* X, float* Y) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const float x = X[idx];
  const float y = Y[idx];
  S[idx] = A * x + y;
}

OpenCL

// Computes S[gid] = A * X[gid] + Y[gid] for this work item's global id in
// dimension 0.
__kernel void saxpy(__global float* S, float A, __global float* X, __global float* Y) {
  const int gid = get_global_id(0);
  const float x = X[gid];
  const float y = Y[gid];
  S[gid] = A * x + y;
}

OpenMP

#include <iris/iris_openmp.h>

/*
 * OpenMP version of the "saxpy" kernel: for each index i, computes
 * S[i] = A * X[i] + Y[i].
 *
 * IRIS_OPENMP_KERNEL_ARGS, IRIS_OPENMP_KERNEL_BEGIN and IRIS_OPENMP_KERNEL_END
 * come from <iris/iris_openmp.h> and are not visible here.
 * NOTE(review): presumably BEGIN/END expand to the loop over this kernel's
 * index range, which the `omp parallel for` pragma then parallelises --
 * confirm against the header.
 */
static void saxpy(float* S, float A, float* X, float* Y, IRIS_OPENMP_KERNEL_ARGS) {
  int i = 0;  /* index variable handed to the BEGIN macro; private per thread */
#pragma omp parallel for shared(S, A, X, Y) private(i)
  IRIS_OPENMP_KERNEL_BEGIN(i)
  S[i] = A * X[i] + Y[i];
  IRIS_OPENMP_KERNEL_END
}

Hexagon

#include <iris/iris_hexagon_imp.h>

/*
 * Hexagon version of the "saxpy" kernel: for each index i, computes
 * S[i] = A * X[i] + Y[i]. Always returns AEE_SUCCESS.
 *
 * Slen, Xlen and Ylen are not used in the body.
 * NOTE(review): presumably they carry the lengths of S, X and Y -- confirm.
 *
 * IRIS_HEXAGON_KERNEL_ARGS, IRIS_HEXAGON_KERNEL_BEGIN and
 * IRIS_HEXAGON_KERNEL_END come from <iris/iris_hexagon_imp.h> and are not
 * visible here.
 * NOTE(review): presumably BEGIN/END expand to the loop over this kernel's
 * index range -- confirm against the header.
 */
AEEResult irishexagon_saxpy(float* S, int Slen, float A, float* X, int Xlen, float* Y, int Ylen, IRIS_HEXAGON_KERNEL_ARGS) {
  int32 i = 0;  /* index variable handed to the BEGIN macro */
  IRIS_HEXAGON_KERNEL_BEGIN(i)
  S[i] = A * X[i] + Y[i];
  IRIS_HEXAGON_KERNEL_END
  return AEE_SUCCESS;
}