Hello World
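Before building the examples, it is highly recommended to set the following environment variables so that the compiler, the linker, the dynamic loader, and Python can find the IRIS headers and libraries.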
$ export IRIS=<install_path> # install_path would be $HOME/.local
$ export CPATH=$CPATH:$IRIS/include
$ export LIBRARY_PATH=$LIBRARY_PATH:$IRIS/lib:$IRIS/lib64
$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$IRIS/lib:$IRIS/lib64
$ export PYTHONPATH=$PYTHONPATH:$IRIS/include
The “Hello World” program is the first step towards learning IRIS. It passes the string “hello world” through an “uppercase” kernel and displays the resulting message “HELLO WORLD” on the screen.
$ cd iris/apps/helloworld
$ make
$ ./helloworld
HELLO WORLD
$
Host Code
C
#include <iris/iris.h>
#include <stdio.h>
/* Input string: 11 characters plus the terminating NUL fill all 12 bytes. */
char a[12] = "hello world";
/* Output buffer that receives the upper-cased string. */
char b[12];
/* Byte count used for both transfers and as the kernel's global work size. */
size_t size = 12;
/*
 * Hello World host program (IRIS C API).
 * Builds a single task that uploads `a`, runs the "uppercase" kernel and
 * downloads the result into `b`, then prints `b`.
 */
int main(int argc, char** argv) {
iris_init(&argc, &argv, 1);
/* One IRIS memory object of `size` bytes per array. */
iris_mem mem_a;
iris_mem mem_b;
iris_mem_create(size, &mem_a);
iris_mem_create(size, &mem_b);
/* All three commands below are attached to this one task, in this order. */
iris_task task;
iris_task_create(&task);
/* Upload `size` bytes from `a` into mem_a (the 0 is presumably a byte offset - confirm). */
iris_task_h2d(task, mem_a, 0, size, a);
/* Argument order matches the kernel signature uppercase(b, a): mem_b is written, mem_a is read. */
void* params[2] = { mem_b, mem_a };
int params_info[2] = { iris_w, iris_r };
/* 1-D launch with `size` work-items; NULL is passed for the offset and local work size. */
iris_task_kernel(task, "uppercase", 1, NULL, &size, NULL, 2, params, params_info);
/* Download `size` bytes from mem_b into `b`. */
iris_task_d2h(task, mem_b, 0, size, b);
/*
 * Submit with the round-robin device policy. `b` is printed right after this
 * call, so the final argument 1 is expected to make the submit blocking
 * (NOTE(review): confirm against the IRIS API).
 */
iris_task_submit(task, iris_roundrobin, NULL, 1);
printf("%s\n", b);
/* Release the task and memory objects before shutting IRIS down. */
iris_task_release(task);
iris_mem_release(mem_a);
iris_mem_release(mem_b);
iris_finalize();
return 0;
}
C++
#include <iris/iris.hpp>
#include <stdio.h>
// Input string: 11 characters plus the terminating NUL fill all 12 bytes.
char a[12] = "hello world";
// Output buffer that receives the upper-cased string.
char b[12];
// Byte count used for both transfers and as the kernel's global work size.
size_t size = 12;
// Hello World host program (IRIS C++ API).
// Builds a single task that uploads `a`, runs the "uppercase" kernel and
// downloads the result into `b`, then prints `b`.
int main(int argc, char** argv) {
iris::Platform platform;
platform.init(&argc, &argv, true);
// One IRIS memory object of `size` bytes per array.
iris::Mem mem_a(size);
iris::Mem mem_b(size);
// All three commands below are attached to this one task, in this order.
iris::Task task;
// Upload `size` bytes from `a` into mem_a (the 0 is presumably a byte offset - confirm).
task.h2d(&mem_a, 0, size, a);
// Argument order matches the kernel signature uppercase(b, a): mem_b is written, mem_a is read.
// Unlike the C API example, the C++ API is given the addresses of the Mem objects.
void* params[2] = { &mem_b, &mem_a };
int params_info[2] = { iris_w, iris_r };
// 1-D launch with `size` work-items; NULL is passed for the offset and local work size.
task.kernel("uppercase", 1, NULL, &size, NULL, 2, params, params_info);
// Download `size` bytes from mem_b into `b`.
task.d2h(&mem_b, 0, size, b);
// Submit with the round-robin device policy. `b` is printed right after this call,
// so the final argument `true` is expected to make the submit blocking
// (NOTE(review): confirm against the IRIS API).
task.submit(iris_roundrobin, NULL, true);
printf("%s\n", b);
platform.finalize();
return 0;
}
Kernels
CUDA
// Upper-cases a[] into b[]: each thread handles the single character at its
// global index. Characters outside 'a'..'z' are copied unchanged.
extern "C" __global__ void uppercase(char* b, char* a) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const char ch = a[idx];
  const bool is_lower = (ch >= 'a') && (ch <= 'z');
  // 'A' - 'a' is the constant distance between the two ASCII letter ranges.
  b[idx] = is_lower ? ch + 'A' - 'a' : ch;
}
HIP
#include <hip/hip_runtime.h>
// Upper-cases a[] into b[]: each thread handles the single character at its
// global index. Characters outside 'a'..'z' are copied unchanged.
extern "C" __global__ void uppercase(char* b, char* a) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const char ch = a[idx];
  const bool is_lower = (ch >= 'a') && (ch <= 'z');
  // 'A' - 'a' is the constant distance between the two ASCII letter ranges.
  b[idx] = is_lower ? ch + 'A' - 'a' : ch;
}
OpenCL
// Upper-cases a[] into b[]: each work-item handles the single character at its
// global id. Characters outside 'a'..'z' are copied unchanged.
__kernel void uppercase(__global char* b, __global char* a) {
  const int gid = get_global_id(0);
  const char ch = a[gid];
  // 'A' - 'a' is the constant distance between the two ASCII letter ranges.
  b[gid] = (ch >= 'a' && ch <= 'z') ? ch + 'A' - 'a' : ch;
}
OpenMP
#include <iris/iris_openmp.h>
/*
 * OpenMP variant of the uppercase kernel: writes the upper-cased copy of a[]
 * into b[]; characters outside 'a'..'z' are copied unchanged.
 * The index range is supplied through IRIS_OPENMP_KERNEL_ARGS and iterated by
 * the IRIS_OPENMP_KERNEL_BEGIN/END macro pair.
 */
static void uppercase(char* b, char* a, IRIS_OPENMP_KERNEL_ARGS) {
  int i = 0;
#pragma omp parallel for shared(b, a) private(i)
  IRIS_OPENMP_KERNEL_BEGIN(i)
  /* Kept as one statement so the body is valid whatever the macros expand to. */
  b[i] = (a[i] >= 'a' && a[i] <= 'z') ? a[i] + 'A' - 'a' : a[i];
  IRIS_OPENMP_KERNEL_END
}
Hexagon
#include <iris/iris_hexagon_imp.h>
/*
 * Hexagon variant of the uppercase kernel: writes the upper-cased copy of a[]
 * into b[]; characters outside 'a'..'z' are copied unchanged.
 * blen and alen accompany the two buffers but are not used by the body.
 * Always returns AEE_SUCCESS.
 */
AEEResult irishexagon_uppercase(char* b, int blen, char* a, int alen, IRIS_HEXAGON_KERNEL_ARGS) {
  int32 i = 0;
  IRIS_HEXAGON_KERNEL_BEGIN(i)
  /* Kept as one statement so the body is valid whatever the macros expand to. */
  b[i] = (a[i] >= 'a' && a[i] <= 'z') ? a[i] + 'A' - 'a' : a[i];
  IRIS_HEXAGON_KERNEL_END
  return AEE_SUCCESS;
}
SAXPY
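SAXPY stands for “Single-precision A * X Plus Y”. It combines scalar multiplication and vector addition: for every element i, S[i] = A * X[i] + Y[i], where A is a scalar and X, Y, and S are single-precision vectors of the same length.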
$ cd iris/apps/saxpy
$ make
$ ./saxpy-c
X [ 0. 1. 2. 3. 4. 5. 6. 7.]
Y [ 0. 1. 2. 3. 4. 5. 6. 7.]
S = 10.000000 * X + Y [ 0. 11. 22. 33. 44. 55. 66. 77.]
$
Host Code
C
#include <iris/iris.h>
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
/*
 * SAXPY host program (IRIS C API): computes Z = A * X + Y with the "saxpy"
 * kernel and checks the result on the host.
 *
 * Command line: [SIZE [TARGET [VERBOSE]]]
 *   SIZE     number of elements per vector (default 8)
 *   TARGET   device/policy value forwarded to iris_task_submit (default 0)
 *   VERBOSE  non-zero prints the input and result vectors (default 1)
 *
 * Returns 0; returns 1 only if the host vectors cannot be allocated.
 */
int main(int argc, char** argv) {
  iris_init(&argc, &argv, 1);

  size_t SIZE;
  int TARGET;
  int VERBOSE;
  float *X, *Y, *Z;
  float A = 10;
  int ERROR = 0;

  SIZE = argc > 1 ? atol(argv[1]) : 8;
  TARGET = argc > 2 ? atol(argv[2]) : 0;
  VERBOSE = argc > 3 ? atol(argv[3]) : 1;

  printf("[%s:%d] SIZE[%zu] TARGET[%d] VERBOSE[%d]\n", __FILE__, __LINE__, SIZE, TARGET, VERBOSE);

  X = (float*) malloc(SIZE * sizeof(float));
  Y = (float*) malloc(SIZE * sizeof(float));
  Z = (float*) malloc(SIZE * sizeof(float));
  /* malloc(0) may legitimately return NULL, so only a non-empty request can fail. */
  if (SIZE > 0 && (X == NULL || Y == NULL || Z == NULL)) {
    fprintf(stderr, "failed to allocate %zu floats per vector\n", SIZE);
    free(X);
    free(Y);
    free(Z);
    iris_finalize();
    return 1;
  }

  /*
   * The inputs must be initialised regardless of VERBOSE: they are uploaded to
   * the device either way, and VERBOSE only controls printing.
   */
  for (size_t i = 0; i < SIZE; i++) {
    X[i] = (float) i;
    Y[i] = (float) i;
  }

  if (VERBOSE) {
    printf("X [");
    for (size_t i = 0; i < SIZE; i++) printf(" %2.0f.", X[i]);
    printf("]\n");
    printf("Y [");
    for (size_t i = 0; i < SIZE; i++) printf(" %2.0f.", Y[i]);
    printf("]\n");
  }

  /* One IRIS memory object per vector, each SIZE floats large. */
  iris_mem mem_X;
  iris_mem mem_Y;
  iris_mem mem_Z;
  iris_mem_create(SIZE * sizeof(float), &mem_X);
  iris_mem_create(SIZE * sizeof(float), &mem_Y);
  iris_mem_create(SIZE * sizeof(float), &mem_Z);

  /* A single task carries the uploads, the kernel launch and the download. */
  iris_task task0;
  iris_task_create(&task0);
  iris_task_h2d_full(task0, mem_X, X);
  iris_task_h2d_full(task0, mem_Y, Y);
  /*
   * Argument order matches the kernel signature saxpy(S, A, X, Y). For the
   * scalar A the info entry is its size in bytes instead of an access flag.
   */
  void* saxpy_params[4] = { mem_Z, &A, mem_X, mem_Y };
  int saxpy_params_info[4] = { iris_w, sizeof(A), iris_r, iris_r };
  iris_task_kernel(task0, "saxpy", 1, NULL, &SIZE, NULL, 4, saxpy_params, saxpy_params_info);
  iris_task_d2h_full(task0, mem_Z, Z);
  /* Z is read immediately afterwards, so the submit is expected to block (last argument 1). */
  iris_task_submit(task0, TARGET, NULL, 1);

  /* Verify every element against the host-side reference. */
  for (size_t i = 0; i < SIZE; i++) {
    if (Z[i] != A * X[i] + Y[i]) ERROR++;
  }

  if (VERBOSE) {
    printf("S = %f * X + Y [", A);
    for (size_t i = 0; i < SIZE; i++) printf(" %3.0f.", Z[i]);
    printf("]\n");
  }

  /* Report mismatches on stderr so the regular stdout output stays unchanged. */
  if (ERROR > 0) fprintf(stderr, "saxpy verification failed: %d mismatching element(s)\n", ERROR);

  iris_mem_release(mem_X);
  iris_mem_release(mem_Y);
  iris_mem_release(mem_Z);

  free(X);
  free(Y);
  free(Z);

  iris_task_release(task0);

  iris_finalize();

  return 0;
}
C++
#include <iris/iris.hpp>
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
// SAXPY host program (IRIS C++ API): computes Z = A * X + Y with the "saxpy"
// kernel and checks the result on the host.
//
// Command line: [SIZE] - number of elements per vector (default 8).
//
// Returns 0; returns 1 only if the host vectors cannot be allocated.
int main(int argc, char** argv) {
  iris::Platform platform;
  platform.init(&argc, &argv, 1);

  // SIZE is assigned before anything is derived from it.
  size_t SIZE = argc > 1 ? atol(argv[1]) : 8;
  float A = 10;
  int ERROR = 0;

  float* X = static_cast<float*>(malloc(SIZE * sizeof(float)));
  float* Y = static_cast<float*>(malloc(SIZE * sizeof(float)));
  float* Z = static_cast<float*>(malloc(SIZE * sizeof(float)));
  // malloc(0) may legitimately return NULL, so only a non-empty request can fail.
  if (SIZE > 0 && (X == NULL || Y == NULL || Z == NULL)) {
    fprintf(stderr, "failed to allocate %zu floats per vector\n", SIZE);
    free(X);
    free(Y);
    free(Z);
    platform.finalize();
    return 1;
  }

  for (size_t i = 0; i < SIZE; i++) {
    X[i] = static_cast<float>(i);
    Y[i] = static_cast<float>(i);
  }

  printf("X [");
  for (size_t i = 0; i < SIZE; i++) printf(" %2.0f.", X[i]);
  printf("]\n");
  printf("Y [");
  for (size_t i = 0; i < SIZE; i++) printf(" %2.0f.", Y[i]);
  printf("]\n");

  // One IRIS memory object per vector, each SIZE floats large.
  iris::Mem mem_X(SIZE * sizeof(float));
  iris::Mem mem_Y(SIZE * sizeof(float));
  iris::Mem mem_Z(SIZE * sizeof(float));

  // A single task carries the uploads, the kernel launch and the download.
  iris::Task task;
  task.h2d_full(&mem_X, X);
  task.h2d_full(&mem_Y, Y);
  // Argument order matches the kernel signature saxpy(S, A, X, Y). For the
  // scalar A the info entry is its size in bytes instead of an access flag.
  void* params0[4] = { &mem_Z, &A, &mem_X, &mem_Y };
  int pinfo0[4] = { iris_w, sizeof(A), iris_r, iris_r };
  task.kernel("saxpy", 1, NULL, &SIZE, NULL, 4, params0, pinfo0);
  task.d2h_full(&mem_Z, Z);
  // Z is read immediately afterwards, so the submit is expected to block (last argument 1).
  task.submit(1, NULL, 1);

  // Verify every element against the host-side reference.
  for (size_t i = 0; i < SIZE; i++) {
    if (Z[i] != A * X[i] + Y[i]) ERROR++;
  }

  printf("S = %f * X + Y [", A);
  for (size_t i = 0; i < SIZE; i++) printf(" %3.0f.", Z[i]);
  printf("]\n");

  // Report mismatches on stderr so the regular stdout output stays unchanged.
  if (ERROR > 0) fprintf(stderr, "saxpy verification failed: %d mismatching element(s)\n", ERROR);

  free(X);
  free(Y);
  free(Z);

  platform.finalize();

  return 0;
}
Fortran
! SAXPY host program (IRIS Fortran API): computes Z = A * X + Y for 8 elements
! with the "saxpy" kernel and prints X, Y and Z.
PROGRAM SAXPY
USE IRIS
IMPLICIT NONE
INTEGER :: I, IERROR
INTEGER(8) :: SIZE
! Host vectors (single precision).
REAL(4),DIMENSION(:),ALLOCATABLE :: Z
REAL(4),DIMENSION(:),ALLOCATABLE :: X
REAL(4),DIMENSION(:),ALLOCATABLE :: Y
REAL(4) :: A
! IRIS memory and task handles are held in 8-byte integers.
INTEGER(8) :: MEM_Z
INTEGER(8) :: MEM_X
INTEGER(8) :: MEM_Y
! Offset, global and local work sizes; only element 1 is set below (1-D launch).
INTEGER(8),DIMENSION(3) :: OFF
INTEGER(8),DIMENSION(3) :: GWS
INTEGER(8),DIMENSION(3) :: LWS
INTEGER :: NPARAMS
INTEGER(8),DIMENSION(4) :: PARAMS
INTEGER,DIMENSION(4) :: PARAMS_INFO
INTEGER(8) :: TASK
CALL IRIS_INIT(.TRUE., IERROR)
! Only the INIT status is checked; a failure is reported but execution continues.
IF (IERROR /= IRIS_SUCCESS) THEN
PRINT*, 'FAILED AT INIT'
ENDIF
SIZE = 8
ALLOCATE(Z(SIZE))
ALLOCATE(X(SIZE))
ALLOCATE(Y(SIZE))
A = 10.0
! Inputs hold 1..SIZE (Fortran arrays are 1-based here).
DO I = 1, SIZE
X(I) = I
Y(I) = I
ENDDO
DO I = 1, SIZE
PRINT*, 'X[', I, '] ', X(I)
ENDDO
PRINT*, '==='
DO I = 1, SIZE
PRINT*, 'Y[', I, '] ', Y(I)
ENDDO
! 4 * SIZE is the buffer size in bytes (4 bytes per REAL(4) element).
CALL IRIS_MEM_CREATE(4 * SIZE, MEM_X, IERROR)
CALL IRIS_MEM_CREATE(4 * SIZE, MEM_Y, IERROR)
CALL IRIS_MEM_CREATE(4 * SIZE, MEM_Z, IERROR)
CALL IRIS_TASK_CREATE(TASK, IERROR)
OFF(1) = 0
GWS(1) = SIZE
! NOTE(review): the local work size is set to the whole problem here, whereas
! the C and C++ host examples pass NULL for it - confirm this is intended.
LWS(1) = SIZE
NPARAMS = 4
! Argument order matches the kernel signature saxpy(S, A, X, Y). TRANSFER
! reinterprets the bits of the REAL(4) scalar A as an INTEGER(8) (TASK serves
! only as the type mold) so that it fits into the integer PARAMS array.
PARAMS = (/ MEM_Z, TRANSFER(A, TASK), MEM_X, MEM_Y /)
! The entry for A is 4, presumably its size in bytes (the C example uses sizeof(A)).
! NOTE(review): Z is marked IRIS_RW although the kernel only writes it; the
! other host examples mark the output buffer write-only - confirm.
PARAMS_INFO = (/ IRIS_RW, 4, IRIS_R, IRIS_R /)
! Uploads, kernel launch and download are attached to the same task, in this order.
CALL IRIS_TASK_H2D_FULL(TASK, MEM_X, X, IERROR)
CALL IRIS_TASK_H2D_FULL(TASK, MEM_Y, Y, IERROR)
CALL IRIS_TASK_KERNEL(TASK, "saxpy", 1, OFF, GWS, LWS, &
NPARAMS, PARAMS, PARAMS_INFO, IERROR)
CALL IRIS_TASK_D2H_FULL(TASK, MEM_Z, Z, IERROR)
! Z is printed right after this call, so the .TRUE. argument is expected to
! make the submit blocking (NOTE(review): confirm against the IRIS API).
CALL IRIS_TASK_SUBMIT(TASK, IRIS_GPU, .TRUE., IERROR)
DO I = 1, SIZE
PRINT*, 'Z[', I, '] ', Z(I)
ENDDO
! NOTE(review): unlike the C example, the memory objects and the task are not
! released explicitly before IRIS_FINALIZE.
DEALLOCATE(X)
DEALLOCATE(Y)
DEALLOCATE(Z)
CALL IRIS_FINALIZE(IERROR)
END PROGRAM SAXPY
Python
#!/usr/bin/env python3
# SAXPY host script (IRIS Python API): computes s = A * x + y with the "saxpy"
# kernel and prints the inputs and the result.
import iris
import numpy as np
import sys
iris.init()
# Vector length: the first command-line argument if one is given, otherwise 8.
SIZE = 8 if len(sys.argv) == 1 else int(sys.argv[1])
A = 10.0
# float32 host arrays; x and y hold 0 .. SIZE-1.
x = np.arange(SIZE, dtype=np.float32)
y = np.arange(SIZE, dtype=np.float32)
# s is the output buffer; its initial contents are placeholders that the
# device-to-host copy below is expected to overwrite.
s = np.arange(SIZE, dtype=np.float32)
print('X', x)
print('Y', y)
# One IRIS memory object per array, sized to the array's byte count.
mem_x = iris.mem(x.nbytes)
mem_y = iris.mem(y.nbytes)
mem_s = iris.mem(s.nbytes)
# Uploads, kernel launch and download are attached to one task, in this order.
task = iris.task()
task.h2d_full(mem_x, x)
task.h2d_full(mem_y, y)
# 1-D launch with SIZE work-items; empty lists are passed for the offset and
# local work size. Argument order matches the kernel signature saxpy(S, A, X, Y).
# The info entry for the scalar A is 4, presumably its size in bytes as a
# 32-bit float (the C example passes sizeof(A)) - confirm.
task.kernel("saxpy", 1, [], [SIZE], [], [mem_s, A, mem_x, mem_y] , [iris.iris_w, 4, iris.iris_r, iris.iris_r] )
task.d2h_full(mem_s, s)
# s is printed right after this call, so submit() is expected to block by
# default (NOTE(review): confirm against the IRIS Python API).
task.submit(iris.iris_gpu)
print('S =', A, '* X + Y', s)
iris.finalize()
Kernels
CUDA
// SAXPY kernel: each thread computes one element, S[idx] = A * X[idx] + Y[idx],
// where idx is the thread's global index.
extern "C" __global__ void saxpy(float* S, float A, float* X, float* Y) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  S[idx] = A * X[idx] + Y[idx];
}
HIP
#include <hip/hip_runtime.h>
// SAXPY kernel: each thread computes one element, S[idx] = A * X[idx] + Y[idx],
// where idx is the thread's global index.
extern "C" __global__ void saxpy(float* S, float A, float* X, float* Y) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  S[idx] = A * X[idx] + Y[idx];
}
OpenCL
// SAXPY kernel: each work-item computes one element, S[gid] = A * X[gid] + Y[gid],
// where gid is its global id in dimension 0.
__kernel void saxpy(__global float* S, float A, __global float* X, __global float* Y) {
  const int gid = get_global_id(0);
  S[gid] = A * X[gid] + Y[gid];
}
OpenMP
#include <iris/iris_openmp.h>
/*
 * OpenMP variant of the SAXPY kernel: stores A * X[i] + Y[i] into S[i] for
 * every index i visited by the IRIS_OPENMP_KERNEL_BEGIN/END macro pair.
 * The index range comes in through IRIS_OPENMP_KERNEL_ARGS (presumably the
 * loop bounds are derived from it - confirm in iris_openmp.h).
 */
static void saxpy(float* S, float A, float* X, float* Y, IRIS_OPENMP_KERNEL_ARGS) {
int i = 0;
/* Iterations are independent, so they are shared out across the OpenMP threads; i is private per thread. */
#pragma omp parallel for shared(S, A, X, Y) private(i)
IRIS_OPENMP_KERNEL_BEGIN(i)
S[i] = A * X[i] + Y[i];
IRIS_OPENMP_KERNEL_END
}
Hexagon
#include <iris/iris_hexagon_imp.h>
/*
 * Hexagon variant of the SAXPY kernel: stores A * X[i] + Y[i] into S[i] for
 * every index i visited by the IRIS_HEXAGON_KERNEL_BEGIN/END macro pair.
 * Slen, Xlen and Ylen accompany the three buffers but are not used by the body.
 * Always returns AEE_SUCCESS.
 */
AEEResult irishexagon_saxpy(float* S, int Slen, float A, float* X, int Xlen, float* Y, int Ylen, IRIS_HEXAGON_KERNEL_ARGS) {
int32 i = 0;
/* The index range presumably comes from IRIS_HEXAGON_KERNEL_ARGS - confirm in iris_hexagon_imp.h. */
IRIS_HEXAGON_KERNEL_BEGIN(i)
S[i] = A * X[i] + Y[i];
IRIS_HEXAGON_KERNEL_END
return AEE_SUCCESS;
}