Hello World
Setting the following environment variables is highly recommended to make life easier.
$ source <install_path>/setup.source # default install_path would be $HOME/.iris
The “Hello World” program is the first step towards learning IRIS. This program displays the message “HELLO WORLD” on the screen.
$ cd iris/apps/helloworld
$ make
$ ./helloworld
HELLO WORLD
$
Host Code
C
#include <iris/iris.h>
#include <stdio.h>
char a[12] = "hello world";
char b[12];
size_t size = 12;
int main(int argc, char** argv) {
iris_init(&argc, &argv, 1);
iris_mem mem_a;
iris_mem mem_b;
iris_mem_create(size, &mem_a);
iris_mem_create(size, &mem_b);
iris_task task;
iris_task_create(&task);
iris_task_h2d(task, mem_a, 0, size, a);
void* params[2] = { &mem_b, &mem_a };
int params_info[2] = { iris_w, iris_r };
iris_task_kernel(task, "uppercase", 1, NULL, &size, NULL, 2, params, params_info);
iris_task_d2h(task, mem_b, 0, size, b);
iris_task_submit(task, iris_roundrobin, NULL, 1);
printf("%s\n", b);
iris_task_release(task);
iris_mem_release(mem_a);
iris_mem_release(mem_b);
iris_finalize();
return 0;
}
C++
#include <iris/iris.hpp>
#include <stdio.h>
char a[12] = "hello world";
char b[12];
size_t size = 12;
int main(int argc, char** argv) {
iris::Platform platform;
platform.init(&argc, &argv, true);
iris::Mem mem_a(size);
iris::Mem mem_b(size);
iris::Task task;
task.h2d(&mem_a, 0, size, a);
void* params[2] = { &mem_b, &mem_a };
int params_info[2] = { iris_w, iris_r };
task.kernel("uppercase", 1, NULL, &size, NULL, 2, params, params_info);
task.d2h(&mem_b, 0, size, b);
task.submit(iris_roundrobin, NULL, true);
printf("%s\n", b);
platform.finalize();
return 0;
}
Kernels
CUDA
extern "C" __global__ void uppercase(char* b, char* a) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (a[i] >= 'a' && a[i] <= 'z') b[i] = a[i] + 'A' - 'a';
else b[i] = a[i];
}
HIP
#include <hip/hip_runtime.h>
extern "C" __global__ void saxpy(float* Z, float A, float* X, float* Y) {
size_t id = blockIdx.x * blockDim.x + threadIdx.x;
Z[id] = A * X[id] + Y[id];
}
extern "C" __global__ void saxpy_with_offsets(float* Z, float A, float* X, float* Y, size_t blockOff_x, size_t blockOff_y, size_t blockOff_z) {
size_t id = (blockOff_x + blockIdx.x) * blockDim.x + threadIdx.x;
Z[id] = A * X[id] + Y[id];
}
OpenCL
__kernel void uppercase(__global char* b, __global char* a) {
int i = get_global_id(0);
if (a[i] >= 'a' && a[i] <= 'z') b[i] = a[i] + 'A' - 'a';
else b[i] = a[i];
}
OpenMP
#include <iris/iris_openmp.h>
static void uppercase(char* b, char* a, IRIS_OPENMP_KERNEL_ARGS) {
int i;
#pragma omp parallel for shared(b, a) private(i)
IRIS_OPENMP_KERNEL_BEGIN(i)
if (a[i] >= 'a' && a[i] <= 'z') b[i] = a[i] + 'A' - 'a';
else b[i] = a[i];
IRIS_OPENMP_KERNEL_END
}
SAXPY
SAXPY stands for “Single-precision A * X Plus Y”. It is a combination of scalar multiplication and vector addition.
$ cd iris/apps/saxpy
$ make
$ ./saxpy-c
X [ 0. 1. 2. 3. 4. 5. 6. 7.]
Y [ 0. 1. 2. 3. 4. 5. 6. 7.]
S = 10.000000 * X + Y [ 0. 11. 22. 33. 44. 55. 66. 77.]
$
Host Code
C
#include <iris/iris.h>
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
int main(int argc, char** argv) {
iris_init(&argc, &argv, 1);
size_t SIZE;
int TARGET;
int VERBOSE;
float *X, *Y, *Z;
float A = 10;
int ERROR = 0;
SIZE = argc > 1 ? atol(argv[1]) : 8;
TARGET = argc > 2 ? atol(argv[2]) : 0;
VERBOSE = argc > 3 ? atol(argv[3]) : 1;
printf("[%s:%d] SIZE[%zu] TARGET[%d] VERBOSE[%d]\n", __FILE__, __LINE__, SIZE, TARGET, VERBOSE);
X = (float*) malloc(SIZE * sizeof(float));
Y = (float*) malloc(SIZE * sizeof(float));
Z = (float*) malloc(SIZE * sizeof(float));
if (VERBOSE) {
for (int i = 0; i < SIZE; i++) {
X[i] = i;
Y[i] = i;
}
printf("X [");
for (int i = 0; i < SIZE; i++) printf(" %2.0f.", X[i]);
printf("]\n");
printf("Y [");
for (int i = 0; i < SIZE; i++) printf(" %2.0f.", Y[i]);
printf("]\n");
}
iris_mem mem_X;
iris_mem mem_Y;
iris_mem mem_Z;
iris_mem_create(SIZE * sizeof(float), &mem_X);
iris_mem_create(SIZE * sizeof(float), &mem_Y);
iris_mem_create(SIZE * sizeof(float), &mem_Z);
iris_task task0;
iris_task_create(&task0);
iris_task_h2d_full(task0, mem_X, X);
iris_task_h2d_full(task0, mem_Y, Y);
void* saxpy_params[4] = { &mem_Z, &A, &mem_X, &mem_Y };
int saxpy_params_info[4] = { iris_w, sizeof(A), iris_r, iris_r };
iris_task_kernel(task0, "saxpy", 1, NULL, &SIZE, NULL, 4, saxpy_params, saxpy_params_info);
iris_task_d2h_full(task0, mem_Z, Z);
iris_task_submit(task0, TARGET, NULL, 1);
if (VERBOSE) {
for (int i = 0; i < SIZE; i++) {
//printf("[%8d] %8.1f = %4.0f * %8.1f + %8.1f\n", i, Z[i], A, X[i], Y[i]);
if (Z[i] != A * X[i] + Y[i]) ERROR++;
}
printf("S = %f * X + Y [", A);
for (int i = 0; i < SIZE; i++) printf(" %3.0f.", Z[i]);
printf("]\n");
}
iris_mem_release(mem_X);
iris_mem_release(mem_Y);
iris_mem_release(mem_Z);
free(X);
free(Y);
free(Z);
//iris_task_release(task0);
iris_finalize();
return 0;
}
C++
#include <iris/iris.hpp>
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
int main(int argc, char** argv) {
iris::Platform platform;
platform.init(&argc, &argv, 1);
size_t SIZE;
float *X, *Y, *Z;
float A = 10;
int ERROR = 0;
//int nteams = 8;
SIZE = argc > 1 ? atol(argv[1]) : 8;
//size_t chunk_size = SIZE / nteams;
X = (float*) malloc(SIZE * sizeof(float));
Y = (float*) malloc(SIZE * sizeof(float));
Z = (float*) malloc(SIZE * sizeof(float));
for (int i = 0; i < SIZE; i++) {
X[i] = i;
Y[i] = i;
}
printf("X [");
for (int i = 0; i < SIZE; i++) printf(" %2.0f.", X[i]);
printf("]\n");
printf("Y [");
for (int i = 0; i < SIZE; i++) printf(" %2.0f.", Y[i]);
printf("]\n");
#if 0
iris::Mem mem_X(SIZE * sizeof(float));
iris::Mem mem_Y(SIZE * sizeof(float));
iris::Mem mem_Z(SIZE * sizeof(float));
iris::Task task;
task.h2d_full(&mem_X, X);
task.h2d_full(&mem_Y, Y);
void* params0[4] = { &mem_Z, &A, &mem_X, &mem_Y };
int pinfo0[4] = { iris_w, sizeof(A), iris_r, iris_r };
task.kernel("saxpy", 1, NULL, &SIZE, NULL, 4, params0, pinfo0);
task.d2h_full(&mem_Z, Z);
task.submit(1, NULL, 1);
#else
iris::DMem mem_X(X, SIZE * sizeof(float));
iris::DMem mem_Y(Y, SIZE * sizeof(float));
iris::DMem mem_Z(Z, SIZE * sizeof(float));
iris::Task task;
void* params0[4] = { &mem_Z, &A, &mem_X, &mem_Y };
int pinfo0[4] = { iris_w, sizeof(A), iris_r, iris_r };
task.kernel("saxpy", 1, NULL, &SIZE, NULL, 4, params0, pinfo0);
task.flush_out(mem_Z);
task.submit(1, NULL, 1);
#endif
for (int i = 0; i < SIZE; i++) {
//printf("[%8d] %8.1f = %4.0f * %8.1f + %8.1f\n", i, Z[i], A, X[i], Y[i]);
if (Z[i] != A * X[i] + Y[i]) ERROR++;
}
printf("S = %f * X + Y [", A);
for (int i = 0; i < SIZE; i++) printf(" %3.0f.", Z[i]);
printf("]\n");
free(X);
free(Y);
free(Z);
platform.finalize();
return 0;
}
Fortran
PROGRAM SAXPY
USE IRIS
IMPLICIT NONE
INTEGER :: I, IERROR
INTEGER(8) :: SIZE
REAL(4),DIMENSION(:),ALLOCATABLE :: Z
REAL(4),DIMENSION(:),ALLOCATABLE :: X
REAL(4),DIMENSION(:),ALLOCATABLE :: Y
REAL(4) :: A
INTEGER(8) :: MEM_Z
INTEGER(8) :: MEM_X
INTEGER(8) :: MEM_Y
INTEGER(8),DIMENSION(3) :: OFF
INTEGER(8),DIMENSION(3) :: GWS
INTEGER(8),DIMENSION(3) :: LWS
INTEGER :: NPARAMS
INTEGER(8),DIMENSION(4) :: PARAMS
INTEGER,DIMENSION(4) :: PARAMS_INFO
INTEGER(8) :: TASK
CALL IRIS_INIT(.TRUE., IERROR)
IF (IERROR /= IRIS_SUCCESS) THEN
PRINT*, 'FAILED AT INIT'
ENDIF
SIZE = 8
ALLOCATE(Z(SIZE))
ALLOCATE(X(SIZE))
ALLOCATE(Y(SIZE))
A = 10.0
DO I = 1, SIZE
X(I) = I
Y(I) = I
ENDDO
DO I = 1, SIZE
PRINT*, 'X[', I, '] ', X(I)
ENDDO
PRINT*, '==='
DO I = 1, SIZE
PRINT*, 'Y[', I, '] ', Y(I)
ENDDO
CALL IRIS_MEM_CREATE(4 * SIZE, MEM_X, IERROR)
CALL IRIS_MEM_CREATE(4 * SIZE, MEM_Y, IERROR)
CALL IRIS_MEM_CREATE(4 * SIZE, MEM_Z, IERROR)
CALL IRIS_TASK_CREATE(TASK, IERROR)
OFF(1) = 0
GWS(1) = SIZE
LWS(1) = SIZE
NPARAMS = 4
PARAMS = (/ MEM_Z, TRANSFER(A, TASK), MEM_X, MEM_Y /)
PARAMS_INFO = (/ IRIS_RW, 4, IRIS_R, IRIS_R /)
CALL IRIS_TASK_H2D_FULL(TASK, MEM_X, X, IERROR)
CALL IRIS_TASK_H2D_FULL(TASK, MEM_Y, Y, IERROR)
CALL IRIS_TASK_KERNEL(TASK, "saxpy", 1, OFF, GWS, LWS, &
NPARAMS, PARAMS, PARAMS_INFO, IERROR)
CALL IRIS_TASK_D2H_FULL(TASK, MEM_Z, Z, IERROR)
CALL IRIS_TASK_SUBMIT(TASK, IRIS_GPU, .TRUE., IERROR)
DO I = 1, SIZE
PRINT*, 'Z[', I, '] ', Z(I)
ENDDO
DEALLOCATE(X)
DEALLOCATE(Y)
DEALLOCATE(Z)
CALL IRIS_FINALIZE(IERROR)
END PROGRAM SAXPY
Python
#!/usr/bin/env python3
import iris
import numpy as np
import sys
iris.init()
SIZE = 8 if len(sys.argv) == 1 else int(sys.argv[1])
A = 10.0
x = np.arange(SIZE, dtype=np.float32)
y = np.arange(SIZE, dtype=np.float32)
s = np.arange(SIZE, dtype=np.float32)
print('X', x)
print('Y', y)
old_way = True
disable = True
if not disable:
if old_way:
mem_x = iris.mem(x.nbytes)
mem_y = iris.mem(y.nbytes)
mem_s = iris.mem(s.nbytes)
task = iris.task()
task.h2d_full(mem_x, x)
task.h2d_full(mem_y, y)
task.kernel("saxpy", 1, [], [SIZE], [], [mem_s, A, mem_x, mem_y] , [iris.iris_w, 4, iris.iris_r, iris.iris_r] )
task.d2h_full(mem_s, s)
task.submit(iris.iris_gpu)
else:
# New DMEM way
task = iris.task("saxpy", 1, [], [SIZE], [], [
(s, iris.iris_w, iris.iris_flush),
A,
(x, iris.iris_r),
(y, iris.iris_r)
])
task.submit(iris.iris_gpu)
print('S =', A, '* X + Y', s)
s0 = np.arange(SIZE, dtype=np.float32)
s1 = np.arange(SIZE, dtype=np.float32)
mem_x = iris.dmem(x)
mem_y = iris.dmem(y)
#mem_s0 = iris.dmem(s0)
mem_s0 = iris.dmem_null(s0.nbytes)
mem_s1 = iris.dmem(s1)
task0 = iris.task("saxpy", 1, [], [SIZE], [], [
(mem_s0, iris.iris_w),
A,
(mem_x, iris.iris_r),
(mem_y, iris.iris_r)
])
task1 = iris.task("saxpy", 1, [], [SIZE], [], [
(mem_s1, iris.iris_w, iris.iris_flush),
A,
(mem_s0, iris.iris_r),
(mem_s0, iris.iris_r)
])
task1.depends(task0)
#task0.submit(iris.iris_gpu)
#task1.submit(iris.iris_gpu)
#print('S =', A, '* X + Y', x, y, s0)
#print('S =', A, '* X + Y', s0, s0, s1)
x = x*100
mem_x.update(x)
graph = iris.graph([task0, task1])
graph.submit()
graph.wait()
print('S =', A, '* X + Y', x, y, s0)
print('S =', A, '* X + Y', s0, s0, s1)
x = x*100
mem_x.update(x)
graph.submit()
graph.wait()
ntasks, tasks = graph.get_tasks()
print('S =', A, '* X + Y', x, y, s0)
print('S =', A, '* X + Y', s0, s0, s1)
iris.finalize()
Kernels
CUDA
extern "C" __global__ void saxpy(float* Z, float A, float* X, float* Y) {
size_t id = blockIdx.x * blockDim.x + threadIdx.x;
Z[id] = A * X[id] + Y[id];
}
HIP
#include <hip/hip_runtime.h>
extern "C" __global__ void saxpy(float* Z, float A, float* X, float* Y) {
size_t id = blockIdx.x * blockDim.x + threadIdx.x;
Z[id] = A * X[id] + Y[id];
}
OpenCL
__kernel void saxpy(__global float* restrict Z, float A, __global float* restrict X, __global float* restrict Y) {
size_t id = get_global_id(0);
Z[id] = A * X[id] + Y[id];
}
OpenMP
#include <iris/iris_openmp.h>
static void saxpy(float* Z, float A, float* X, float* Y, IRIS_OPENMP_KERNEL_ARGS) {
size_t i;
#pragma omp parallel for shared(Z, A, X, Y) private(i)
IRIS_OPENMP_KERNEL_BEGIN(i)
Z[i] = A * X[i] + Y[i];
IRIS_OPENMP_KERNEL_END
}
Data Memory
One of the major benefits of using IRIS is its “data memory” feature, which automatically manage data movement independent of the device scheduling. Here is an example of the use of data memory during a vector addition code. Note how the:
iris_data_mem_create(&mem_A, A, SIZE * sizeof(int));
...
iris_task_dmem_flush_out(task0,mem_C);
call differs from the SAXPY example above. We no longer need iris_task_h2d_full
and iris_task_d2h_full
calls, instead, we only need to know when to flush the final memory transfer required by the host. This is a simpler work-flow that the conventional explicit memory movement approach.
Running
$ cd iris/apps/vecadd
$ make
$ ./vecadd-iris
Host Code
C++
#include <iris/iris.h>
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
int main(int argc, char** argv) {
size_t SIZE;
int *A, *B, *C;
int ERROR = 0;
iris_init(&argc, &argv, true);
SIZE = argc > 1 ? atol(argv[1]) : 16;
printf("SIZE[%lu]\n", SIZE);
A = (int*) valloc(SIZE * sizeof(int));
B = (int*) valloc(SIZE * sizeof(int));
C = (int*) valloc(SIZE * sizeof(int));
for (int i = 0; i < SIZE; i++) {
A[i] = i;
B[i] = i;
C[i] = 0;
}
iris_mem mem_A;
iris_mem mem_B;
iris_mem mem_C;
iris_data_mem_create(&mem_A, A, SIZE * sizeof(int));
iris_data_mem_create(&mem_B, B, SIZE * sizeof(int));
iris_data_mem_create(&mem_C, C, SIZE * sizeof(int));
iris_task task0;
iris_task_create(&task0);
void* params0[3] = { &mem_A, &mem_B, &mem_C };
int pinfo0[3] = { iris_r, iris_r, iris_w };
iris_task_kernel(task0, "vecadd", 1, NULL, &SIZE, NULL, 3, params0, pinfo0);
iris_task_dmem_flush_out(task0,mem_C);
iris_task_submit(task0, iris_any,nullptr, true);
iris_synchronize();
for (int i = 0; i < SIZE; i++) {
printf("C[%d] = %d\n", i, C[i]);
if (C[i] != (A[i] + B[i])) ERROR++;
}
iris_finalize();
printf("ERROR[%d]\n", ERROR+iris_error_count());
return ERROR+iris_error_count();
}
Kernels
CUDA
extern "C" __global__ void vecadd(int* A, int* B, int* C) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
C[id] = A[id] + B[id];
}
HIP
#include <hip/hip_runtime.h>
extern "C" __global__ void vecadd(int* A, int* B, int* C) {
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
C[i] = A[i] + B[i];
}
OpenCL
__kernel void vecadd(__global int* restrict A, __global int* restrict B, __global int* restrict C) {
size_t id = get_global_id(0);
C[id] = A[id] + B[id];
}
OpenMP
#include <iris/iris_openmp.h>
static void vecadd(int* A, int* B, int* C, IRIS_OPENMP_KERNEL_ARGS) {
int i;
#pragma omp parallel for shared(C, A, B) private(i)
IRIS_OPENMP_KERNEL_BEGIN(i)
C[i] = A[i] + B[i];
IRIS_OPENMP_KERNEL_END
}
Device Selection
IRIS opportunistically attempts to use all available devices and backends, it resolves task names to function names in the corresponding kernel binaries. It allows device selection to be set both at compile and at runtime.
Compile Time
The user can submit the device target(s) for when the task is submitted:
iris_task_submit(iris_task task, int device, const char* opt, int sync);
This task submission includes information about the task, such as a hint, target device parameter, synchronization mode (blocking or non-blocking), and policy selector that indicates where the task should be executed.
The device
is the device submission policy. The complete list of available targets are:
Device Policy |
About |
---|---|
iris_cpu |
Submit the task to a CPU device |
iris_gpu |
Submit the task to any GPU device |
iris_fpga |
Submit the task to any FPGA (currently Intel and Xilinx) |
iris_dsp |
Submit the task to any DSP device (currently Hexagon) |
iris_nvidia |
Submit the task to an NVIDIA GPU device |
iris_amd |
Submit the task to an AMD GPU device |
iris_gpu_intel |
Submit the task to an Intel GPU device |
iris_phi |
Submit the task to an Intel Xeon Phi device |
We can also submit tasks according to a scheduling policy:
Scheduling Policy |
About |
---|---|
iris_default |
Use the first device |
iris_roundrobin |
Submit this task in a round-robin (cyclic) way, for equal work sharing |
iris_depend |
Submit this task to a device that has been assigned its dependent |
iris_data |
Submit task to device to minimize data movement |
iris_profile |
Submit the task to the device based on execution time history |
iris_random |
Randomly assign this task to any of the available devices |
iris_pending |
Delay submitting the task until the memory it depends on has been assigned, then use that device |
iris_any |
Submit task to the device with the fewest assigned tasks |
iris_all |
Submit the task to all device queues, the device that accesses it first has exclusive execution (it is removed from the other device queues) |
iris_custom |
Submit the task based on a used provided, custom policy |
The opt
parameter is for iris_custom
policies.
Runtime
We can also filter out devices at runtime by setting the IRIS_ARCHS
environment variable. Modifying the selection of backends to instantiate allows dynamic device targets—without requiring recompilation. All current options are:hip
,``cuda``,``opencl``, and openmp
. An example of only allowing execution on openmp
and hip
devices would then be:
$ export IRIS_ARCHS=hip,openmp
$ ./helloworld
HELLO WORLD
$