Source code for iris

import ctypes
from ctypes import *
import sys
import numpy as np
import pdb
import os
import struct

[docs]class CommData3D(Structure): _fields_ = [ ("from_id", c_uint), ("to_id", c_uint), ("mem_id", c_uint), ("size", c_size_t) ] def __repr__(self): return f'CommData3D(from_id={self.from_id}, to_id={self.to_id}, mem_id={self.mem_id}, size={self.size})' def __str__(self): return f"from_id={self.from_id}, to_id={self.to_id}, mem_id={self.mem_id}, size={self.size}"
[docs]class TaskProfile(Structure): _fields_ = [ ("task", c_uint), ("device", c_uint), ("start", c_double), ("end", c_double) ] def __str__(self): return f"task={self.task}, device={self.device}, start={self.start}, end={self.end}" def __repr__(self) -> str: return f"TaskProfile(task={self.task}, device={self.device}, start={self.start}, end={self.end})"
[docs]class DataObjectProfile(Structure): _fields_ = [ ("task", c_uint), ("data_object", c_uint), ("datatransfer_type", c_uint), ("from_device", c_uint), ("device", c_uint), ("start", c_double), ("end", c_double) ] def __str__(self): return f"task={self.task}, data_object={self.data_object}, datatransfer_type={self.datatransfer_type}, from_device={self.from_device}, device={self.device}, start={self.start}, end={self.end}" def __repr__(self) -> str: return f"DataObjectProfile(task={self.task}, data_object={self.data_object}, datatransfer_type={self.datatransfer_type}, from_device={self.from_device}, device={self.device}, start={self.start}, end={self.end})"
[docs]class library(CDLL): def __init__(self, library_name, read_symbols=False): super().__init__(library_name, mode=RTLD_GLOBAL) def convert_c_pointer_to_numpy(self, cptr, shape, ct=ctypes.c_double): U = np.ctypeslib.as_array(ctypes.cast(cptr, ctypes.POINTER(ct)), shape=shape) return U def call_ret_ptr(self, fn_name, *args): conv_args = [] for each_arg in args: conv_args.append(convert_obj_ctype(each_arg)[0]) if type(fn_name) == str: fn_name = self.__getitem__(fn_name) fn_name.restype = POINTER(c_void_p) r = fn_name(*conv_args) return r def call_ret(self, fn_name, restype, *args): conv_args = [] for each_arg in args: conv_args.append(convert_obj_ctype(each_arg)[0]) if type(fn_name) == str: fn_name = self.__getitem__(fn_name) fn_name.restype = restype return fn_name(*conv_args) def call(self, fn_name, *args): conv_args = [] for each_arg in args: conv_args.append(convert_obj_ctype(each_arg)[0]) if type(fn_name) == str: fn_name = self.__getitem__(fn_name) fn_name(*conv_args)
[docs]class IRIS(library): def __init__(self): # Do not error on import when Iris is not built. This is needed for Read the Docs to create the API documentation. try: super(IRIS, self).__init__("libiris.so") except OSError as e: print(e) print('libiris.so not found. Please install IRIS library.', file=sys.stderr) def get_numpy_size(self, size): if type(size) != np.ndarray: size = np.array(size) return size def alloc_random_size_t(self, size, init=np.int64(0)): size = self.get_numpy_size(size) total_size = size.prod() c_ptr = dll.call_ret_ptr(dll.iris_allocate_random_array_size_t, np.int32(total_size), init) np_data = dll.convert_c_pointer_to_numpy(c_ptr, size, ctypes.c_size_t) return np_data, c_ptr def alloc_random_float(self, size, init=np.float32(0.0)): size = self.get_numpy_size(size) total_size = size.prod() c_ptr = dll.call_ret_ptr(dll.iris_allocate_random_array_float, np.int32(total_size), init) np_data = dll.convert_c_pointer_to_numpy(c_ptr, size, ctypes.c_float) return np_data, c_ptr def alloc_random_double(self, size, init=np.float64(0.0)): size = self.get_numpy_size(size) total_size = size.prod() c_ptr = dll.call_ret_ptr(dll.iris_allocate_random_array_double, np.int32(total_size), init) np_data = dll.convert_c_pointer_to_numpy(c_ptr, size, ctypes.c_double) return np_data, c_ptr def alloc_random_int64(self, size, init=np.int64(0)): size = self.get_numpy_size(size) total_size = size.prod() c_ptr = dll.call_ret_ptr(dll.iris_allocate_random_array_int64_t, np.int32(total_size), init) np_data = dll.convert_c_pointer_to_numpy(c_ptr, size, ctypes.c_int64) return np_data, c_ptr def alloc_random_int32(self, size, init=np.int32(0)): size = self.get_numpy_size(size) total_size = size.prod() c_ptr = dll.call_ret_ptr(dll.iris_allocate_random_array_int32_t, np.int32(total_size), init) np_data = dll.convert_c_pointer_to_numpy(c_ptr, size, ctypes.c_int32) return np_data, c_ptr def alloc_random_int16(self, size, init=np.int16(0)): size = self.get_numpy_size(size) total_size = size.prod() c_ptr = dll.call_ret_ptr(dll.iris_allocate_random_array_int16_t, np.int32(total_size), init) np_data = dll.convert_c_pointer_to_numpy(c_ptr, size, ctypes.c_short) return np_data, c_ptr def alloc_random_int8(self, size, init=np.int8(0)): size = self.get_numpy_size(size) total_size = size.prod() c_ptr = dll.call_ret_ptr(dll.iris_allocate_random_array_int8_t, np.int32(total_size), init) np_data = dll.convert_c_pointer_to_numpy(c_ptr, size, ctypes.c_byte) return np_data, c_ptr def alloc_size_t(self, size, init=np.int64(0)): size = self.get_numpy_size(size) total_size = size.prod() c_ptr = dll.call_ret_ptr(dll.iris_allocate_array_size_t, np.int32(total_size), init) np_data = dll.convert_c_pointer_to_numpy(c_ptr, size, ctypes.c_size_t) return np_data, c_ptr def alloc_float(self, size, init=np.float32(0.0)): size = self.get_numpy_size(size) total_size = size.prod() c_ptr = dll.call_ret_ptr(dll.iris_allocate_array_float, np.int32(total_size), init) np_data = dll.convert_c_pointer_to_numpy(c_ptr, size, ctypes.c_float) return np_data, c_ptr def alloc_double(self, size, init=np.float64(0.0)): size = self.get_numpy_size(size) total_size = size.prod() c_ptr = dll.call_ret_ptr(dll.iris_allocate_array_double, np.int32(total_size), init) np_data = dll.convert_c_pointer_to_numpy(c_ptr, size, ctypes.c_double) return np_data, c_ptr def alloc_int64(self, size, init=np.int64(0)): size = self.get_numpy_size(size) total_size = size.prod() c_ptr = dll.call_ret_ptr(dll.iris_allocate_array_int64_t, np.int32(total_size), init) np_data = dll.convert_c_pointer_to_numpy(c_ptr, size, ctypes.c_int64) return np_data, c_ptr def alloc_int32(self, size, init=np.int32(0)): size = self.get_numpy_size(size) total_size = size.prod() c_ptr = dll.call_ret_ptr(dll.iris_allocate_array_int32_t, np.int32(total_size), init) np_data = dll.convert_c_pointer_to_numpy(c_ptr, size, ctypes.c_int32) return np_data, c_ptr def alloc_int16(self, size, init=np.int16(0)): size = self.get_numpy_size(size) total_size = size.prod() c_ptr = dll.call_ret_ptr(dll.iris_allocate_array_int16_t, np.int32(total_size), init) np_data = dll.convert_c_pointer_to_numpy(c_ptr, size, ctypes.c_short) return np_data, c_ptr def alloc_int8(self, size, init=np.int8(0)): size = self.get_numpy_size(size) total_size = size.prod() c_ptr = dll.call_ret_ptr(dll.iris_allocate_array_int8_t, np.int32(total_size), init) np_data = dll.convert_c_pointer_to_numpy(c_ptr, size, ctypes.c_byte) return np_data, c_ptr def free(self, c_ptr): dll.call(dll.iris_free_array, c_ptr)
dll = IRIS() def call(fn_name, *args): dll.call(fn_name, *args) def call_ret_ptr(fn_name, *args): return dll.call_ret_ptr(fn_name, *args) iris_ok = 0 iris_err = -1 iris_default = (1 << 5) iris_cpu = (1 << 6) iris_nvidia = (1 << 7) iris_amd = (1 << 8) iris_gpu_intel = (1 << 9) iris_gpu = (iris_nvidia | iris_amd | iris_gpu_intel) iris_phi = (1 << 10) iris_fpga = (1 << 11) iris_hexagon = (1 << 12) iris_dsp = (iris_hexagon) iris_roundrobin = (1 << 18) iris_depend = (1 << 19) iris_data = (1 << 20) iris_profile = (1 << 21) iris_random = (1 << 22) iris_pending = (1 << 23) iris_any = (1 << 24) iris_all = (1 << 25) iris_custom = (1 << 26) iris_r = -1 iris_w = -2 iris_rw = -3 iris_flush = -4 iris_int = (1 << 0) iris_long = (1 << 1) iris_float = (1 << 2) iris_double = (1 << 3) iris_normal = (1 << 10) iris_reduction = (1 << 11) iris_sum = ((1 << 12) | iris_reduction) iris_max = ((1 << 13) | iris_reduction) iris_min = ((1 << 14) | iris_reduction) iris_platform = 0x1001 iris_vendor = 0x1002 iris_name = 0x1003 iris_type = 0x1004 iris_dt_h2d = 1 iris_dt_h2o = 2 iris_dt_d2o = 3 iris_dt_d2h = 4 iris_dt_d2d = 5 iris_dt_d2h_h2d = 6 iris_dt_error = 0 IRIS_MEM = 0x1 IRIS_DMEM = 0x2 IRIS_DMEM_REGION = 0x4
[docs]class iris_kernel(Structure): _fields_ = [("class_obj", c_void_p), ("uid", c_ulong)]
[docs]class iris_mem(Structure): _fields_ = [("class_obj", c_void_p), ("uid", c_ulong)]
[docs]class iris_task(Structure): _fields_ = [("class_obj", c_void_p), ("uid", c_ulong)]
[docs]class iris_graph(Structure): _fields_ = [("class_obj", c_void_p), ("uid", c_ulong)]
def init(sync = 1): return dll.iris_init(0, None, c_int(sync)) def set_enable_profiler(flag=True): dll.iris_set_enable_profiler(int(flag)) def register_pin_memory(cptr, size): dll.iris_register_pin_memory(convert_obj_ctype(cptr)[0], convert_obj_ctype(size_t(size))[0]) def finalize(): return dll.iris_finalize() def synchronize(): return dll.iris_synchronize() def platform_count(): i = c_int() dll.iris_platform_count(byref(i)) return i.value def platform_info(platform, param): s = (c_char * 64)() dll.iris_platform_info(c_int(platform), c_int(param), s, None) return s.value def all_platform_info(): platform_names = [] for i in range(platform_count()): plt = platform_info(i, iris_name) platform_names.append(plt) return platform_names def device_count(): i = c_int() dll.iris_device_count(byref(i)) return i.value def device_info(device, param): s = (c_char * 64)() dll.iris_device_info(c_int(device), c_int(param), s, None) return s.value def calibrate_communication_cost_matrix(data_size, iterations=1, pin_memory=True, units='B'): print("Calibrate communication cost matrix from IRIS platform") ndevs = device_count()+1 data_np = np.zeros(ndevs*ndevs).astype(np.double) data_np_cp = convert_obj_ctype(data_np)[0] iterations = np.int32(iterations) dll.iris_calibrate_communication_cost(data_np_cp, convert_obj_ctype(size_t(data_size))[0], convert_obj_ctype(iterations)[0], convert_obj_ctype(np.int32(pin_memory))[0]) np.seterr(divide='ignore') out_data_np = data_size / data_np.reshape((ndevs, ndevs)) if units == 'KB': out_data_np = out_data_np / 1000 elif units == 'MB': out_data_np = out_data_np / 1000000 elif units == 'GB': out_data_np = out_data_np / 1000000000 elif units == 'TB': out_data_np = out_data_np / 1000000000000 elif units == 'PB': out_data_np = out_data_np / 1000000000000000 return out_data_np def all_device_info(): dev_names = [] for i in range(device_count()): dev = device_info(i, iris_name) dev_names.append(dev) return dev_names def mem_create(size): m = iris_mem() dll.iris_mem_create(c_size_t(size), byref(m)) return m def dmem_create(*args): if len(args) == 1: host = args[0][0] m = iris_mem() size = host.nbytes c_host = host.ctypes.data_as(c_void_p) dll.iris_data_mem_create(byref(m), c_host, c_size_t(size)) return m elif len(args) == 2: (root_mem, host) = args m = iris_mem() dll.iris_data_mem_enable_outer_dim_regions(m) dll.iris_data_mem_create_region(byref(m), root_mem, c_int(region)) return m else: (host, off, host_size, dev_size, elem_size, dim) = args m = iris_mem() c_host = host.ctypes.data_as(c_void_p) coff = (c_size_t * dim)(*off) chost_size = (c_size_t * dim)(*host_size) cdev_size = (c_size_t * dim)(*dev_size) size = host.nbytes dll.iris_data_mem_create_tile(byref(m), c_host, coff, chost_size, cdev_size, c_size_t(elem_size), c_int(dim)) return m def mem_reduce(mem, mode, type): return dll.iris_mem_reduce(mem, c_int(mode), c_int(type)) def mem_release(mem): return dll.iris_mem_release(mem) def kernel_create(name): k = iris_kernel() c_name = c_void_p(0) if name != None: c_name = c_char_p(name) if sys.version_info[0] == 2 else c_char_p(bytes(name, 'ascii')) dll.iris_kernel_create(c_name, byref(k)) return k def kernel_setarg(kernel, idx, size, value): if type(value) == int: cvalue = byref(c_int(value)) elif type(value) == float and size == 4: cvalue = byref(c_float(value)) elif type(value) == float and size == 8: cvalue = byref(c_double(value)) return dll.iris_kernel_setarg(kernel, c_int(idx), c_size_t(size), cvalue) def kernel_setmem(kernel, idx, mem, mode): return dll.iris_kernel_setmem(kernel, c_int(idx), mem, c_size_t(mode)) def kernel_release(kernel): return dll.iris_kernel_release(kernel) def graph_create(name = None): t = iris_graph() dll.iris_graph_create(byref(t)) return t def task_create(name = None): t = iris_task() c_name = c_void_p(0) if name != None: c_name = c_char_p(name) if sys.version_info[0] == 2 else c_char_p(bytes(name, 'ascii')) dll.iris_task_create_name(c_name, byref(t)) return t def task_depend(task, ntasks, tasks): return dll.iris_task_depend(task, c_int(ntasks), (iris_task * len(tasks))(*tasks)) def task_kernel(task, kernel, dim, off, gws, lws, params, params_info, hold_params): coff = (c_size_t * dim)(*off) cgws = (c_size_t * dim)(*gws) clws = (c_size_t * dim)(*lws) nparams = len(params) cparams = (c_void_p * nparams)() hold_params.clear() for i in range(nparams): if hasattr(params[i], 'handle') and isinstance(params[i].handle, iris_mem): cparams[i] = params[i].handle.class_obj elif isinstance(params[i], int) and params_info[i] == 4: p = byref(c_int(params[i])) hold_params.append(p) cparams[i] = cast(p, c_void_p) elif isinstance(params[i], float) and params_info[i] == 4: p = byref(c_float(params[i])) cparams[i] = cast(p, c_void_p) hold_params.append(p) elif isinstance(params[i], float) and params_info[i] == 8: p = byref(c_double(params[i])) cparams[i] = cast(p, c_void_p) hold_params.append(p) else: print("error") cparams_info = (c_int * nparams)(*params_info) kernel_name = c_char_p(kernel) if sys.version_info[0] == 2 else c_char_p(bytes(kernel, 'ascii')) return dll.iris_task_kernel(task, kernel_name, c_int(dim), coff, cgws, clws, c_int(nparams), cparams, cparams_info) def task_host(task, func, params): CWRAPPER = CFUNCTYPE(c_int, c_void_p, POINTER(c_int)) wrapped_py_func = CWRAPPER(func) nparams = len(params) cparams = (c_void_p * nparams)(*params) return dll.iris_task_host(task, cast(wrapped_py_func, c_void_p), cparams) def task_h2d(task, mem, off, size, host): return dll.iris_task_h2d(task, mem, c_size_t(off), c_size_t(size), host.ctypes.data_as(c_void_p)) def task_params_map(task, params_map_list): nparams = len(params_map_list) c_params_map_list = (c_int * nparams)(*params_map_list) return dll.iris_params_map(task, c_params_map_list) def task_d2h(task, mem, off, size, host): return dll.iris_task_d2h(task, mem, c_size_t(off), c_size_t(size), host.ctypes.data_as(c_void_p)) def task_h2d_full(task, mem, host): return dll.iris_task_h2d_full(task, mem, host.ctypes.data_as(c_void_p)) def task_d2h_full(task, mem, host): return dll.iris_task_d2h_full(task, mem, host.ctypes.data_as(c_void_p)) def task_submit(task, device, sync): return dll.iris_task_submit(task, c_int(device), None, c_int(sync)) def task_wait(task): return dll.iris_task_wait(task) def task_wait_all(ntask, tasks): return dll.iris_task_wait_all(c_int(ntask), (iris_task * len(tasks))(*tasks)) def task_add_subtask(task, subtask): return dll.iris_task_add_subtask(task, subtask) def task_release(task): return dll.iris_task_release(task) def task_release_mem(task, mem): return dll.iris_task_release_mem(task, mem) def timer_now(): d = c_double() dll.iris_timer_now(byref(d)) return d.value mem_handle_2_pobj = {} class dmem_null: def __init__(self, size): m = iris_mem() c_host = c_void_p(0) dll.iris_data_mem_create(byref(m), c_host, c_size_t(size)) self.handle = m mem_handle_2_pobj[self.handle.class_obj] = self def size(self): dll.iris_mem_get_size.restype = c_size_t return dll.iris_mem_get_size(self.handle) def update(self, host): c_host = host.ctypes.data_as(c_void_p) dll.iris_data_mem_update(self.handle, c_host) class dmem: def __init__(self, *args): self.handle = dmem_create(args) self.type = IRIS_DMEM mem_handle_2_pobj[self.handle.class_obj] = self def is_reset(self): dll.iris_mem_is_reset.restype = c_int return dll.iris_mem_is_reset(self.handle) def uid(self): dll.iris_mem_get_uid.restype = c_ulong return dll.iris_mem_get_uid(self.handle) def size(self): dll.iris_mem_get_size.restype = c_size_t return dll.iris_mem_get_size(self.handle) def update(self, host): c_host = host.ctypes.data_as(c_void_p) dll.iris_data_mem_update(self.handle, c_host) def get_region_uids(self): n_regions = dll.call_ret(dll.iris_data_mem_n_regions(self.handle), np.int32) output = [] for i in range(n_regions): output.append(dll.call_ret(dll.iris_data_mem_get_region_uid, np.uint32, self.handle, np.int32(i))) return output class dmem_region: def __init__(self, *args): self.handle = dmem_create(args) self.type = IRIS_DMEM_REGION self.fetch_dmem() def fetch_dmem(self, handle=None): if handle == None: handle = self.handle mem_handle_2_pobj[handle.class_obj] = self dll.iris_get_dmem_for_region.restype = iris_mem mem_obj = dll.iris_get_dmem_for_region(handle) if mem_obj.class_obj not in mem_handle_2_pobj: pmem = dmem.__new__(dmem) pmem.type = IRIS_DMEM pmem.handle = mem_obj mem_handle_2_pobj[mem_obj.class_obj] = pmem pmem = mem_handle_2_pobj[mem_obj.class_obj] uid = pmem.uid() self.mem = pmem def is_reset(self): dll.iris_mem_is_reset.restype = c_int return dll.iris_mem_is_reset(self.handle) def uid(self): dll.iris_mem_get_uid.restype = c_ulong return dll.iris_mem_get_uid(self.handle) def size(self): dll.iris_mem_get_size.restype = c_size_t return dll.iris_mem_get_size(self.handle) def update(self, host): c_host = host.ctypes.data_as(c_void_p) dll.iris_data_mem_update(self.handle, c_host) def mem(self): return self.mem class mem: def __init__(self, size): self.handle = mem_create(size) self.type = IRIS_MEM mem_handle_2_pobj[self.handle.class_obj] = self def is_reset(self): dll.iris_mem_is_reset.restype = c_int return dll.iris_mem_is_reset(self.handle) def uid(self): dll.iris_mem_get_uid.restype = c_ulong return dll.iris_mem_get_uid(self.handle) def size(self): dll.iris_mem_get_size.restype = c_size_t return dll.iris_mem_get_size(self.handle) def reduce(self, mode, type): mem_reduce(self.handle, mode, type) def release(self): mem_release(self.handle) kernel_handle_2_pobj = {} class kernel: def __init__(self, name): self.handle = kernel_create(name) kernel_handle_2_pobj[self.handle.class_obj] = self def name(self): dll.iris_kernel_get_name.restype = c_char_p kname = dll.iris_kernel_get_name(self.handle) return kname.decode('ascii') def uid(self): dll.iris_kernel_get_uid.restype = c_ulong return dll.iris_kernel_get_uid(self.handle) def setarg(self, idx, size, value): kernel_setarg(self.handle, idx, size, value) def setint(self, idx, value): kernel_setarg(self.handle, idx, 4, value) def setfloat(self, idx, value): kernel_setarg(self.handle, idx, 8, value) def setdouble(self, idx, value): kernel_setarg(self.handle, idx, 8, value) def setmem(self, idx, mem, mode): kernel_setmem(self.handle, idx, mem.handle, mode) def release(self): kernel_release(self.handle) class size_t: def __init__(self, value): self.value = value def get_scalar_conv(pobj): if isinstance(pobj, int) or type(pobj) == np.int32: return c_int(pobj) if type(pobj) == np.int64: return c_int64(pobj) if type(pobj) == np.double or type(pobj) == np.float64: return c_double(pobj) if isinstance(pobj, float) or type(pobj) == np.single or type(pobj) == np.float32: return c_float(pobj) if isinstance(pobj, bool): return c_bool(pobj) if type(pobj) == np.byte: return c_byte(pobj) if type(pobj) == np.ubyte: return c_ubyte(pobj) if type(pobj) == np.short: return c_short(pobj) if type(pobj) == np.ushort: return c_ushort(pobj) if type(pobj) == np.intc: return c_int(pobj) if type(pobj) == np.uintc: return c_uint(pobj) if type(pobj) == np.int_: return c_long(pobj) if type(pobj) == np.uint: return c_ulong(pobj) if type(pobj) == np.longlong: return c_longlong(pobj) if type(pobj) == np.ulonglong: return c_ulonglong(pobj) if type(pobj) == np.longdouble: return c_longdouble(pobj) if type(pobj) == str: cname = c_char_p(pobj) if sys.version_info[0] == 2 else c_char_p(bytes(pobj, 'ascii')) return cname return c_int(pobj) def convert_obj_ctype(pobj): if type(pobj) != np.ndarray and pobj == None: cobj = c_void_p(0) return cobj, sizeof(cobj) elif np.isscalar(pobj): cobj = get_scalar_conv(pobj) return cobj, sizeof(cobj) elif type(pobj) == size_t: return c_size_t(pobj.value), sizeof(c_size_t) elif type(pobj) == np.ndarray and pobj.size > 1 and type(pobj[0]) == np.str_: str_list = [] for s in str_list: c_str = c_char_p(s) if sys.version_info[0] == 2 else c_char_p(bytes(s, 'ascii')) s.append(c_str) s = np.array(str_list) cobj = s.ctypes.data_as(c_void_p) return cobj, s.nbytes elif type(pobj) == np.ndarray: cobj = pobj.ctypes.data_as(c_void_p) return cobj, pobj.nbytes elif type(pobj) == graph: return pobj.handle, sizeof(pobj.handle) elif type(pobj) == task: return pobj.handle, sizeof(pobj.handle) else: # Expecting user already defined its type c_host = pobj c_size = sizeof(pobj) return c_host, c_size class kernel_arg: def __init__(self, is_mem, size, value, mem_obj, mem_off, mem_size, off, mode): self.is_mem = is_mem self.size = size self.value = None if not is_mem: self.value = cast(value, ctypes.POINTER(ctypes.c_ubyte * self.size)) pmem = None if is_mem and mem_obj.class_obj not in mem_handle_2_pobj: dll.iris_mem_get_type.restype = c_int mem_type = dll.iris_mem_get_type(mem_obj) if mem_type == IRIS_DMEM_REGION: pmem = dmem_region.__new__(dmem_region) pmem.type = IRIS_DMEM_REGION elif mem_type == IRIS_DMEM: pmem = dmem.__new__(dmem) pmem.type = IRIS_DMEM else: pmem = mem.__new__(mem) pmem.type = IRIS_MEM pmem.handle = mem_obj mem_handle_2_pobj[mem_obj.class_obj] = pmem if pmem.type == IRIS_DMEM_REGION: pmem.fetch_dmem() self.mem = None if is_mem: self.mem = mem_handle_2_pobj[mem_obj.class_obj] self.mem_off = mem_off self.mem_size = mem_size self.off = off self.mode = mode def is_mode_r(self): return self.mode == iris_r def is_mode_rw(self): return self.mode == iris_rw def is_mode_w(self): return self.mode == iris_w def is_input(self): return self.is_mode_r() or self.is_mode_rw() def is_output(self): return self.is_mode_w() or self.is_mode_rw() def param_value(self, pack_type='f'): [ value ] = struct.unpack(pack_type, bytes(self.value.contents)) return value def display(self): print(self.is_mem, self.size, self.value, self.mem, self.mem_off, self.mem_size, self.off, self.mode) class cmd_kernel: def __init__(self, ckernel): self.ckernel = ckernel def name(self): dll.iris_kernel_get_name.restype = c_char_p kname = dll.iris_kernel_get_name(self.handle) return kname.decode('ascii') def get_kernel_nargs(self): dll.iris_cmd_kernel_get_nargs.restype = c_int return dll.iris_cmd_kernel_get_nargs(self.ckernel) def get_kernel_args(self): nargs = self.get_kernel_nargs() tasks = (POINTER(c_void_p) * nargs)() args = [] dll.iris_cmd_kernel_get_arg_mode.restype = c_int dll.iris_cmd_kernel_get_arg_size.restype = c_size_t dll.iris_cmd_kernel_get_arg_mem_off.restype = c_size_t dll.iris_cmd_kernel_get_arg_mem_size.restype = c_size_t dll.iris_cmd_kernel_get_arg_off.restype = c_size_t dll.iris_cmd_kernel_get_arg_mem.restype = iris_mem dll.iris_cmd_kernel_get_arg_value.restype = POINTER(c_ubyte) for i in range(nargs): is_mem = dll.iris_cmd_kernel_get_arg_is_mem(self.ckernel, i) mode = dll.iris_cmd_kernel_get_arg_mode(self.ckernel, i) size = dll.iris_cmd_kernel_get_arg_size(self.ckernel, i) mem_off = dll.iris_cmd_kernel_get_arg_mem_off(self.ckernel, i) mem_size = dll.iris_cmd_kernel_get_arg_mem_size(self.ckernel, i) off = dll.iris_cmd_kernel_get_arg_off(self.ckernel, i) mem_obj = dll.iris_cmd_kernel_get_arg_mem(self.ckernel, i) value = dll.iris_cmd_kernel_get_arg_value(self.ckernel, i) args.append(kernel_arg(is_mem, size, value, mem_obj, mem_off, mem_size, off, mode)) return nargs, args task_handle_2_pobj = {} class task: def __init__(self, *args): if len(args) == 0: self.params = [] self.handle = task_create() task_handle_2_pobj[self.handle.class_obj] = self return (kernel, dim, off, gws, lws, params_list) = args coff = (c_size_t * dim)(*off) cgws = (c_size_t * dim)(*gws) clws = (c_size_t * dim)(*lws) nparams = len(params_list) self.params = [] cparams = (c_void_p * nparams)() self.handle = task_create(kernel) task_handle_2_pobj[self.handle.class_obj] = self params_info = [] flush_objs = [] for i, pobj in enumerate(params_list): if type(pobj) == tuple: if hasattr(pobj[0], 'handle') and isinstance(pobj[0].handle, iris_mem): self.params.append(pobj[0]) cparams[i] = pobj[0].handle.class_obj params_info.append(pobj[1]) if len(pobj) == 3 and pobj[2] == iris_flush: flush_objs.append(cparams[i]) else: # (NUMPY Object, modes of use, <optional flush>) # Should be numpy array host = pobj[0] dobj = dmem(host) self.params.append(dobj) cparams[i] = dobj.handle.class_obj params_info.append(pobj[1]) if len(pobj) == 3 and pobj[2] == iris_flush: flush_objs.append(cparams[i]) elif pobj == None: p = c_void_p(0) self.params.append(p) cparams[i] = cast(p, c_void_p) params_info.append(sizeof(p)) elif np.isscalar(pobj): cobj = get_scalar_conv(pobj) p = byref(cobj) self.params.append(p) cparams[i] = cast(p, c_void_p) params_info.append(sizeof(cobj)) elif type(pobj) == size_t: cobj = c_size_t(pobj.value) p = byref(cobj) self.params.append(p) cparams[i] = cast(p, c_void_p) params_info.append(sizeof(cobj)) else: print("error") cparams_info = (c_int * nparams)(*params_info) kernel_name = c_char_p(kernel) if sys.version_info[0] == 2 else c_char_p(bytes(kernel, 'ascii')) dll.iris_task_kernel(self.handle, kernel_name, c_int(dim), coff, cgws, clws, c_int(nparams), cparams, cparams_info) for pobj in flush_objs: dll.iris_task_dmem_flush_out(self.handle, pobj); def name(self): dll.iris_task_get_name.restype = c_char_p kname = dll.iris_task_get_name(self.handle) return kname.decode('ascii') def set_order(self, order): dll.call(dll.iris_task_kernel_dmem_fetch_order, self.handle, order) def set_name(self, name): c_name = c_char_p(name) if sys.version_info[0] == 2 else c_char_p(bytes(name, 'ascii')) dll.iris_task_set_name(self.handle, c_name) def set_device(self, device): dll.iris_task_set_policy(self.handle, convert_obj_ctype(device)[0]) def uid(self): dll.iris_task_get_uid.restype = c_ulong return dll.iris_task_get_uid(self.handle) def get_kernel_args(self): ckernel = self.get_cmd_kernel() if ckernel == None: return 0, [] return ckernel.get_kernel_args() def get_input_mems(self): ckernel = self.get_cmd_kernel() if ckernel == None: return [] n_args, args = ckernel.get_kernel_args() inputs = [] for each_arg in args: if each_arg.mem != None and each_arg.is_input(): inputs.append(each_arg.mem) return inputs def get_output_mems(self): ckernel = self.get_cmd_kernel() if ckernel == None: return [] n_args, args = ckernel.get_kernel_args() outputs = [] for each_arg in args: if each_arg.mem != None and each_arg.is_output(): outputs.append(each_arg.mem) return outputs def get_cmd_kernel(self): dll.iris_task_is_cmd_kernel_exists.restype = c_int is_exists = dll.iris_task_is_cmd_kernel_exists(self.handle) if is_exists == 0: return None dll.iris_task_get_cmd_kernel.restype = POINTER(c_void_p) ckernel = dll.iris_task_get_cmd_kernel(self.handle) return cmd_kernel(ckernel) def get_kernel(self): dll.iris_task_get_kernel.restype = iris_kernel ckernel = dll.iris_task_get_kernel(self.handle) if ckernel.class_obj not in kernel_handle_2_pobj: pkernel = kernel.__new__(kernel) pkernel.handle = ckernel kernel_handle_2_pobj[ckernel.class_obj] = pkernel return kernel_handle_2_pobj[ckernel.class_obj] def get_ndepends(self): dll.iris_task_get_dependency_count.restype = c_int ntasks = dll.iris_task_get_dependency_count(self.handle) return ntasks def get_depends(self): dll.iris_task_get_dependency_count.restype = c_int ntasks = dll.iris_task_get_dependency_count(self.handle) tasks = (iris_task * ntasks)() dll.iris_task_get_dependencies.argtypes = [iris_task, POINTER(iris_task)] dll.iris_task_get_dependencies.restype = None dll.iris_task_get_dependencies(self.handle, tasks) ptasks = convert_ctasks(tasks) return ntasks, ptasks def depends(self, tasks_list): if type(tasks_list) != list: tasks_list = [ tasks_list ] dep_list = [t.handle for t in tasks_list] task_depend(self.handle, len(tasks_list), dep_list) def h2d(self, mem, off, size, host): task_h2d(self.handle, mem.handle, off, size, host) def params_map(self, params_map_list): task_params_map(self.handle, params_map_list) def d2h(self, mem, off, size, host): task_d2h(self.handle, mem.handle, off, size, host) def h2d_full(self, mem, host): task_h2d_full(self.handle, mem.handle, host) def d2h_full(self, mem, host): task_d2h_full(self.handle, mem.handle, host) def kernel(self, kernel, dim, off, gws, lws, params, params_info): task_kernel(self.handle, kernel, dim, off, gws, lws, params, params_info, self.params) def host(self, func, params): task_host(self.handle, func, params) def submit(self, device, sync = 1): task_submit(self.handle, device, sync) def task_name(self): dll.iris_task_get_name(self.handle) def kernel_name(self): dll.iris_task_get_kernel_name(self.handle) def kernel_params(self): dll.iris_task_get_kernel_params(self.handle) def inputs(self): dll.iris_task_get_inputs(self.handle) def outputs(self): dll.iris_task_get_inputs(self.handle) def convert_ctasks(c_tasks): ptasks = [] for c_task in c_tasks: if c_task.class_obj not in task_handle_2_pobj: p_task = task.__new__(task) p_task.params = [] p_task.handle = c_task task_handle_2_pobj[c_task.class_obj] = p_task ptasks.append(task_handle_2_pobj[c_task.class_obj]) return ptasks class graph: def __init__(self, tasks=[], device=iris_default, opt=None): self.handle = graph_create() for each_task in tasks: self.add_task(each_task, device, opt) def retain(self): dll.iris_graph_retain(self.handle) def release(self): dll.iris_graph_release(self.handle) def free(self): dll.iris_graph_free(self.handle) def wait(self): dll.iris_graph_wait(self.handle) def submit(self, device=iris_default, sync=1, order=None): if type(order) == np.ndarray or order != None: order_np = order if type(order) != np.ndarray: order_np = np.int32(order) conv_order_np = convert_obj_ctype(order_np) dll.iris_graph_submit_with_order(self.handle, conv_order_np[0], c_int(device), c_int(sync)) else: dll.iris_graph_submit(self.handle, c_int(device), c_int(sync)) def submit_time(self, device=iris_default, sync=1, order=None): #stime = np.double(0.0) #cobj = byref(c_double(stime)) stime = np.zeros((1), np.double) cobj = stime.ctypes.data_as(c_void_p) if type(order) == np.ndarray or order != None: order_np = order if type(order) != np.ndarray: order_np = np.int32(order) conv_order_np = convert_obj_ctype(order_np) dll.iris_graph_submit_with_order_and_time(self.handle, conv_order_np[0], cobj, c_int(device), c_int(sync)) else: dll.iris_graph_submit_with_time(self.handle, cobj, c_int(device), c_int(sync)) return stime[0] def add_task(self, task, device=iris_default, opt=None): c_opt = c_void_p(0) if opt != None: c_opt = c_char_p(opt) if sys.version_info[0] == 2 else c_char_p(bytes(opt, 'ascii')) dll.iris_graph_task(self.handle, task.handle, c_int(device), c_opt) def reset_memories(self): dll.iris_graph_reset_memories(self.handle) def enable_mem_profiling(self): dll.iris_graph_enable_mem_profiling(self.handle) def set_order(self, order): dll.call(dll.iris_graph_tasks_order, self.handle, order) def get_tasks(self): dll.iris_graph_tasks_count.restype = c_int ntasks = dll.iris_graph_tasks_count(self.handle) tasks = (iris_task * ntasks)() dll.iris_graph_get_tasks.argtypes = [iris_graph, POINTER(iris_task)] dll.iris_graph_get_tasks.restype = None dll.iris_graph_get_tasks(self.handle, tasks) ptasks = convert_ctasks(tasks) return ntasks, ptasks def get_task_names(self): ntasks, tasks = self.get_tasks() task_names = [ task.name() for task in tasks] task_names[0] = 'end_task' task_names.insert(0, 'start_task') return task_names def get_task_uids(self): ntasks, tasks = self.get_tasks() task_uids = [0] + [ task.uid() for task in tasks] return task_uids def tasks_execution_schedule(self, kernel_profile=False): # Order should not be changed ptr = dll.call_ret(dll.iris_get_graph_tasks_execution_schedule, POINTER(TaskProfile), self.handle, np.int32(kernel_profile)) total = dll.call_ret(dll.iris_get_graph_tasks_execution_schedule_count, c_size_t, self.handle) ptr_list = [ ptr[i] for i in range(total) ] return ptr_list def mems_execution_schedule(self): # Order should not be changed ptr = dll.call_ret(dll.iris_get_graph_dataobjects_execution_schedule, POINTER(DataObjectProfile), self.handle) total = dll.call_ret(dll.iris_get_graph_dataobjects_execution_schedule_count, c_size_t, self.handle) ptr_list = [ ptr[i] for i in range(total) ] return ptr_list def get_dependency_matrix(self, pdf=False): ntasks, tasks = self.get_tasks() SIZE = ntasks+1 dep_graph, dep_graph_2d_ptr = dll.alloc_int8((SIZE,SIZE)) dll.call_ret_ptr(dll.iris_get_graph_dependency_adj_matrix, self.handle, dep_graph) print("Dependency matrix", dep_graph) if pdf: task_names = self.get_task_names() task_uids = self.get_task_uids() import pandas as pd df = pd.DataFrame(dep_graph, columns=task_names) df['task_name'] = task_names df['task_uid'] = task_uids df['parent_count'] = dep_graph.sum(axis=1) df['child_count'] = dep_graph.sum(axis=0) return df return dep_graph def get_3d_cost_communication_matrix(self, pdf=False): ntasks, tasks = self.get_tasks() task_index_hash = {} task_input_mems = [] task_output_mems = [] mem_index_hash = {} mem_regions_2_dmem_hash = {} mem_dmem_2_regions = {} mem_serial_index_hash = {} for index, each_task in enumerate(tasks): task_index_hash[each_task.uid()] = index inputs = each_task.get_input_mems() outputs = each_task.get_output_mems() task_input_mems.append([]) task_output_mems.append([]) for inp in inputs: if inp.uid() not in mem_index_hash: mem_index_hash[inp.uid()] = {'mem': inp, 'inputs':[], 'outputs':[]} mem_index_hash[inp.uid()]['inputs'].append(each_task.uid()) task_input_mems[index].append(inp.uid()) if inp.uid() not in mem_serial_index_hash: mem_serial_index_hash[inp.uid()] = len(mem_serial_index_hash) for outp in outputs: if outp.uid() not in mem_index_hash: mem_index_hash[outp.uid()] = {'mem': outp, 'inputs':[], 'outputs':[]} mem_index_hash[outp.uid()]['outputs'].append(each_task.uid()) task_output_mems[index].append(outp.uid()) if outp.uid() not in mem_serial_index_hash: mem_serial_index_hash[outp.uid()] = len(mem_serial_index_hash) if outp.type == IRIS_DMEM_REGION: if outp.uid() not in mem_regions_2_dmem_hash: mem_regions_2_dmem_hash[outp.uid()] = outp.mem.uid() if outp.mem.uid() not in mem_dmem_2_regions: mem_dmem_2_regions[outp.mem.uid()] = [] mem_dmem_2_regions[outp.mem.uid()].append(outp.uid()) task_outputs = np.zeros(len(tasks)) for index, each_task in enumerate(tasks): n_depends, dep_tasks = each_task.get_depends() for d_task in dep_tasks: d_index = task_index_hash[d_task.uid()] task_outputs[d_index] += 1 dep_graph = np.zeros((len(tasks)+1, len(tasks)+1, len(mem_index_hash)), np.int64) for index, each_task in enumerate(tasks): lst1 = task_input_mems[index] n_depends, dep_tasks = each_task.get_depends() if index == 0: # End Task dep_tasks = [] for d_index, n_outs in enumerate(task_outputs): if n_outs == 1: dep_tasks.append(tasks[d_index]) added_mems = [] for d_task in dep_tasks: d_index = task_index_hash[d_task.uid()] lst2 = task_output_mems[d_index] mem_list = lst2 if len(lst1) > 0: mem_list = list(set(lst1) & set(lst2)) added_mems = mem_list for mem_index in mem_list: size = mem_index_hash[mem_index]['mem'].size() mem_serial_index = mem_serial_index_hash[mem_index] dep_graph[index+1][d_index+1][mem_serial_index] += size for mem_index in lst2: if mem_index in mem_regions_2_dmem_hash: dmem_index = mem_regions_2_dmem_hash[mem_index] if dmem_index in lst1: size = mem_index_hash[mem_index]['mem'].size() mem_serial_index = mem_serial_index_hash[mem_index] dep_graph[index+1][d_index+1][mem_serial_index] += size added_mems.append(dmem_index) for mem_index in lst1: if mem_index not in added_mems: mem_obj = mem_index_hash[mem_index]['mem'] if mem_index in mem_regions_2_dmem_hash: dmem_index = mem_regions_2_dmem_hash[mem_index] dmem_obj = mem_index_hash[dmem_index]['mem'] if not dmem_obj.is_reset(): size = mem_obj.size() mem_serial_index = mem_serial_index_hash[mem_index] dep_graph[index+1][0][mem_serial_index] += size elif not mem_obj.is_reset(): size = mem_obj.size() mem_serial_index = mem_serial_index_hash[mem_index] dep_graph[index+1][0][mem_serial_index] += size return dep_graph def get_2d_cost_communication_matrix(self, pdf=False): ntasks, tasks = self.get_tasks() SIZE = ntasks+1 comm_2d, comm_2d_ptr = dll.alloc_size_t((SIZE,SIZE)) dll.call_ret_ptr(dll.iris_get_graph_2d_comm_adj_matrix, self.handle, comm_2d) print("Communication cost matrix", comm_2d) if pdf: task_names = self.get_task_names() task_uids = self.get_task_uids() import pandas as pd df = pd.DataFrame(comm_2d, columns=task_names) df['task_name'] = task_names df['task_uid'] = task_uids df['parent_count'] = comm_2d.sum(axis=1) df['child_count'] = comm_2d.sum(axis=0) return df return comm_2d def get_3d_cost_comm_time(self, iterations=1, pin_memory=True): print("Extracting 3d communication cost for each object") n_mems = dll.call_ret(dll.iris_count_mems, c_size_t, self.handle) ndevs = device_count()+1 time_data = np.zeros(n_mems*ndevs*ndevs, np.double) mem_ids = np.zeros(n_mems, np.int32) dll.call(dll.iris_get_graph_3d_comm_time, self.handle, time_data, mem_ids, np.int32(iterations), np.int32(pin_memory)) return time_data.reshape((n_mems, ndevs, ndevs)), mem_ids def get_3d_cost_comm_data(self): print("Extracting communication data 3d list of (task from id, task to id, mem_id, size)") dll.call(dll.iris_get_graph_3d_comm_data, self.handle) total = dll.call_ret(dll.iris_get_graph_3d_comm_data_size, c_size_t, self.handle) ptr = dll.call_ret(dll.iris_get_graph_3d_comm_data_ptr, POINTER(CommData3D), self.handle) ptr_list = [ ptr[i] for i in range(total) ] return ptr_list def calibrate_compute_cost_adj_matrix(self, pdf=False, only_device_type=False): print("Calibrating computation cost for each task and for each processor type") ntasks, tasks = self.get_tasks() ndevs = device_count() dev_names = all_device_info() SIZE = ntasks+1 cols = ndevs col_names = dev_names if only_device_type: cols = platform_count() col_names = all_platform_info() comp_2d, comp_2d_ptr = dll.alloc_double((SIZE, cols)) if only_device_type: dll.call_ret_ptr(dll.iris_calibrate_compute_cost_adj_matrix_only_for_types, self.handle, comp_2d) else: dll.call_ret_ptr(dll.iris_calibrate_compute_cost_adj_matrix, self.handle, comp_2d) print("Computation cost matrix", comp_2d) if pdf: task_names = self.get_task_names() task_uids = self.get_task_uids() import pandas as pd df = pd.DataFrame(comp_2d, columns=col_names) df['task_name'] = task_names df['task_uid'] = task_uids return df return comp_2d