Diffstat (limited to 'src/silx/opencl/common.py')
-rw-r--r-- | src/silx/opencl/common.py | 820 |
1 file changed, 820 insertions, 0 deletions
diff --git a/src/silx/opencl/common.py b/src/silx/opencl/common.py
new file mode 100644
index 0000000..30c9ef7
--- /dev/null
+++ b/src/silx/opencl/common.py
@@ -0,0 +1,820 @@
+#!/usr/bin/env python
+#
+# Project: S I L X project
+#          https://github.com/silx-kit/silx
+#
+# Copyright (C) 2012-2023 European Synchrotron Radiation Facility, Grenoble, France
+#
+# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+__author__ = "Jerome Kieffer"
+__contact__ = "Jerome.Kieffer@ESRF.eu"
+__license__ = "MIT"
+__copyright__ = "2012-2017 European Synchrotron Radiation Facility, Grenoble, France"
+__date__ = "09/09/2023"
+__status__ = "stable"
+__all__ = [
+    "ocl",
+    "pyopencl",
+    "mf",
+    "release_cl_buffers",
+    "allocate_cl_buffers",
+    "measure_workgroup_size",
+    "kernel_workgroup_size",
+]
+
+import os
+import logging
+
+import numpy
+
+from .utils import get_opencl_code
+
+logger = logging.getLogger(__name__)
+
+if os.environ.get("SILX_OPENCL") in ["0", "False"]:
+    logger.info(
+        "Use of OpenCL has been disabled from environment variable: SILX_OPENCL=0"
+    )
+    pyopencl = None
+else:
+    try:
+        import pyopencl
+    except ImportError:
+        logger.warning(
+            "Unable to import pyOpenCL. Please install it from: https://pypi.org/project/pyopencl"
+        )
+        pyopencl = None
+    else:
+        try:
+            pyopencl.get_platforms()
+        except pyopencl.LogicError:
+            logger.warning(
+                "The module pyOpenCL has been imported but cannot be used here"
+            )
+            pyopencl = None
+
+if pyopencl is not None:
+    import pyopencl.array as array
+
+    mf = pyopencl.mem_flags
+    from .atomic import check_atomic32, check_atomic64
+else:
+    # Define default mem flags
+    class mf(object):
+        WRITE_ONLY = 1
+        READ_ONLY = 1
+        READ_WRITE = 1
+
+
+FLOP_PER_CORE = {
+    "GPU": 64,  # GPU: Fermi performs at least 64 flops per cycle per multiprocessor; G80 was at 24 or 48
+    "CPU": 4,  # CPU: Intel CPUs perform at least 4 operations per cycle
+    "ACC": 8,  # ACC: the Xeon Phi (MIC) appears able to process 8 flops per hyperthreaded core
+}
+
+# Source: https://en.wikipedia.org/wiki/CUDA
+NVIDIA_FLOP_PER_CORE = {
+    (1, 0): 24,  # Guessed!
+    (1, 1): 24,  # Measured on G98 [Quadro NVS 295]
+    (1, 2): 24,  # Guessed!
+    (1, 3): 24,  # Measured on a GT285 (GT200)
+    (2, 0): 64,  # Measured on a 580 (GF110)
+    (2, 1): 96,  # Measured on Quadro2000 GF106GL
+    (3, 0): 384,  # Guessed!
+    (3, 5): 384,  # Measured on K20
+    (3, 7): 384,  # K80: Guessed!
+    (5, 0): 256,  # Maxwell: 4 warps/SM, 2 flops/CU
+    (5, 2): 256,  # Titan-X
+    (5, 3): 256,  # TX1
+    (6, 0): 128,  # GP100
+    (6, 1): 128,  # GP104
+    (6, 2): 128,  # ?
+    (7, 0): 128,  # Volta: measured on Tesla V100
+    (7, 2): 128,  # Volta?
+    (7, 5): 128,  # Turing: measured on RTX 6000
+    (8, 0): 128,  # Ampere: measured on Tesla A100
+    (8, 6): 256,  # Ampere: measured on RTX A5000
+}
+
+AMD_FLOP_PER_CORE = 160  # Measured on a M7820: 10 cores, 700 MHz, 1120 GFlops
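
For illustration only (not part of the committed file): a minimal sketch of the peak-performance arithmetic that Device.__init__ below performs with these tables; the device figures are invented.

import silx.opencl.common as oc

# Hypothetical GPU: 28 compute units at 1300 MHz, NVIDIA compute capability 6.1
cores, frequency = 28, 1300  # frequency in MHz
flop_core = oc.NVIDIA_FLOP_PER_CORE.get((6, 1), oc.FLOP_PER_CORE["GPU"])
flops = cores * frequency * flop_core  # same formula as Device.flops, in MFLOPS
print("~%.1f GFLOPS" % (flops / 1000.0))  # pretty_print() divides by 1000 as well
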
+
+
+class Device(object):
+    """
+    Simple class that contains the structure of an OpenCL device
+    """
+
+    def __init__(
+        self,
+        name="None",
+        dtype=None,
+        version=None,
+        driver_version=None,
+        extensions="",
+        memory=None,
+        available=None,
+        cores=None,
+        frequency=None,
+        flop_core=None,
+        idx=0,
+        workgroup=1,
+        atomic32=None,
+        atomic64=None,
+        platform=None,
+    ):
+        """
+        Simple container with some important data for the OpenCL device description.
+
+        :param name: name of the device
+        :param dtype: device type: CPU/GPU/ACC...
+        :param version: device version
+        :param driver_version: driver version
+        :param extensions: list of OpenCL extensions
+        :param memory: maximum memory available on the device
+        :param available: is the device available (not deactivated)?
+        :param cores: number of SM/cores
+        :param frequency: frequency of the device
+        :param flop_core: floating point operations per core per cycle
+        :param idx: index of the device within the platform
+        :param workgroup: max workgroup size
+        :param atomic32: are 32-bit atomic operations supported?
+        :param atomic64: are 64-bit atomic operations supported?
+        :param platform: the platform to which this device is attached
+        """
+        self.name = name.strip()
+        self.type = dtype
+        self.version = version
+        self.driver_version = driver_version
+        self.extensions = extensions.split()
+        self.memory = memory
+        self.available = available
+        self.cores = cores
+        self.frequency = frequency
+        self.id = idx
+        self.max_work_group_size = workgroup
+        self.atomic32 = atomic32
+        self.atomic64 = atomic64
+        if not flop_core:
+            flop_core = FLOP_PER_CORE.get(dtype, 1)
+        if cores and frequency:
+            self.flops = cores * frequency * flop_core
+        else:
+            self.flops = flop_core
+        self.platform = platform
+
+    def __repr__(self):
+        return "%s" % self.name
+
+    def pretty_print(self):
+        """
+        Complete device description
+
+        :return: string
+        """
+        lst = [
+            "Name\t\t:\t%s" % self.name,
+            "Type\t\t:\t%s" % self.type,
+            "Memory\t\t:\t%.3f MB" % (self.memory / 2.0**20),
+            "Cores\t\t:\t%s CU" % self.cores,
+            "Frequency\t:\t%s MHz" % self.frequency,
+            "Speed\t\t:\t%.3f GFLOPS" % (self.flops / 1000.0),
+            "Version\t\t:\t%s" % self.version,
+            "Available\t:\t%s" % self.available,
+        ]
+        return os.linesep.join(lst)
+
+    def set_unavailable(self):
+        """Use this method to flag a faulty device"""
+        self.available = False
+
+
+class Platform(object):
+    """
+    Simple class that contains the structure of an OpenCL platform
+    """
+
+    def __init__(
+        self, name="None", vendor="None", version=None, extensions=None, idx=0
+    ):
+        """
+        Class containing the description of a platform and of all devices within that platform.
+
+        :param name: platform name
+        :param vendor: name of the brand/vendor
+        :param version: platform version
+        :param extensions: list of the extensions provided by the platform to all of its devices
+        :param idx: index of the platform
+        """
+        self.name = name.strip()
+        self.vendor = vendor.strip()
+        self.version = version
+        self.extensions = extensions.split()
+        self.devices = []
+        self.id = idx
+
+    def __repr__(self):
+        return "%s" % self.name
+
+    def add_device(self, device):
+        """
+        Add new device to the platform
+
+        :param device: Device instance
+        """
+        device.platform = self
+        self.devices.append(device)
+
+    def get_device(self, key):
+        """
+        Return a device according to key
+
+        :param key: identifier for a device, either its id (int) or its name
+        :type key: int or str
+        """
+        out = None
+        try:
+            devid = int(key)
+        except ValueError:
+            for a_dev in self.devices:
+                if a_dev.name == key:
+                    out = a_dev
+        else:
+            if len(self.devices) > devid >= 0:
+                out = self.devices[devid]
+        return out
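
Platform and Device mirror pyopencl's hierarchy as plain containers. A minimal sketch of walking them through the module-level ocl singleton (instantiated further down in this file), assuming at least one usable OpenCL device:

from silx.opencl.common import ocl

if ocl is not None:  # ocl is None when no OpenCL device is usable
    for platform in ocl.platforms:
        for device in platform.devices:
            print("(%s,%s) %s" % (platform.id, device.id, device.name))
            print(device.pretty_print())
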
+
+
+def _measure_workgroup_size(device_or_context, fast=False):
+    """Measure the maximal work group size of the given device
+
+    DEPRECATED since not perfectly correct!
+
+    :param device_or_context: instance of pyopencl.Device or pyopencl.Context
+                              or 2-tuple (platformid, deviceid)
+    :param fast: ask the kernel for the valid value, don't probe it
+    :return: maximum size for the workgroup
+    """
+    if isinstance(device_or_context, pyopencl.Device):
+        try:
+            ctx = pyopencl.Context(devices=[device_or_context])
+        except pyopencl._cl.LogicError as error:
+            platform = device_or_context.platform
+            platformid = pyopencl.get_platforms().index(platform)
+            deviceid = platform.get_devices().index(device_or_context)
+            ocl.platforms[platformid].devices[deviceid].set_unavailable()
+            raise RuntimeError(
+                "Unable to create context on %s/%s: %s"
+                % (platform, device_or_context, error)
+            )
+        else:
+            device = device_or_context
+    elif isinstance(device_or_context, pyopencl.Context):
+        ctx = device_or_context
+        device = device_or_context.devices[0]
+    elif isinstance(device_or_context, (tuple, list)) and len(device_or_context) == 2:
+        ctx = ocl.create_context(
+            platformid=device_or_context[0], deviceid=device_or_context[1]
+        )
+        device = ctx.devices[0]
+    else:
+        raise RuntimeError(
+            "given parameter device_or_context is not an instance of a device or a context"
+        )
+    shape = device.max_work_group_size
+
+    assert ctx is not None
+    queue = pyopencl.CommandQueue(ctx)
+
+    max_valid_wg = 1
+    data = numpy.random.random(shape).astype(numpy.float32)
+    d_data = pyopencl.array.to_device(queue, data)
+    d_data_1 = pyopencl.array.empty_like(d_data)
+    d_data_1.fill(numpy.float32(1.0))
+
+    program = pyopencl.Program(ctx, get_opencl_code("addition")).build()
+    if fast:
+        max_valid_wg = program.addition.get_work_group_info(
+            pyopencl.kernel_work_group_info.WORK_GROUP_SIZE, device
+        )
+    else:
+        maxi = int(round(numpy.log2(shape)))
+        for i in range(maxi + 1):
+            d_res = pyopencl.array.empty_like(d_data)
+            wg = 1 << i
+            try:
+                evt = program.addition(
+                    queue,
+                    (shape,),
+                    (wg,),
+                    d_data.data,
+                    d_data_1.data,
+                    d_res.data,
+                    numpy.int32(shape),
+                )
+                evt.wait()
+            except Exception as error:
+                logger.info(
+                    "%s on device %s for WG=%s/%s", error, device.name, wg, shape
+                )
+                program = queue = d_res = d_data_1 = d_data = None
+                break
+            else:
+                res = d_res.get()
+                good = numpy.allclose(res, data + 1)
+                if good:
+                    if wg > max_valid_wg:
+                        max_valid_wg = wg
+                else:
+                    logger.warning(
+                        "ArithmeticError on %s for WG=%s/%s", device.name, wg, shape
+                    )
+
+    return max_valid_wg
+
+
+def _is_nvidia_gpu(vendor, devtype):
+    return (vendor == "NVIDIA Corporation") and (devtype == "GPU")
+
+
+class OpenCL(object):
+    """
+    Simple class that wraps the structure ocl_tools_extended.h
+
+    This is a static class.
+    ocl should be the only instance and shared among all python modules.
+    """
+
+    platforms = []
+    nb_devices = 0
+    context_cache = {}  # key: 2-tuple of int, value: context
+    if pyopencl:
+        platform = device = pypl = devtype = extensions = pydev = None
+        for idx, platform in enumerate(pyopencl.get_platforms()):
+            pypl = Platform(
+                platform.name,
+                platform.vendor,
+                platform.version,
+                platform.extensions,
+                idx,
+            )
+            for idd, device in enumerate(platform.get_devices()):
+                ####################################################
+                # Nvidia does not report the int64 atomics we use ...
+                # this is a workaround, as any Nvidia GPU with
+                # double precision supports int64 atomics
+                ####################################################
+                extensions = device.extensions
+                if (pypl.vendor == "NVIDIA Corporation") and (
+                    "cl_khr_fp64" in extensions
+                ):
+                    extensions += (
+                        " cl_khr_int64_base_atomics cl_khr_int64_extended_atomics"
+                    )
+                try:
+                    devtype = pyopencl.device_type.to_string(device.type).upper()
+                except ValueError:
+                    # pocl does not describe itself as a CPU!
+                    devtype = "CPU"
+                if len(devtype) > 3:
+                    if "GPU" in devtype:
+                        devtype = "GPU"
+                    elif "ACC" in devtype:
+                        devtype = "ACC"
+                    elif "CPU" in devtype:
+                        devtype = "CPU"
+                    else:
+                        devtype = devtype[:3]
+                if _is_nvidia_gpu(device.vendor, devtype) and (
+                    "compute_capability_major_nv" in dir(device)
+                ):
+                    try:
+                        comput_cap = (
+                            device.compute_capability_major_nv,
+                            device.compute_capability_minor_nv,
+                        )
+                    except pyopencl.LogicError:
+                        flop_core = FLOP_PER_CORE["GPU"]
+                    else:
+                        flop_core = NVIDIA_FLOP_PER_CORE.get(
+                            comput_cap, FLOP_PER_CORE["GPU"]
+                        )
+                elif (pypl.vendor == "Advanced Micro Devices, Inc.") and (
+                    devtype == "GPU"
+                ):
+                    flop_core = AMD_FLOP_PER_CORE
+                elif devtype == "CPU":
+                    flop_core = FLOP_PER_CORE.get(devtype, 1)
+                else:
+                    flop_core = 1
+                workgroup = device.max_work_group_size
+                if (devtype == "CPU") and (pypl.vendor == "Apple"):
+                    logger.info(
+                        "For Apple's OpenCL on CPU: measuring the actual valid max_work_group_size."
+                    )
+                    workgroup = _measure_workgroup_size(device, fast=True)
+                if (devtype == "GPU") and os.environ.get("GPU") == "False":
+                    # Environment variable to disable GPU devices
+                    continue
+                pydev = Device(
+                    device.name,
+                    devtype,
+                    device.version,
+                    device.driver_version,
+                    extensions,
+                    device.global_mem_size,
+                    bool(device.available),
+                    device.max_compute_units,
+                    device.max_clock_frequency,
+                    flop_core,
+                    idd,
+                    workgroup,
+                    check_atomic32(device)[0],
+                    check_atomic64(device)[0],
+                )
+                pypl.add_device(pydev)
+                nb_devices += 1
+            platforms.append(pypl)
+        del platform, device, pypl, devtype, extensions, pydev
+
+    def __repr__(self):
+        out = ["OpenCL devices:"]
+        for platformid, platform in enumerate(self.platforms):
+            deviceids = [
+                f"({platformid},{deviceid}) {dev.name}"
+                for deviceid, dev in enumerate(platform.devices)
+            ]
+            out.append(f"[{platformid}] {platform.name}: " + ", ".join(deviceids))
+        return os.linesep.join(out)
+
+    def get_platform(self, key):
+        """
+        Return a platform according to key
+
+        :param key: identifier for a platform, either its id (int) or its name
+        :type key: int or str
+        """
+        out = None
+        try:
+            platid = int(key)
+        except ValueError:
+            for a_plat in self.platforms:
+                if a_plat.name == key:
+                    out = a_plat
+        else:
+            if len(self.platforms) > platid >= 0:
+                out = self.platforms[platid]
+        return out
+
+    def select_device(
+        self, dtype="ALL", memory=None, extensions=None, best=True, **kwargs
+    ):
+        """
+        Select a device based on a few parameters (by default, keep the one with the most flops)
+
+        :param dtype: "gpu" or "cpu" or "all" ....
+        :param memory: minimum amount of memory (int)
+        :param extensions: list of extensions to be present
+        :param best: shall we look for the device with the most flops?
+        :returns: a 2-tuple of platform ID and device ID, or None if nothing is found
+        """
+        if extensions is None:
+            extensions = []
+        if "type" in kwargs:
+            dtype = kwargs["type"].upper()
+        else:
+            dtype = dtype.upper()
+        if len(dtype) > 3:
+            dtype = dtype[:3]
+        best_found = None
+        for platformid, platform in enumerate(self.platforms):
+            for deviceid, device in enumerate(platform.devices):
+                if not device.available:
+                    continue
+                if (dtype in ["ALL", "DEF"]) or (device.type == dtype):
+                    if (memory is None) or (memory <= device.memory):
+                        found = True
+                        for ext in extensions:
+                            if ext not in device.extensions:
+                                found = False
+                        if found:
+                            if not best:
+                                return platformid, deviceid
+                            else:
+                                if not best_found:
+                                    best_found = platformid, deviceid, device.flops
+                                elif best_found[2] < device.flops:
+                                    best_found = platformid, deviceid, device.flops
+        if best_found:
+            return best_found[0], best_found[1]
+
+        # Nothing found
+        return None
+
+    def create_context(
+        self,
+        devicetype="ALL",
+        platformid=None,
+        deviceid=None,
+        cached=True,
+        memory=None,
+        extensions=None,
+    ):
+        """
+        Choose a device and initiate a context.
+
+        Devicetypes can be GPU, gpu, CPU, cpu, DEF, ACC, ALL.
+        Suggested are GPU, CPU.
+        For each setting to work, such an OpenCL device must exist and be properly installed.
+        E.g. if the Nvidia driver is installed, GPU will succeed but CPU will fail.
+        The AMD SDK kit is required for CPU via OpenCL.
+
+        :param devicetype: string in ["cpu", "gpu", "all", "acc"]
+        :param platformid: integer
+        :param deviceid: integer
+        :param cached: True if we want to cache the context
+        :param memory: minimum amount of memory of the device
+        :param extensions: list of extensions to be present
+        :return: OpenCL context on the selected device
+        """
+        if extensions is None:
+            extensions = []
+
+        ctx = None
+        if (platformid is not None) and (deviceid is not None):
+            platformid = int(platformid)
+            deviceid = int(deviceid)
+        elif "PYOPENCL_CTX" in os.environ:
+            ctx = pyopencl.create_some_context()
+            device = ctx.devices[0]
+            platforms = [
+                i for i, p in enumerate(ocl.platforms) if device.platform.name == p.name
+            ]
+            if platforms:
+                platformid = platforms[0]
+                devices = [
+                    i
+                    for i, d in enumerate(ocl.platforms[platformid].devices)
+                    if device.name == d.name
+                ]
+                if devices:
+                    deviceid = devices[0]
+                    if cached:
+                        self.context_cache[(platformid, deviceid)] = ctx
+        else:
+            ids = ocl.select_device(type=devicetype, extensions=extensions)
+            if ids:
+                platformid, deviceid = ids
+
+        if (ctx is None) and (platformid is not None) and (deviceid is not None):
+            if (platformid, deviceid) in self.context_cache:
+                ctx = self.context_cache[(platformid, deviceid)]
+            else:
+                try:
+                    ctx = pyopencl.Context(
+                        devices=[
+                            pyopencl.get_platforms()[platformid].get_devices()[deviceid]
+                        ]
+                    )
+                except pyopencl._cl.LogicError as error:
+                    self.platforms[platformid].devices[deviceid].set_unavailable()
+                    logger.warning(
+                        "Unable to create context on %s/%s: %s",
+                        platformid,
+                        deviceid,
+                        error,
+                    )
+                    ctx = None
+                else:
+                    if cached:
+                        self.context_cache[(platformid, deviceid)] = ctx
+        if ctx is None:
+            logger.warning(
+                "Last chance to get an OpenCL device ... probably not the one requested"
+            )
+            ctx = pyopencl.create_some_context(interactive=False)
+        return ctx
+
+    def device_from_context(self, context):
+        """
+        Retrieves the Device from the context
+
+        :param context: OpenCL context
+        :return: instance of Device
+        """
+        odevice = context.devices[0]
+        oplat = odevice.platform
+        device_id = oplat.get_devices().index(odevice)
+        platform_id = pyopencl.get_platforms().index(oplat)
+        return self.platforms[platform_id].devices[device_id]
+
+
+if pyopencl:
+    ocl = OpenCL()
+    if ocl.nb_devices == 0:
+        ocl = None
+else:
+    ocl = None
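
A minimal sketch of the usual entry points: select a device, create or reuse a context, then map it back to its Device description. Which device comes out depends entirely on the local installation:

from silx.opencl.common import ocl

if ocl is not None:
    ids = ocl.select_device(dtype="GPU", memory=1 << 30, best=True)
    if ids is None:
        ids = ocl.select_device()  # fall back to any available device
    if ids is not None:
        platformid, deviceid = ids
        ctx = ocl.create_context(platformid=platformid, deviceid=deviceid)
        device = ocl.device_from_context(ctx)
        print(device.name, device.max_work_group_size)
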
+
+
+def release_cl_buffers(cl_buffers):
+    """
+    :param cl_buffers: the buffers you want to release
+    :type cl_buffers: dict(str, pyopencl.Buffer)
+
+    This method releases the memory of the buffers stored in the dict
+    """
+    for key, buffer_ in cl_buffers.items():
+        if buffer_ is not None:
+            if isinstance(buffer_, pyopencl.array.Array):
+                try:
+                    buffer_.data.release()
+                except pyopencl.LogicError:
+                    logger.error("Error while freeing buffer %s", key)
+            else:
+                try:
+                    buffer_.release()
+                except pyopencl.LogicError:
+                    logger.error("Error while freeing buffer %s", key)
+            cl_buffers[key] = None
+    return cl_buffers
+
+
+def allocate_cl_buffers(buffers, device=None, context=None):
+    """
+    :param buffers: the buffer descriptions used to create the pyopencl.Buffer
+    :type buffers: list(str, flag, numpy.dtype, int)
+    :param device: one of the context devices
+    :param context: OpenCL context
+    :return: a dict containing the instantiated pyopencl.Buffer
+    :rtype: dict(str, pyopencl.Buffer)
+
+    This method instantiates the pyopencl.Buffer from the buffer
+    descriptions.
+    """
+    mem = {}
+    if device is None:
+        device = ocl.device_from_context(context)
+
+    # check if enough memory is available on the device
+    ualloc = 0
+    for _, _, dtype, size in buffers:
+        ualloc += numpy.dtype(dtype).itemsize * size
+    memory = device.memory
+    logger.info(
+        "%.3fMB are needed on device which has %.3fMB", ualloc / 1.0e6, memory / 1.0e6
+    )
+    if ualloc >= memory:
+        memError = "Fatal error in allocate_buffers. "
+        memError += "Not enough device memory for buffers "
+        memError += "(%lu requested, %lu available)" % (ualloc, memory)
+        raise MemoryError(memError)  # noqa
+
+    # do the allocation
+    try:
+        for name, flag, dtype, size in buffers:
+            mem[name] = pyopencl.Buffer(
+                context, flag, numpy.dtype(dtype).itemsize * size
+            )
+    except pyopencl.MemoryError as error:
+        release_cl_buffers(mem)
+        raise MemoryError(error)
+
+    return mem
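
A minimal sketch of the allocate/release pair; the buffer names, dtypes and sizes are invented for illustration:

import numpy
from silx.opencl.common import ocl, mf, allocate_cl_buffers, release_cl_buffers

if ocl is not None:
    ctx = ocl.create_context()
    buffers = [
        ("input", mf.READ_ONLY, numpy.float32, 1024),
        ("output", mf.WRITE_ONLY, numpy.float32, 1024),
    ]
    mem = allocate_cl_buffers(buffers, context=ctx)
    # ... enqueue kernels reading mem["input"] and writing mem["output"] ...
    release_cl_buffers(mem)
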
+
+
+def allocate_texture(ctx, shape, hostbuf=None, support_1D=False):
+    """
+    Allocate an OpenCL image ("texture").
+
+    :param ctx: OpenCL context
+    :param shape: shape of the image. Note that pyopencl and OpenCL < 1.2
+        do not support 1D images, so 1D images are handled as 2D with one row
+    :param hostbuf: optional host buffer (float32 numpy array) used to
+        initialize the texture; defaults to zeros
+    :param support_1D: force the image to be 1D if the shape has only one dim
+    """
+    if len(shape) == 1 and not support_1D:
+        shape = (1,) + shape
+    if hostbuf is None:
+        hostbuf = numpy.zeros(shape[::-1], dtype=numpy.float32)
+    return pyopencl.Image(
+        ctx,
+        pyopencl.mem_flags.READ_ONLY | pyopencl.mem_flags.USE_HOST_PTR,
+        pyopencl.ImageFormat(
+            pyopencl.channel_order.INTENSITY, pyopencl.channel_type.FLOAT
+        ),
+        hostbuf=hostbuf,
+    )
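
A minimal sketch of allocating a 2D texture and filling it from a host array; the shape and data are arbitrary:

import numpy
import pyopencl
from silx.opencl.common import ocl, allocate_texture

if ocl is not None:
    ctx = ocl.create_context()
    queue = pyopencl.CommandQueue(ctx)
    img = allocate_texture(ctx, (256, 256))
    data = numpy.random.random((256, 256)).astype(numpy.float32)
    # origin and region are expressed in (x, y) order for pyopencl images
    pyopencl.enqueue_copy(queue, img, data, origin=(0, 0), region=(256, 256))
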
+
+
+def check_textures_availability(ctx):
+    """
+    Check whether textures are supported on the current OpenCL context.
+
+    :param ctx: OpenCL context
+    """
+    try:
+        dummy_texture = allocate_texture(ctx, (16, 16))
+        # Need to further access some attributes (pocl)
+        dummy_height = dummy_texture.height
+        textures_available = True
+        del dummy_texture, dummy_height
+    except (pyopencl.RuntimeError, pyopencl.LogicError):
+        textures_available = False
+    # Nvidia Fermi GPUs (compute capability 2.X) do not support OpenCL read_imagef.
+    # There is no way to detect this until a kernel is compiled.
+    try:
+        cc = ctx.devices[0].compute_capability_major_nv
+        textures_available &= cc >= 3
+    except (pyopencl.LogicError, AttributeError):  # probably not an Nvidia GPU
+        pass
+    return textures_available
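
A minimal sketch of gating an image-based code path on this check:

from silx.opencl.common import ocl, check_textures_availability

if ocl is not None:
    ctx = ocl.create_context()
    if check_textures_availability(ctx):
        print("using image/texture kernels")
    else:
        print("falling back to plain-buffer kernels")
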
+ """ + mem = {} + if device is None: + device = ocl.device_from_context(context) + + # check if enough memory is available on the device + ualloc = 0 + for _, _, dtype, size in buffers: + ualloc += numpy.dtype(dtype).itemsize * size + memory = device.memory + logger.info( + "%.3fMB are needed on device which has %.3fMB", ualloc / 1.0e6, memory / 1.0e6 + ) + if ualloc >= memory: + memError = "Fatal error in allocate_buffers." + memError += "Not enough device memory for buffers" + memError += "(%lu requested, %lu available)" % (ualloc, memory) + raise MemoryError(memError) # noqa + + # do the allocation + try: + for name, flag, dtype, size in buffers: + mem[name] = pyopencl.Buffer( + context, flag, numpy.dtype(dtype).itemsize * size + ) + except pyopencl.MemoryError as error: + release_cl_buffers(mem) + raise MemoryError(error) + + return mem + + +def allocate_texture(ctx, shape, hostbuf=None, support_1D=False): + """ + Allocate an OpenCL image ("texture"). + + :param ctx: OpenCL context + :param shape: Shape of the image. Note that pyopencl and OpenCL < 1.2 + do not support 1D images, so 1D images are handled as 2D with one row + :param support_1D: force the image to be 1D if the shape has only one dim + """ + if len(shape) == 1 and not (support_1D): + shape = (1,) + shape + return pyopencl.Image( + ctx, + pyopencl.mem_flags.READ_ONLY | pyopencl.mem_flags.USE_HOST_PTR, + pyopencl.ImageFormat( + pyopencl.channel_order.INTENSITY, pyopencl.channel_type.FLOAT + ), + hostbuf=numpy.zeros(shape[::-1], dtype=numpy.float32), + ) + + +def check_textures_availability(ctx): + """ + Check whether textures are supported on the current OpenCL context. + + :param ctx: OpenCL context + """ + try: + dummy_texture = allocate_texture(ctx, (16, 16)) + # Need to further access some attributes (pocl) + dummy_height = dummy_texture.height + textures_available = True + del dummy_texture, dummy_height + except (pyopencl.RuntimeError, pyopencl.LogicError): + textures_available = False + # Nvidia Fermi GPUs (compute capability 2.X) do not support opencl read_imagef + # There is no way to detect this until a kernel is compiled + try: + cc = ctx.devices[0].compute_capability_major_nv + textures_available &= cc >= 3 + except (pyopencl.LogicError, AttributeError): # probably not a Nvidia GPU + pass + # + return textures_available + + +def measure_workgroup_size(device): + """Measure the actual size of the workgroup + + :param device: device or context or 2-tuple with indexes + :return: the actual measured workgroup size + + if device is "all", returns a dict with all devices with their ids as keys. + """ + if (ocl is None) or (device is None): + return None + + if isinstance(device, tuple) and (len(device) == 2): + # this is probably a tuple (platformid, deviceid) + device = ocl.create_context(platformid=device[0], deviceid=device[1]) + + if device == "all": + res = {} + for pid, platform in enumerate(ocl.platforms): + for did, _devices in enumerate(platform.devices): + tup = (pid, did) + res[tup] = measure_workgroup_size(tup) + else: + res = _measure_workgroup_size(device) + return res + + +def query_kernel_info(program, kernel, what="WORK_GROUP_SIZE"): + """Extract the compile time information from a kernel + + :param program: OpenCL program + :param kernel: kernel or name of the kernel + :param what: what is the query about ? + :return: int or 3-int for the workgroup size. 
+
+
+def query_kernel_info(program, kernel, what="WORK_GROUP_SIZE"):
+    """Extract the compile-time information from a kernel
+
+    :param program: OpenCL program
+    :param kernel: kernel or name of the kernel
+    :param what: what is the query about?
+    :return: int, or a 3-tuple of ints for the workgroup size.
+
+    Possible information available are:
+    * 'COMPILE_WORK_GROUP_SIZE': returns the work-group size specified inside the kernel (__attribute__((reqd_work_group_size(X, Y, Z))))
+    * 'GLOBAL_WORK_SIZE': maximum global size that can be used to execute a kernel (OCL 2.1!)
+    * 'LOCAL_MEM_SIZE': amount of local memory in bytes being used by the kernel
+    * 'PREFERRED_WORK_GROUP_SIZE_MULTIPLE': preferred multiple of the workgroup size for launch; this is a performance hint
+    * 'PRIVATE_MEM_SIZE': returns the minimum amount of private memory, in bytes, used by each work-item in the kernel
+    * 'WORK_GROUP_SIZE': maximum work-group size that can be used to execute the kernel on the given device
+
+    Further information on:
+    https://www.khronos.org/registry/OpenCL/sdk/1.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html
+    """
+    assert isinstance(program, pyopencl.Program)
+    if not isinstance(kernel, pyopencl.Kernel):
+        kernel_name = kernel
+        assert kernel in (
+            k.function_name for k in program.all_kernels()
+        ), "the kernel exists"
+        kernel = program.__getattr__(kernel_name)
+
+    device = program.devices[0]
+    query_wg = getattr(pyopencl.kernel_work_group_info, what)
+    return kernel.get_work_group_info(query_wg, device)
+
+
+def kernel_workgroup_size(program, kernel):
+    """Extract the compile-time maximum workgroup size
+
+    :param program: OpenCL program
+    :param kernel: kernel or name of the kernel
+    :return: the maximum acceptable workgroup size for the given kernel
+    """
+    return query_kernel_info(program, kernel, what="WORK_GROUP_SIZE")
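
A minimal sketch querying compile-time kernel properties; the one-line kernel source is invented for illustration:

import pyopencl
from silx.opencl.common import ocl, query_kernel_info, kernel_workgroup_size

if ocl is not None:
    ctx = ocl.create_context()
    src = "__kernel void inc(__global float *a) { a[get_global_id(0)] += 1.0f; }"
    program = pyopencl.Program(ctx, src).build()
    print(kernel_workgroup_size(program, "inc"))
    print(query_kernel_info(program, "inc", what="PREFERRED_WORK_GROUP_SIZE_MULTIPLE"))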