diff options
Diffstat (limited to 'src/silx/opencl')
34 files changed, 8069 insertions, 0 deletions
diff --git a/src/silx/opencl/__init__.py b/src/silx/opencl/__init__.py new file mode 100644 index 0000000..fbd1f88 --- /dev/null +++ b/src/silx/opencl/__init__.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Project: S I L X project +# https://github.com/silx-kit/silx +# +# Copyright (C) 2012-2018 European Synchrotron Radiation Facility, Grenoble, France +# +# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu) +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +"""This package provides OpenCl-based optimized processing functions. + +For more processing functions, see the silx.math and silx.image packages. 
+ +See silx documentation: http://www.silx.org/doc/silx/latest/ +""" + +__author__ = "Jerome Kieffer" +__contact__ = "Jerome.Kieffer@ESRF.eu" +__license__ = "MIT" +__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "15/03/2017" +__status__ = "stable" + +import logging + + +logger = logging.getLogger(__name__) + + +from .common import * diff --git a/src/silx/opencl/backprojection.py b/src/silx/opencl/backprojection.py new file mode 100644 index 0000000..65a9836 --- /dev/null +++ b/src/silx/opencl/backprojection.py @@ -0,0 +1,397 @@ +#!/usr/bin/env python +# coding: utf-8 +# /*########################################################################## +# +# Copyright (c) 2016 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+# +# ###########################################################################*/ +"""Module for (filtered) backprojection on the GPU""" + +from __future__ import absolute_import, print_function, with_statement, division + +__authors__ = ["A. Mirone, P. Paleo"] +__license__ = "MIT" +__date__ = "25/01/2019" + +import logging +import numpy as np + +from .common import pyopencl +from .processing import EventDescription, OpenclProcessing, BufferDescription +from .sinofilter import SinoFilter +from .sinofilter import fourier_filter as fourier_filter_ +from ..utils.deprecation import deprecated + +if pyopencl: + mf = pyopencl.mem_flags + import pyopencl.array as parray +else: + raise ImportError("Please install pyopencl in order to use opencl backprojection") +logger = logging.getLogger(__name__) + + +def _sizeof(Type): + """ + return the size (in bytes) of a scalar type, like the C behavior + """ + return np.dtype(Type).itemsize + + +def _idivup(a, b): + """ + return the integer division, plus one if `a` is not a multiple of `b` + """ + return (a + (b - 1)) // b + + +class Backprojection(OpenclProcessing): + """A class for performing the backprojection using OpenCL""" + kernel_files = ["backproj.cl", "array_utils.cl"] + + def __init__(self, sino_shape, slice_shape=None, axis_position=None, + angles=None, filter_name=None, ctx=None, devicetype="all", + platformid=None, deviceid=None, profile=False, + extra_options=None): + """Constructor of the OpenCL (filtered) backprojection + + :param sino_shape: shape of the sinogram. The sinogram is in the format + (n_b, n_a) where n_b is the number of detector bins + and n_a is the number of angles. + :param slice_shape: Optional, shape of the reconstructed slice. By + default, it is a square slice where the dimension + is the "x dimension" of the sinogram (number of + bins). + :param axis_position: Optional, axis position. Default is + `(shape[1]-1)/2.0`. + :param angles: Optional, a list of custom angles in radian. 
+ :param filter_name: Optional, name of the filter for FBP. Default is + the Ram-Lak filter. + :param ctx: actual working context, left to None for automatic + initialization from device type or platformid/deviceid + :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL" + :param platformid: integer with the platform_identifier, as given by + clinfo + :param deviceid: Integer with the device identifier, as given by clinfo + :param profile: switch on profiling to be able to profile at the kernel + level, store profiling elements (makes code slightly + slower) + :param extra_options: Advanced extra options in the form of a dict. + Current options are: cutoff, use_numpy_fft + """ + # OS X enforces a workgroup size of 1 when the kernel has + # synchronization barriers if sys.platform.startswith('darwin'): + # assuming no discrete GPU + # raise NotImplementedError("Backprojection is not implemented on CPU for OS X yet") + + OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype, + platformid=platformid, deviceid=deviceid, + profile=profile) + + self._init_geometry(sino_shape, slice_shape, angles, axis_position, + extra_options) + self._allocate_memory() + self._compute_angles() + self._init_kernels() + self._init_filter(filter_name) + + def _init_geometry(self, sino_shape, slice_shape, angles, axis_position, + extra_options): + """Geometry Initialization + + :param sino_shape: shape of the sinogram. The sinogram is in the format + (n_b, n_a) where n_b is the number of detector bins + and n_a is the number of angles. + :param slice_shape: shape of the reconstructed slice. By + default, it is a square slice where the dimension + is the "x dimension" of the sinogram (number of + bins). + :param angles: list of projection angles in radian. 
+ :param axis_position: axis position + :param dict extra_options: Advanced extra options + """ + self.shape = sino_shape + self.num_bins = np.int32(sino_shape[1]) + self.num_projs = np.int32(sino_shape[0]) + self.angles = angles + if slice_shape is None: + self.slice_shape = (self.num_bins, self.num_bins) + else: + self.slice_shape = slice_shape + self.dimrec_shape = ( + _idivup(self.slice_shape[0], 32) * 32, + _idivup(self.slice_shape[1], 32) * 32 + ) + if axis_position: + self.axis_pos = np.float32(axis_position) + else: + self.axis_pos = np.float32((sino_shape[1] - 1.) / 2) + self.axis_array = None # TODO: add axis correction front-end + self._init_extra_options(extra_options) + + def _init_extra_options(self, extra_options): + """Backprojection extra option initialization + + :param dict extra_options: Advanced extra options + """ + self.extra_options = { + "cutoff": 1., + "use_numpy_fft": False, + # It is axis_pos - (num_bins-1)/2 in PyHST + "gpu_offset_x": 0., #self.axis_pos - (self.num_bins - 1) / 2., + "gpu_offset_y": 0., #self.axis_pos - (self.num_bins - 1) / 2. 
+ } + if extra_options is not None: + self.extra_options.update(extra_options) + + def _allocate_memory(self): + # Host memory + self.slice = np.zeros(self.dimrec_shape, dtype=np.float32) + self._use_textures = self.check_textures_availability() + + # Device memory + self.buffers = [ + BufferDescription("_d_slice", self.dimrec_shape, np.float32, mf.READ_WRITE), + BufferDescription("d_sino", self.shape, np.float32, mf.READ_WRITE), # before transferring to texture (if available) + BufferDescription("d_cos", (self.num_projs,), np.float32, mf.READ_ONLY), + BufferDescription("d_sin", (self.num_projs,), np.float32, mf.READ_ONLY), + BufferDescription("d_axes", (self.num_projs,), np.float32, mf.READ_ONLY), + ] + self.allocate_buffers(use_array=True) + self.d_sino = self.cl_mem["d_sino"] # shorthand + + # Texture memory (if relevant) + if self._use_textures: + self._allocate_textures() + + # Local memory + self.local_mem = 256 * 3 * _sizeof(np.float32) # constant for all image sizes + + def _compute_angles(self): + if self.angles is None: + self.angles = np.linspace(0, np.pi, self.num_projs, False) + h_cos = np.cos(self.angles).astype(np.float32) + h_sin = np.sin(self.angles).astype(np.float32) + self.cl_mem["d_cos"][:] = h_cos[:] + self.cl_mem["d_sin"][:] = h_sin[:] + if self.axis_array: + self.cl_mem["d_axes"][:] = self.axis_array.astype(np.float32)[:] + else: + self.cl_mem["d_axes"][:] = np.ones(self.num_projs, dtype="f") * self.axis_pos + + def _init_kernels(self): + compile_options = None + if not(self._use_textures): + compile_options = "-DDONT_USE_TEXTURES" + OpenclProcessing.compile_kernels( + self, + self.kernel_files, + compile_options=compile_options + ) + # check that workgroup can actually be (16, 16) + self.compiletime_workgroup_size = self.kernels.max_workgroup_size("backproj_cpu_kernel") + # Workgroup and ndrange sizes are always the same + self.wg = (16, 16) + self.ndrange = ( + _idivup(int(self.dimrec_shape[1]), 32) * self.wg[0], + 
_idivup(int(self.dimrec_shape[0]), 32) * self.wg[1] + ) + # Prepare arguments for the kernel call + if not(self._use_textures): + d_sino_ref = self.d_sino.data + else: + d_sino_ref = self.d_sino_tex + self._backproj_kernel_args = ( + # num of projections (int32) + self.num_projs, + # num of bins (int32) + self.num_bins, + # axis position (float32) + self.axis_pos, + # d_slice (__global float32*) + self.cl_mem["_d_slice"].data, + # d_sino (__read_only image2d_t or float*) + d_sino_ref, + # gpu_offset_x (float32) + np.float32(self.extra_options["gpu_offset_x"]), + # gpu_offset_y (float32) + np.float32(self.extra_options["gpu_offset_y"]), + # d_cos (__global float32*) + self.cl_mem["d_cos"].data, + # d_sin (__global float32*) + self.cl_mem["d_sin"].data, + # d_axis (__global float32*) + self.cl_mem["d_axes"].data, + # shared mem (__local float32*) + self._get_local_mem() + ) + + def _allocate_textures(self): + """ + Allocate the texture for the sinogram. + """ + self.d_sino_tex = self.allocate_texture(self.shape) + + def _init_filter(self, filter_name): + """Filter initialization + + :param str filter_name: filter name + """ + self.filter_name = filter_name or "ram-lak" + self.sino_filter = SinoFilter( + self.shape, + ctx=self.ctx, + filter_name=self.filter_name, + extra_options=self.extra_options, + ) + + def _get_local_mem(self): + return pyopencl.LocalMemory(self.local_mem) # constant for all image sizes + + def _cpy2d_to_slice(self, dst): + ndrange = (int(self.slice_shape[1]), int(self.slice_shape[0])) + slice_shape_ocl = np.int32(ndrange) + wg = None + kernel_args = ( + dst.data, + self.cl_mem["_d_slice"].data, + np.int32(self.slice_shape[1]), + np.int32(self.dimrec_shape[1]), + np.int32((0, 0)), + np.int32((0, 0)), + slice_shape_ocl + ) + return self.kernels.cpy2d(self.queue, ndrange, wg, *kernel_args) + + def _transfer_to_texture(self, sino): + if isinstance(sino, parray.Array): + return self._transfer_device_to_texture(sino) + sino2 = sino + if 
not(sino.flags["C_CONTIGUOUS"] and sino.dtype == np.float32): + sino2 = np.ascontiguousarray(sino, dtype=np.float32) + if not(self._use_textures): + ev = pyopencl.enqueue_copy( + self.queue, + self.d_sino.data, + sino2 + ) + what = "transfer filtered sino H->D buffer" + ev.wait() + else: + ev = pyopencl.enqueue_copy( + self.queue, + self.d_sino_tex, + sino2, + origin=(0, 0), + region=self.shape[::-1] + ) + what = "transfer filtered sino H->D texture" + return EventDescription(what, ev) + + def _transfer_device_to_texture(self, d_sino): + if not(self._use_textures): + if id(self.d_sino) == id(d_sino): + return + ev = pyopencl.enqueue_copy( + self.queue, + self.d_sino.data, + d_sino + ) + what = "transfer filtered sino D->D buffer" + ev.wait() + else: + ev = pyopencl.enqueue_copy( + self.queue, + self.d_sino_tex, + d_sino.data, + offset=0, + origin=(0, 0), + region=self.shape[::-1] + ) + what = "transfer filtered sino D->D texture" + return EventDescription(what, ev) + + def backprojection(self, sino, output=None): + """Perform the backprojection on an input sinogram + + :param sino: sinogram. + :param output: optional, output slice. + If provided, the result will be written in this array. + :return: backprojection of sinogram + """ + events = [] + with self.sem: + events.append(self._transfer_to_texture(sino)) + # Call the backprojection kernel + if not(self._use_textures): + kernel_to_call = self.kernels.backproj_cpu_kernel + else: + kernel_to_call = self.kernels.backproj_kernel + kernel_to_call( + self.queue, + self.ndrange, + self.wg, + *self._backproj_kernel_args + ) + # Return + if output is None: + res = self.cl_mem["_d_slice"].get() + res = res[:self.slice_shape[0], :self.slice_shape[1]] + else: + res = output + self._cpy2d_to_slice(output) + + # /with self.sem + if self.profile: + self.events += events + + return res + + def filtered_backprojection(self, sino, output=None): + """ + Compute the filtered backprojection (FBP) on a sinogram. 
+ + :param sino: sinogram (`np.ndarray` or `pyopencl.array.Array`) + with the shape (n_projections, n_bins) + :param output: output (`np.ndarray` or `pyopencl.array.Array`). + If nothing is provided, a new numpy array is returned. + """ + # Filter + self.sino_filter(sino, output=self.d_sino) + # Backproject + res = self.backprojection(self.d_sino, output=output) + return res + + __call__ = filtered_backprojection + + + # ------------------- + # - Compatibility - + # ------------------- + + @deprecated(replacement="Backprojection.sino_filter", since_version="0.10") + def filter_projections(self, sino, rescale=True): + self.sino_filter(sino, output=self.d_sino) + + + +def fourier_filter(sino, filter_=None, fft_size=None): + return fourier_filter_(sino, filter_=filter_, fft_size=fft_size) + diff --git a/src/silx/opencl/codec/__init__.py b/src/silx/opencl/codec/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/src/silx/opencl/codec/__init__.py diff --git a/src/silx/opencl/codec/byte_offset.py b/src/silx/opencl/codec/byte_offset.py new file mode 100644 index 0000000..9a52427 --- /dev/null +++ b/src/silx/opencl/codec/byte_offset.py @@ -0,0 +1,439 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Project: Sift implementation in Python + OpenCL +# https://github.com/silx-kit/silx +# +# Copyright (C) 2013-2020 European Synchrotron Radiation Facility, Grenoble, France +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the 
Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +""" +This module provides a class for CBF byte offset compression/decompression. +""" + +from __future__ import division, print_function, with_statement + +__authors__ = ["Jérôme Kieffer"] +__contact__ = "jerome.kieffer@esrf.eu" +__license__ = "MIT" +__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "11/10/2018" +__status__ = "production" + + +import functools +import os +import numpy +from ..common import ocl, pyopencl +from ..processing import BufferDescription, EventDescription, OpenclProcessing + +import logging +logger = logging.getLogger(__name__) + +if pyopencl: + import pyopencl.version + if pyopencl.version.VERSION < (2016, 0): + from pyopencl.scan import GenericScanKernel, GenericDebugScanKernel + else: + from pyopencl.algorithm import GenericScanKernel + from pyopencl.scan import GenericDebugScanKernel +else: + logger.warning("No PyOpenCL, no byte-offset, please see fabio") + + +class ByteOffset(OpenclProcessing): + """Perform the byte offset compression/decompression on the GPU + + See :class:`OpenclProcessing` for optional arguments description. + + :param int raw_size: + Size of the raw stream for decompression. + It can be (slightly) larger than the array. 
+ :param int dec_size: + Size of the decompression output array + (mandatory for decompression) + """ + + def __init__(self, raw_size=None, dec_size=None, + ctx=None, devicetype="all", + platformid=None, deviceid=None, + block_size=None, profile=False): + OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype, + platformid=platformid, deviceid=deviceid, + block_size=block_size, profile=profile) + if self.block_size is None: + self.block_size = self.device.max_work_group_size + wg = self.block_size + + buffers = [BufferDescription("counter", 1, numpy.int32, None)] + + if raw_size is None: + self.raw_size = -1 + self.padded_raw_size = -1 + else: + self.raw_size = int(raw_size) + self.padded_raw_size = int((self.raw_size + wg - 1) & ~(wg - 1)) + buffers += [ + BufferDescription("raw", self.padded_raw_size, numpy.int8, None), + BufferDescription("mask", self.padded_raw_size, numpy.int32, None), + BufferDescription("values", self.padded_raw_size, numpy.int32, None), + BufferDescription("exceptions", self.padded_raw_size, numpy.int32, None) + ] + + if dec_size is None: + self.dec_size = None + else: + self.dec_size = numpy.int32(dec_size) + buffers += [ + BufferDescription("data_float", self.dec_size, numpy.float32, None), + BufferDescription("data_int", self.dec_size, numpy.int32, None) + ] + + self.allocate_buffers(buffers, use_array=True) + + self.compile_kernels([os.path.join("codec", "byte_offset")]) + self.kernels.__setattr__("scan", self._init_double_scan()) + self.kernels.__setattr__("compression_scan", + self._init_compression_scan()) + + def _init_double_scan(self): + """"generates a double scan on indexes and values in one operation""" + arguments = "__global int *value", "__global int *index" + int2 = pyopencl.tools.get_or_register_dtype("int2") + input_expr = "index[i]>0 ? 
(int2)(0, 0) : (int2)(value[i], 1)" + scan_expr = "a+b" + neutral = "(int2)(0,0)" + output_statement = "value[i] = item.s0; index[i+1] = item.s1;" + + if self.block_size > 256: + knl = GenericScanKernel(self.ctx, + dtype=int2, + arguments=arguments, + input_expr=input_expr, + scan_expr=scan_expr, + neutral=neutral, + output_statement=output_statement) + else: # MacOS on CPU + knl = GenericDebugScanKernel(self.ctx, + dtype=int2, + arguments=arguments, + input_expr=input_expr, + scan_expr=scan_expr, + neutral=neutral, + output_statement=output_statement) + return knl + + def decode(self, raw, as_float=False, out=None): + """This function actually performs the decompression by calling the kernels + + :param numpy.ndarray raw: The compressed data as a 1D numpy array of char. + :param bool as_float: True to decompress as float32, + False (default) to decompress as int32 + :param pyopencl.array out: pyopencl array in which to place the result. + :return: The decompressed image as an pyopencl array. 
+ :rtype: pyopencl.array + """ + assert self.dec_size is not None, \ + "dec_size is a mandatory ByteOffset init argument for decompression" + + events = [] + with self.sem: + len_raw = numpy.int32(len(raw)) + if len_raw > self.padded_raw_size: + wg = self.block_size + self.raw_size = int(len(raw)) + self.padded_raw_size = (self.raw_size + wg - 1) & ~(wg - 1) + logger.info("increase raw buffer size to %s", self.padded_raw_size) + buffers = { + "raw": pyopencl.array.empty(self.queue, self.padded_raw_size, dtype=numpy.int8), + "mask": pyopencl.array.empty(self.queue, self.padded_raw_size, dtype=numpy.int32), + "exceptions": pyopencl.array.empty(self.queue, self.padded_raw_size, dtype=numpy.int32), + "values": pyopencl.array.empty(self.queue, self.padded_raw_size, dtype=numpy.int32), + } + self.cl_mem.update(buffers) + else: + wg = self.block_size + + evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["raw"].data, + raw, + is_blocking=False) + events.append(EventDescription("copy raw H -> D", evt)) + evt = self.kernels.fill_int_mem(self.queue, (self.padded_raw_size,), (wg,), + self.cl_mem["mask"].data, + numpy.int32(self.padded_raw_size), + numpy.int32(0), + numpy.int32(0)) + events.append(EventDescription("memset mask", evt)) + evt = self.kernels.fill_int_mem(self.queue, (1,), (1,), + self.cl_mem["counter"].data, + numpy.int32(1), + numpy.int32(0), + numpy.int32(0)) + events.append(EventDescription("memset counter", evt)) + evt = self.kernels.mark_exceptions(self.queue, (self.padded_raw_size,), (wg,), + self.cl_mem["raw"].data, + len_raw, + numpy.int32(self.raw_size), + self.cl_mem["mask"].data, + self.cl_mem["values"].data, + self.cl_mem["counter"].data, + self.cl_mem["exceptions"].data) + events.append(EventDescription("mark exceptions", evt)) + nb_exceptions = numpy.empty(1, dtype=numpy.int32) + evt = pyopencl.enqueue_copy(self.queue, nb_exceptions, self.cl_mem["counter"].data, + is_blocking=False) + events.append(EventDescription("copy counter D -> H", evt)) + 
evt.wait() + nbexc = int(nb_exceptions[0]) + if nbexc == 0: + logger.info("nbexc %i", nbexc) + else: + evt = self.kernels.treat_exceptions(self.queue, (nbexc,), (1,), + self.cl_mem["raw"].data, + len_raw, + self.cl_mem["mask"].data, + self.cl_mem["exceptions"].data, + self.cl_mem["values"].data + ) + events.append(EventDescription("treat_exceptions", evt)) + + #self.cl_mem["copy_values"] = self.cl_mem["values"].copy() + #self.cl_mem["copy_mask"] = self.cl_mem["mask"].copy() + evt = self.kernels.scan(self.cl_mem["values"], + self.cl_mem["mask"], + queue=self.queue, + size=int(len_raw), + wait_for=(evt,)) + events.append(EventDescription("double scan", evt)) + #evt.wait() + if out is not None: + if out.dtype == numpy.float32: + copy_results = self.kernels.copy_result_float + else: + copy_results = self.kernels.copy_result_int + else: + if as_float: + out = self.cl_mem["data_float"] + copy_results = self.kernels.copy_result_float + else: + out = self.cl_mem["data_int"] + copy_results = self.kernels.copy_result_int + evt = copy_results(self.queue, (self.padded_raw_size,), (wg,), + self.cl_mem["values"].data, + self.cl_mem["mask"].data, + len_raw, + self.dec_size, + out.data + ) + events.append(EventDescription("copy_results", evt)) + #evt.wait() + if self.profile: + self.events += events + return out + + __call__ = decode + + def _init_compression_scan(self): + """Initialize CBF compression scan kernels""" + preamble = """ + int compressed_size(int diff) { + int abs_diff = abs(diff); + + if (abs_diff < 128) { + return 1; + } + else if (abs_diff < 32768) { + return 3; + } + else { + return 7; + } + } + + void write(const int index, + const int diff, + global char *output) { + int abs_diff = abs(diff); + + if (abs_diff < 128) { + output[index] = (char) diff; + } + else if (abs_diff < 32768) { + output[index] = -128; + output[index + 1] = (char) (diff >> 0); + output[index + 2] = (char) (diff >> 8); + } + else { + output[index] = -128; + output[index + 1] = 0; + 
output[index + 2] = -128; + output[index + 3] = (char) (diff >> 0); + output[index + 4] = (char) (diff >> 8); + output[index + 5] = (char) (diff >> 16); + output[index + 6] = (char) (diff >> 24); + } + } + """ + arguments = "__global const int *data, __global char *compressed, __global int *size" + input_expr = "compressed_size((i == 0) ? data[0] : (data[i] - data[i - 1]))" + scan_expr = "a+b" + neutral = "0" + output_statement = """ + if (prev_item == 0) { // 1st thread store compressed data size + size[0] = last_item; + } + write(prev_item, (i == 0) ? data[0] : (data[i] - data[i - 1]), compressed); + """ + + if self.block_size >= 64: + knl = GenericScanKernel(self.ctx, + dtype=numpy.int32, + preamble=preamble, + arguments=arguments, + input_expr=input_expr, + scan_expr=scan_expr, + neutral=neutral, + output_statement=output_statement) + else: # MacOS on CPU + knl = GenericDebugScanKernel(self.ctx, + dtype=numpy.int32, + preamble=preamble, + arguments=arguments, + input_expr=input_expr, + scan_expr=scan_expr, + neutral=neutral, + output_statement=output_statement) + return knl + + def encode(self, data, out=None): + """Compress data to CBF. + + :param data: The data to compress as a numpy array + (or a pyopencl Array) of int32. + :type data: Union[numpy.ndarray, pyopencl.array.Array] + :param pyopencl.array out: + pyopencl array of int8 in which to store the result. + The array should be large enough to store the compressed data. + :return: The compressed data as a pyopencl array. + If out is provided, this array shares the backing buffer, + but has the exact size of the compressed data and the queue + of the ByteOffset instance. 
+ :rtype: pyopencl.array + :raises ValueError: if out array is not large enough + """ + + events = [] + with self.sem: + if isinstance(data, pyopencl.array.Array): + d_data = data # Uses provided array + + else: # Copy data to device + data = numpy.ascontiguousarray(data, dtype=numpy.int32).ravel() + + # Make sure data array exists and is large enough + if ("data_input" not in self.cl_mem or + self.cl_mem["data_input"].size < data.size): + logger.info("increase data input buffer size to %s", data.size) + self.cl_mem.update({ + "data_input": pyopencl.array.empty(self.queue, + data.size, + dtype=numpy.int32)}) + d_data = self.cl_mem["data_input"] + + evt = pyopencl.enqueue_copy( + self.queue, d_data.data, data, is_blocking=False) + events.append(EventDescription("copy data H -> D", evt)) + + # Make sure compressed array exists and is large enough + compressed_size = d_data.size * 7 + if ("compressed" not in self.cl_mem or + self.cl_mem["compressed"].size < compressed_size): + logger.info("increase compressed buffer size to %s", compressed_size) + self.cl_mem.update({ + "compressed": pyopencl.array.empty(self.queue, + compressed_size, + dtype=numpy.int8)}) + d_compressed = self.cl_mem["compressed"] + d_size = self.cl_mem["counter"] # Shared with decompression + + evt = self.kernels.compression_scan(d_data, d_compressed, d_size) + events.append(EventDescription("compression scan", evt)) + byte_count = int(d_size.get()[0]) + + if out is None: + # Create out array from a sub-region of the compressed buffer + out = pyopencl.array.Array( + self.queue, + shape=(byte_count,), + dtype=numpy.int8, + allocator=functools.partial( + d_compressed.base_data.get_sub_region, + d_compressed.offset)) + + elif out.size < byte_count: + raise ValueError( + "Provided output buffer is not large enough: " + "requires %d bytes, got %d" % (byte_count, out.size)) + + else: # out.size >= byte_count + # Create an array with a sub-region of out and this class queue + out = pyopencl.array.Array( + 
self.queue, + shape=(byte_count,), + dtype=numpy.int8, + allocator=functools.partial(out.base_data.get_sub_region, + out.offset)) + + evt = pyopencl.enqueue_copy(self.queue, out.data, d_compressed.data, + byte_count=byte_count) + events.append( + EventDescription("copy D -> D: internal -> out", evt)) + + if self.profile: + self.events += events + + return out + + def encode_to_bytes(self, data): + """Compresses data to CBF and returns compressed data as bytes. + + Usage: + + Provided an image (`image`) stored as a numpy array of int32, + first, create a byte offset compression/decompression object: + + >>> from silx.opencl.codec.byte_offset import ByteOffset + >>> byte_offset_codec = ByteOffset() + + Then, compress an image into bytes: + + >>> compressed = byte_offset_codec.encode_to_bytes(image) + + :param data: The data to compress as a numpy array + (or a pyopencl Array) of int32. + :type data: Union[numpy.ndarray, pyopencl.array.Array] + :return: The compressed data as bytes. + :rtype: bytes + """ + compressed_array = self.encode(data) + return compressed_array.get().tobytes() diff --git a/src/silx/opencl/codec/setup.py b/src/silx/opencl/codec/setup.py new file mode 100644 index 0000000..4a5c1e5 --- /dev/null +++ b/src/silx/opencl/codec/setup.py @@ -0,0 +1,43 @@ +# coding: utf-8 +# +# Copyright (C) 2016-2017 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# + +from __future__ import division + +__contact__ = "jerome.kieffer@esrf.eu" +__license__ = "MIT" +__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France" +__authors__ = ["J. Kieffer"] +__date__ = "13/10/2017" + +from numpy.distutils.misc_util import Configuration + + +def configuration(parent_package='', top_path=None): + config = Configuration('codec', parent_package, top_path) + config.add_subpackage('test') + return config + + +if __name__ == "__main__": + from numpy.distutils.core import setup + setup(configuration=configuration) diff --git a/src/silx/opencl/codec/test/__init__.py b/src/silx/opencl/codec/test/__init__.py new file mode 100644 index 0000000..325c2c7 --- /dev/null +++ b/src/silx/opencl/codec/test/__init__.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- +# +# Project: silx +# https://github.com/silx-kit/silx +# +# Copyright (C) 2013-2017 European Synchrotron Radiation Facility, Grenoble, France +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial 
portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. diff --git a/src/silx/opencl/codec/test/test_byte_offset.py b/src/silx/opencl/codec/test/test_byte_offset.py new file mode 100644 index 0000000..4b2d5a3 --- /dev/null +++ b/src/silx/opencl/codec/test/test_byte_offset.py @@ -0,0 +1,303 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Project: Byte-offset decompression in OpenCL +# https://github.com/silx-kit/silx +# +# Copyright (C) 2013-2020 European Synchrotron Radiation Facility, +# Grenoble, France +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +""" +Test suite for byte-offset decompression +""" + +from __future__ import division, print_function + +__authors__ = ["Jérôme Kieffer"] +__contact__ = "jerome.kieffer@esrf.eu" +__license__ = "MIT" +__copyright__ = "2013 European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "02/03/2021" + +import sys +import time +import logging +import numpy +from silx.opencl.common import ocl, pyopencl +from silx.opencl.codec import byte_offset +import fabio +import unittest +logger = logging.getLogger(__name__) + + +@unittest.skipUnless(ocl and pyopencl, + "PyOpenCl is missing") +class TestByteOffset(unittest.TestCase): + + @staticmethod + def _create_test_data(shape, nexcept, lam=200): + """Create test (image, compressed stream) pair. 
+ + :param shape: Shape of test image + :param int nexcept: Number of exceptions in the image + :param lam: Expectation of interval argument for numpy.random.poisson + :return: (reference image array, compressed stream) + """ + size = numpy.prod(shape) + ref = numpy.random.poisson(lam, numpy.prod(shape)) + exception_loc = numpy.random.randint(0, size, size=nexcept) + exception_value = numpy.random.randint(0, 1000000, size=nexcept) + ref[exception_loc] = exception_value + ref.shape = shape + + raw = fabio.compression.compByteOffset(ref) + return ref, raw + + def test_decompress(self): + """ + tests the byte offset decompression on GPU + """ + ref, raw = self._create_test_data(shape=(91, 97), nexcept=229) + # ref, raw = self._create_test_data(shape=(7, 9), nexcept=0) + + size = numpy.prod(ref.shape) + + try: + bo = byte_offset.ByteOffset(raw_size=len(raw), dec_size=size, profile=True) + except (RuntimeError, pyopencl.RuntimeError) as err: + logger.warning(err) + if sys.platform == "darwin": + raise unittest.SkipTest("Byte-offset decompression is known to be buggy on MacOS-CPU") + else: + raise err + print(bo.block_size) + + t0 = time.time() + res_cy = fabio.compression.decByteOffset(raw) + t1 = time.time() + res_cl = bo.decode(raw) + t2 = time.time() + delta_cy = abs(ref.ravel() - res_cy).max() + delta_cl = abs(ref.ravel() - res_cl.get()).max() + + logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.", + 1000.0 * (t1 - t0), + 1000.0 * (t2 - t1)) + bo.log_profile() + # print(ref) + # print(res_cl.get()) + self.assertEqual(delta_cy, 0, "Checks fabio works") + self.assertEqual(delta_cl, 0, "Checks opencl works") + + def test_many_decompress(self, ntest=10): + """ + tests the byte offset decompression on GPU, many images to ensure there + is not leaking in memory + """ + shape = (991, 997) + size = numpy.prod(shape) + ref, raw = self._create_test_data(shape=shape, nexcept=0, lam=100) + + try: + bo = byte_offset.ByteOffset(len(raw), size, profile=True) + 
except (RuntimeError, pyopencl.RuntimeError) as err: + logger.warning(err) + if sys.platform == "darwin": + raise unittest.SkipTest("Byte-offset decompression is known to be buggy on MacOS-CPU") + else: + raise err + t0 = time.time() + res_cy = fabio.compression.decByteOffset(raw) + t1 = time.time() + res_cl = bo(raw) + t2 = time.time() + delta_cy = abs(ref.ravel() - res_cy).max() + delta_cl = abs(ref.ravel() - res_cl.get()).max() + self.assertEqual(delta_cy, 0, "Checks fabio works") + self.assertEqual(delta_cl, 0, "Checks opencl works") + logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.", + 1000.0 * (t1 - t0), + 1000.0 * (t2 - t1)) + + for i in range(ntest): + ref, raw = self._create_test_data(shape=shape, nexcept=2729, lam=200) + + t0 = time.time() + res_cy = fabio.compression.decByteOffset(raw) + t1 = time.time() + res_cl = bo(raw) + t2 = time.time() + delta_cy = abs(ref.ravel() - res_cy).max() + delta_cl = abs(ref.ravel() - res_cl.get()).max() + self.assertEqual(delta_cy, 0, "Checks fabio works #%i" % i) + self.assertEqual(delta_cl, 0, "Checks opencl works #%i" % i) + + logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.", + 1000.0 * (t1 - t0), + 1000.0 * (t2 - t1)) + bo.log_profile(stats=True) + + def test_encode(self): + """Test byte offset compression""" + ref, raw = self._create_test_data(shape=(2713, 2719), nexcept=2729) + + try: + bo = byte_offset.ByteOffset(len(raw), ref.size, profile=True) + except (RuntimeError, pyopencl.RuntimeError) as err: + logger.warning(err) + raise err + + t0 = time.time() + compressed_array = bo.encode(ref) + t1 = time.time() + + compressed_stream = compressed_array.get().tobytes() + self.assertEqual(raw, compressed_stream) + + logger.debug("Global execution time: OpenCL: %.3fms.", + 1000.0 * (t1 - t0)) + bo.log_profile() + + def test_encode_to_array(self): + """Test byte offset compression while providing an out array""" + + ref, raw = self._create_test_data(shape=(2713, 2719), nexcept=2729) + 
+ try: + bo = byte_offset.ByteOffset(profile=True) + except (RuntimeError, pyopencl.RuntimeError) as err: + logger.warning(err) + raise err + # Test with out buffer too small + out = pyopencl.array.empty(bo.queue, (10,), numpy.int8) + with self.assertRaises(ValueError): + bo.encode(ref, out) + + # Test with out buffer too big + out = pyopencl.array.empty(bo.queue, (len(raw) + 10,), numpy.int8) + + compressed_array = bo.encode(ref, out) + + # Get size from returned array + compressed_size = compressed_array.size + self.assertEqual(compressed_size, len(raw)) + + # Get data from out array, read it from bo object queue + out_bo_queue = out.with_queue(bo.queue) + compressed_stream = out_bo_queue.get().tobytes()[:compressed_size] + self.assertEqual(raw, compressed_stream) + + def test_encode_to_bytes(self): + """Test byte offset compression to bytes""" + ref, raw = self._create_test_data(shape=(2713, 2719), nexcept=2729) + + try: + bo = byte_offset.ByteOffset(profile=True) + except (RuntimeError, pyopencl.RuntimeError) as err: + logger.warning(err) + raise err + + t0 = time.time() + res_fabio = fabio.compression.compByteOffset(ref) + t1 = time.time() + compressed_stream = bo.encode_to_bytes(ref) + t2 = time.time() + + self.assertEqual(raw, compressed_stream) + + logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.", + 1000.0 * (t1 - t0), + 1000.0 * (t2 - t1)) + bo.log_profile() + + def test_encode_to_bytes_from_array(self): + """Test byte offset compression to bytes from a pyopencl array. 
+ """ + ref, raw = self._create_test_data(shape=(2713, 2719), nexcept=2729) + + try: + bo = byte_offset.ByteOffset(profile=True) + except (RuntimeError, pyopencl.RuntimeError) as err: + logger.warning(err) + raise err + + d_ref = pyopencl.array.to_device( + bo.queue, ref.astype(numpy.int32).ravel()) + + t0 = time.time() + res_fabio = fabio.compression.compByteOffset(ref) + t1 = time.time() + compressed_stream = bo.encode_to_bytes(d_ref) + t2 = time.time() + + self.assertEqual(raw, compressed_stream) + + logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.", + 1000.0 * (t1 - t0), + 1000.0 * (t2 - t1)) + bo.log_profile() + + def test_many_encode(self, ntest=10): + """Test byte offset compression with many image""" + shape = (991, 997) + ref, raw = self._create_test_data(shape=shape, nexcept=0, lam=100) + + try: + bo = byte_offset.ByteOffset(profile=False) + except (RuntimeError, pyopencl.RuntimeError) as err: + logger.warning(err) + raise err + + bo_durations = [] + + t0 = time.time() + res_fabio = fabio.compression.compByteOffset(ref) + t1 = time.time() + compressed_stream = bo.encode_to_bytes(ref) + t2 = time.time() + bo_durations.append(1000.0 * (t2 - t1)) + + self.assertEqual(raw, compressed_stream) + logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.", + 1000.0 * (t1 - t0), + 1000.0 * (t2 - t1)) + + for i in range(ntest): + ref, raw = self._create_test_data(shape=shape, nexcept=2729, lam=200) + + t0 = time.time() + res_fabio = fabio.compression.compByteOffset(ref) + t1 = time.time() + compressed_stream = bo.encode_to_bytes(ref) + t2 = time.time() + bo_durations.append(1000.0 * (t2 - t1)) + + self.assertEqual(raw, compressed_stream) + logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.", + 1000.0 * (t1 - t0), + 1000.0 * (t2 - t1)) + + logger.debug("OpenCL execution time: Mean: %fms, Min: %fms, Max: %fms", + numpy.mean(bo_durations), + numpy.min(bo_durations), + numpy.max(bo_durations)) diff --git 
a/src/silx/opencl/common.py b/src/silx/opencl/common.py new file mode 100644 index 0000000..888b1da --- /dev/null +++ b/src/silx/opencl/common.py @@ -0,0 +1,694 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Project: S I L X project +# https://github.com/silx-kit/silx +# +# Copyright (C) 2012-2021 European Synchrotron Radiation Facility, Grenoble, France +# +# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu) +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+# + +__author__ = "Jerome Kieffer" +__contact__ = "Jerome.Kieffer@ESRF.eu" +__license__ = "MIT" +__copyright__ = "2012-2017 European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "29/09/2021" +__status__ = "stable" +__all__ = ["ocl", "pyopencl", "mf", "release_cl_buffers", "allocate_cl_buffers", + "measure_workgroup_size", "kernel_workgroup_size"] + +import os +import logging + +import numpy + +from .utils import get_opencl_code + +logger = logging.getLogger(__name__) + +if os.environ.get("SILX_OPENCL") in ["0", "False"]: + logger.info("Use of OpenCL has been disabled from environment variable: SILX_OPENCL=0") + pyopencl = None +else: + try: + import pyopencl + except ImportError: + logger.warning("Unable to import pyOpenCl. Please install it from: https://pypi.org/project/pyopencl") + pyopencl = None + else: + try: + pyopencl.get_platforms() + except pyopencl.LogicError: + logger.warning("The module pyOpenCL has been imported but can't be used here") + pyopencl = None + else: + import pyopencl.array as array + mf = pyopencl.mem_flags + +if pyopencl is None: + + # Define default mem flags + class mf(object): + WRITE_ONLY = 1 + READ_ONLY = 1 + READ_WRITE = 1 +else: + mf = pyopencl.mem_flags + +FLOP_PER_CORE = {"GPU": 64, # GPU, Fermi at least perform 64 flops per cycle/multicore, G80 were at 24 or 48 ... + "CPU": 4, # CPU, at least intel's have 4 operation per cycle + "ACC": 8} # ACC: the Xeon-phi (MIC) appears to be able to process 8 Flops per hyperthreaded-core + +# Sources : https://en.wikipedia.org/wiki/CUDA +NVIDIA_FLOP_PER_CORE = {(1, 0): 24, # Guessed ! + (1, 1): 24, # Measured on G98 [Quadro NVS 295] + (1, 2): 24, # Guessed ! + (1, 3): 24, # measured on a GT285 (GT200) + (2, 0): 64, # Measured on a 580 (GF110) + (2, 1): 96, # Measured on Quadro2000 GF106GL + (3, 0): 384, # Guessed! + (3, 5): 384, # Measured on K20 + (3, 7): 384, # K80: Guessed! 
+ (5, 0): 256, # Maxwell 4 warps/SM 2 flops/ CU + (5, 2): 256, # Titan-X + (5, 3): 256, # TX1 + (6, 0): 128, # GP100 + (6, 1): 128, # GP104 + (6, 2): 128, # ? + (7, 0): 128, # Volta # measured on Telsa V100 + (7, 2): 128, # Volta ? + (7, 5): 128, # Turing # measured on RTX 6000 + (8, 0): 128, # Ampere # measured on Tesla A100 + (8, 6): 256, # Ampere # measured on RTX A5000 + } + +AMD_FLOP_PER_CORE = 160 # Measured on a M7820 10 core, 700MHz 1120GFlops + + +class Device(object): + """ + Simple class that contains the structure of an OpenCL device + """ + + def __init__(self, name="None", dtype=None, version=None, driver_version=None, + extensions="", memory=None, available=None, + cores=None, frequency=None, flop_core=None, idx=0, workgroup=1): + """ + Simple container with some important data for the OpenCL device description. + + :param name: name of the device + :param dtype: device type: CPU/GPU/ACC... + :param version: driver version + :param driver_version: + :param extensions: List of opencl extensions + :param memory: maximum memory available on the device + :param available: is the device deactivated or not + :param cores: number of SM/cores + :param frequency: frequency of the device + :param flop_core: Flopating Point operation per core per cycle + :param idx: index of the device within the platform + :param workgroup: max workgroup size + """ + self.name = name.strip() + self.type = dtype + self.version = version + self.driver_version = driver_version + self.extensions = extensions.split() + self.memory = memory + self.available = available + self.cores = cores + self.frequency = frequency + self.id = idx + self.max_work_group_size = workgroup + if not flop_core: + flop_core = FLOP_PER_CORE.get(dtype, 1) + if cores and frequency: + self.flops = cores * frequency * flop_core + else: + self.flops = flop_core + + def __repr__(self): + return "%s" % self.name + + def pretty_print(self): + """ + Complete device description + + :return: string + """ + lst = 
["Name\t\t:\t%s" % self.name, + "Type\t\t:\t%s" % self.type, + "Memory\t\t:\t%.3f MB" % (self.memory / 2.0 ** 20), + "Cores\t\t:\t%s CU" % self.cores, + "Frequency\t:\t%s MHz" % self.frequency, + "Speed\t\t:\t%.3f GFLOPS" % (self.flops / 1000.), + "Version\t\t:\t%s" % self.version, + "Available\t:\t%s" % self.available] + return os.linesep.join(lst) + + def set_unavailable(self): + """Use this method to flag a faulty device + """ + self.available = False + + +class Platform(object): + """ + Simple class that contains the structure of an OpenCL platform + """ + + def __init__(self, name="None", vendor="None", version=None, extensions=None, idx=0): + """ + Class containing all descriptions of a platform and all devices description within that platform. + + :param name: platform name + :param vendor: name of the brand/vendor + :param version: + :param extensions: list of the extension provided by the platform to all of its devices + :param idx: index of the platform + """ + self.name = name.strip() + self.vendor = vendor.strip() + self.version = version + self.extensions = extensions.split() + self.devices = [] + self.id = idx + + def __repr__(self): + return "%s" % self.name + + def add_device(self, device): + """ + Add new device to the platform + + :param device: Device instance + """ + self.devices.append(device) + + def get_device(self, key): + """ + Return a device according to key + + :param key: identifier for a device, either it's id (int) or it's name + :type key: int or str + """ + out = None + try: + devid = int(key) + except ValueError: + for a_dev in self.devices: + if a_dev.name == key: + out = a_dev + else: + if len(self.devices) > devid > 0: + out = self.devices[devid] + return out + + +def _measure_workgroup_size(device_or_context, fast=False): + """Mesure the maximal work group size of the given device + + DEPRECATED since not perfectly correct ! 
+ + :param device_or_context: instance of pyopencl.Device or pyopencl.Context + or 2-tuple (platformid,deviceid) + :param fast: ask the kernel the valid value, don't probe it + :return: maximum size for the workgroup + """ + if isinstance(device_or_context, pyopencl.Device): + try: + ctx = pyopencl.Context(devices=[device_or_context]) + except pyopencl._cl.LogicError as error: + platform = device_or_context.platform + platformid = pyopencl.get_platforms().index(platform) + deviceid = platform.get_devices().index(device_or_context) + ocl.platforms[platformid].devices[deviceid].set_unavailable() + raise RuntimeError("Unable to create context on %s/%s: %s" % (platform, device_or_context, error)) + else: + device = device_or_context + elif isinstance(device_or_context, pyopencl.Context): + ctx = device_or_context + device = device_or_context.devices[0] + elif isinstance(device_or_context, (tuple, list)) and len(device_or_context) == 2: + ctx = ocl.create_context(platformid=device_or_context[0], + deviceid=device_or_context[1]) + device = ctx.devices[0] + else: + raise RuntimeError("""given parameter device_or_context is not an + instanciation of a device or a context""") + shape = device.max_work_group_size + # get the context + + assert ctx is not None + queue = pyopencl.CommandQueue(ctx) + + max_valid_wg = 1 + data = numpy.random.random(shape).astype(numpy.float32) + d_data = pyopencl.array.to_device(queue, data) + d_data_1 = pyopencl.array.empty_like(d_data) + d_data_1.fill(numpy.float32(1.0)) + + program = pyopencl.Program(ctx, get_opencl_code("addition")).build() + if fast: + max_valid_wg = program.addition.get_work_group_info(pyopencl.kernel_work_group_info.WORK_GROUP_SIZE, device) + else: + maxi = int(round(numpy.log2(shape))) + for i in range(maxi + 1): + d_res = pyopencl.array.empty_like(d_data) + wg = 1 << i + try: + evt = program.addition( + queue, (shape,), (wg,), + d_data.data, d_data_1.data, d_res.data, numpy.int32(shape)) + evt.wait() + except Exception 
as error: + logger.info("%s on device %s for WG=%s/%s", error, device.name, wg, shape) + program = queue = d_res = d_data_1 = d_data = None + break + else: + res = d_res.get() + good = numpy.allclose(res, data + 1) + if good: + if wg > max_valid_wg: + max_valid_wg = wg + else: + logger.warning("ArithmeticError on %s for WG=%s/%s", wg, device.name, shape) + + return max_valid_wg + + +def _is_nvidia_gpu(vendor, devtype): + return (vendor == "NVIDIA Corporation") and (devtype == "GPU") + + +class OpenCL(object): + """ + Simple class that wraps the structure ocl_tools_extended.h + + This is a static class. + ocl should be the only instance and shared among all python modules. + """ + + platforms = [] + nb_devices = 0 + context_cache = {} # key: 2-tuple of int, value: context + if pyopencl: + platform = device = pypl = devtype = extensions = pydev = None + for idx, platform in enumerate(pyopencl.get_platforms()): + pypl = Platform(platform.name, platform.vendor, platform.version, platform.extensions, idx) + for idd, device in enumerate(platform.get_devices()): + #################################################### + # Nvidia does not report int64 atomics (we are using) ... + # this is a hack around as any nvidia GPU with double-precision supports int64 atomics + #################################################### + extensions = device.extensions + if (pypl.vendor == "NVIDIA Corporation") and ('cl_khr_fp64' in extensions): + extensions += ' cl_khr_int64_base_atomics cl_khr_int64_extended_atomics' + try: + devtype = pyopencl.device_type.to_string(device.type).upper() + except ValueError: + # pocl does not describe itself as a CPU ! 
+ devtype = "CPU" + if len(devtype) > 3: + if "GPU" in devtype: + devtype = "GPU" + elif "ACC" in devtype: + devtype = "ACC" + elif "CPU" in devtype: + devtype = "CPU" + else: + devtype = devtype[:3] + if _is_nvidia_gpu(device.vendor, devtype) and ("compute_capability_major_nv" in dir(device)): + try: + comput_cap = device.compute_capability_major_nv, device.compute_capability_minor_nv + except pyopencl.LogicError: + flop_core = FLOP_PER_CORE["GPU"] + else: + flop_core = NVIDIA_FLOP_PER_CORE.get(comput_cap, FLOP_PER_CORE["GPU"]) + elif (pypl.vendor == "Advanced Micro Devices, Inc.") and (devtype == "GPU"): + flop_core = AMD_FLOP_PER_CORE + elif devtype == "CPU": + flop_core = FLOP_PER_CORE.get(devtype, 1) + else: + flop_core = 1 + workgroup = device.max_work_group_size + if (devtype == "CPU") and (pypl.vendor == "Apple"): + logger.info("For Apple's OpenCL on CPU: Measuring actual valid max_work_goup_size.") + workgroup = _measure_workgroup_size(device, fast=True) + if (devtype == "GPU") and os.environ.get("GPU") == "False": + # Environment variable to disable GPU devices + continue + pydev = Device(device.name, devtype, device.version, device.driver_version, extensions, + device.global_mem_size, bool(device.available), device.max_compute_units, + device.max_clock_frequency, flop_core, idd, workgroup) + pypl.add_device(pydev) + nb_devices += 1 + platforms.append(pypl) + del platform, device, pypl, devtype, extensions, pydev + + def __repr__(self): + out = ["OpenCL devices:"] + for platformid, platform in enumerate(self.platforms): + deviceids = ["(%s,%s) %s" % (platformid, deviceid, dev.name) + for deviceid, dev in enumerate(platform.devices)] + out.append("[%s] %s: " % (platformid, platform.name) + ", ".join(deviceids)) + return os.linesep.join(out) + + def get_platform(self, key): + """ + Return a platform according + + :param key: identifier for a platform, either an Id (int) or it's name + :type key: int or str + """ + out = None + try: + platid = int(key) + 
except ValueError: + for a_plat in self.platforms: + if a_plat.name == key: + out = a_plat + else: + if len(self.platforms) > platid > 0: + out = self.platforms[platid] + return out + + def select_device(self, dtype="ALL", memory=None, extensions=None, best=True, **kwargs): + """ + Select a device based on few parameters (at the end, keep the one with most memory) + + :param dtype: "gpu" or "cpu" or "all" .... + :param memory: minimum amount of memory (int) + :param extensions: list of extensions to be present + :param best: shall we look for the + :returns: A tuple of plateform ID and device ID, else None if nothing + found + """ + if extensions is None: + extensions = [] + if "type" in kwargs: + dtype = kwargs["type"].upper() + else: + dtype = dtype.upper() + if len(dtype) > 3: + dtype = dtype[:3] + best_found = None + for platformid, platform in enumerate(self.platforms): + for deviceid, device in enumerate(platform.devices): + if not device.available: + continue + if (dtype in ["ALL", "DEF"]) or (device.type == dtype): + if (memory is None) or (memory <= device.memory): + found = True + for ext in extensions: + if ext not in device.extensions: + found = False + if found: + if not best: + return platformid, deviceid + else: + if not best_found: + best_found = platformid, deviceid, device.flops + elif best_found[2] < device.flops: + best_found = platformid, deviceid, device.flops + if best_found: + return best_found[0], best_found[1] + + # Nothing found + return None + + def create_context(self, devicetype="ALL", useFp64=False, platformid=None, + deviceid=None, cached=True, memory=None, extensions=None): + """ + Choose a device and initiate a context. + + Devicetypes can be GPU,gpu,CPU,cpu,DEF,ACC,ALL. + Suggested are GPU,CPU. + For each setting to work there must be such an OpenCL device and properly installed. + E.g.: If Nvidia driver is installed, GPU will succeed but CPU will fail. + The AMD SDK kit is required for CPU via OpenCL. 
+ :param devicetype: string in ["cpu","gpu", "all", "acc"] + :param useFp64: boolean specifying if double precision will be used: deprecated use extensions=["cl_khr_fp64"] + :param platformid: integer + :param deviceid: integer + :param cached: True if we want to cache the context + :param memory: minimum amount of memory of the device + :param extensions: list of extensions to be present + :return: OpenCL context on the selected device + """ + if extensions is None: + extensions = [] + if useFp64: + logger.warning("Deprecation: please select your device using the extension name!, i.e. extensions=['cl_khr_fp64']") + extensions.append('cl_khr_fp64') + + if (platformid is not None) and (deviceid is not None): + platformid = int(platformid) + deviceid = int(deviceid) + elif "PYOPENCL_CTX" in os.environ: + pyopencl_ctx = [int(i) if i.isdigit() else 0 for i in os.environ["PYOPENCL_CTX"].split(":")] + pyopencl_ctx += [0] * (2 - len(pyopencl_ctx)) # pad with 0 + platformid, deviceid = pyopencl_ctx + else: + ids = ocl.select_device(type=devicetype, extensions=extensions) + if ids: + platformid, deviceid = ids + ctx = None + if (platformid is not None) and (deviceid is not None): + if (platformid, deviceid) in self.context_cache: + ctx = self.context_cache[(platformid, deviceid)] + else: + try: + ctx = pyopencl.Context(devices=[pyopencl.get_platforms()[platformid].get_devices()[deviceid]]) + except pyopencl._cl.LogicError as error: + self.platforms[platformid].devices[deviceid].set_unavailable() + logger.warning("Unable to create context on %s/%s: %s", platformid, deviceid, error) + ctx = None + else: + if cached: + self.context_cache[(platformid, deviceid)] = ctx + if ctx is None: + logger.warning("Last chance to get an OpenCL device ... 
probably not the one requested") + ctx = pyopencl.create_some_context(interactive=False) + return ctx + + def device_from_context(self, context): + """ + Retrieves the Device from the context + + :param context: OpenCL context + :return: instance of Device + """ + odevice = context.devices[0] + oplat = odevice.platform + device_id = oplat.get_devices().index(odevice) + platform_id = pyopencl.get_platforms().index(oplat) + return self.platforms[platform_id].devices[device_id] + + +if pyopencl: + ocl = OpenCL() + if ocl.nb_devices == 0: + ocl = None +else: + ocl = None + + +def release_cl_buffers(cl_buffers): + """ + :param cl_buffers: the buffer you want to release + :type cl_buffers: dict(str, pyopencl.Buffer) + + This method release the memory of the buffers store in the dict + """ + for key, buffer_ in cl_buffers.items(): + if buffer_ is not None: + if isinstance(buffer_, pyopencl.array.Array): + try: + buffer_.data.release() + except pyopencl.LogicError: + logger.error("Error while freeing buffer %s", key) + else: + try: + buffer_.release() + except pyopencl.LogicError: + logger.error("Error while freeing buffer %s", key) + cl_buffers[key] = None + return cl_buffers + + +def allocate_cl_buffers(buffers, device=None, context=None): + """ + :param buffers: the buffers info use to create the pyopencl.Buffer + :type buffers: list(std, flag, numpy.dtype, int) + :param device: one of the context device + :param context: opencl contextdevice + :return: a dict containing the instanciated pyopencl.Buffer + :rtype: dict(str, pyopencl.Buffer) + + This method instanciate the pyopencl.Buffer from the buffers + description. 
+ """ + mem = {} + if device is None: + device = ocl.device_from_context(context) + + # check if enough memory is available on the device + ualloc = 0 + for _, _, dtype, size in buffers: + ualloc += numpy.dtype(dtype).itemsize * size + memory = device.memory + logger.info("%.3fMB are needed on device which has %.3fMB", + ualloc / 1.0e6, memory / 1.0e6) + if ualloc >= memory: + memError = "Fatal error in allocate_buffers." + memError += "Not enough device memory for buffers" + memError += "(%lu requested, %lu available)" % (ualloc, memory) + raise MemoryError(memError) # noqa + + # do the allocation + try: + for name, flag, dtype, size in buffers: + mem[name] = pyopencl.Buffer(context, flag, + numpy.dtype(dtype).itemsize * size) + except pyopencl.MemoryError as error: + release_cl_buffers(mem) + raise MemoryError(error) + + return mem + + +def allocate_texture(ctx, shape, hostbuf=None, support_1D=False): + """ + Allocate an OpenCL image ("texture"). + + :param ctx: OpenCL context + :param shape: Shape of the image. Note that pyopencl and OpenCL < 1.2 + do not support 1D images, so 1D images are handled as 2D with one row + :param support_1D: force the image to be 1D if the shape has only one dim + """ + if len(shape) == 1 and not(support_1D): + shape = (1,) + shape + return pyopencl.Image( + ctx, + pyopencl.mem_flags.READ_ONLY | pyopencl.mem_flags.USE_HOST_PTR, + pyopencl.ImageFormat( + pyopencl.channel_order.INTENSITY, + pyopencl.channel_type.FLOAT + ), + hostbuf=numpy.zeros(shape[::-1], dtype=numpy.float32) + ) + + +def check_textures_availability(ctx): + """ + Check whether textures are supported on the current OpenCL context. 
+ + :param ctx: OpenCL context + """ + try: + dummy_texture = allocate_texture(ctx, (16, 16)) + # Need to further access some attributes (pocl) + dummy_height = dummy_texture.height + textures_available = True + del dummy_texture, dummy_height + except (pyopencl.RuntimeError, pyopencl.LogicError): + textures_available = False + # Nvidia Fermi GPUs (compute capability 2.X) do not support opencl read_imagef + # There is no way to detect this until a kernel is compiled + try: + cc = ctx.devices[0].compute_capability_major_nv + textures_available &= (cc >= 3) + except (pyopencl.LogicError, AttributeError): # probably not a Nvidia GPU + pass + # + return textures_available + + +def measure_workgroup_size(device): + """Measure the actual size of the workgroup + + :param device: device or context or 2-tuple with indexes + :return: the actual measured workgroup size + + if device is "all", returns a dict with all devices with their ids as keys. + """ + if (ocl is None) or (device is None): + return None + + if isinstance(device, tuple) and (len(device) == 2): + # this is probably a tuple (platformid, deviceid) + device = ocl.create_context(platformid=device[0], deviceid=device[1]) + + if device == "all": + res = {} + for pid, platform in enumerate(ocl.platforms): + for did, _devices in enumerate(platform.devices): + tup = (pid, did) + res[tup] = measure_workgroup_size(tup) + else: + res = _measure_workgroup_size(device) + return res + + +def query_kernel_info(program, kernel, what="WORK_GROUP_SIZE"): + """Extract the compile time information from a kernel + + :param program: OpenCL program + :param kernel: kernel or name of the kernel + :param what: what is the query about ? + :return: int or 3-int for the workgroup size. 
+ + Possible information available are: + * 'COMPILE_WORK_GROUP_SIZE': Returns the work-group size specified inside the kernel (__attribute__((reqd_work_gr oup_size(X, Y, Z)))) + * 'GLOBAL_WORK_SIZE': maximum global size that can be used to execute a kernel #OCL2.1! + * 'LOCAL_MEM_SIZE': amount of local memory in bytes being used by the kernel + * 'PREFERRED_WORK_GROUP_SIZE_MULTIPLE': preferred multiple of workgroup size for launch. This is a performance hint. + * 'PRIVATE_MEM_SIZE' Returns the minimum amount of private memory, in bytes, used by each workitem in the kernel + * 'WORK_GROUP_SIZE': maximum work-group size that can be used to execute a kernel on a specific device given by device + + Further information on: + https://www.khronos.org/registry/OpenCL/sdk/1.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html + + """ + assert isinstance(program, pyopencl.Program) + if not isinstance(kernel, pyopencl.Kernel): + kernel_name = kernel + assert kernel in (k.function_name for k in program.all_kernels()), "the kernel exists" + kernel = program.__getattr__(kernel_name) + + device = program.devices[0] + query_wg = getattr(pyopencl.kernel_work_group_info, what) + return kernel.get_work_group_info(query_wg, device) + + +def kernel_workgroup_size(program, kernel): + """Extract the compile time maximum workgroup size + + :param program: OpenCL program + :param kernel: kernel or name of the kernel + :return: the maximum acceptable workgroup size for the given kernel + """ + return query_kernel_info(program, kernel, what="WORK_GROUP_SIZE") diff --git a/src/silx/opencl/conftest.py b/src/silx/opencl/conftest.py new file mode 100644 index 0000000..1fdc516 --- /dev/null +++ b/src/silx/opencl/conftest.py @@ -0,0 +1,5 @@ +import pytest + +@pytest.mark.usefixtures("use_opencl") +def setup_module(module): + pass diff --git a/src/silx/opencl/convolution.py b/src/silx/opencl/convolution.py new file mode 100644 index 0000000..15ef931 --- /dev/null +++ b/src/silx/opencl/convolution.py @@ 
-0,0 +1,442 @@ +#!/usr/bin/env python +# coding: utf-8 +# /*########################################################################## +# +# Copyright (c) 2019 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# ###########################################################################*/ +"""Module for convolution on CPU/GPU.""" + +from __future__ import absolute_import, print_function, with_statement, division + +__authors__ = ["P. Paleo"] +__license__ = "MIT" +__date__ = "01/08/2019" + +import numpy as np +from copy import copy # python2 +from .common import pyopencl as cl +import pyopencl.array as parray +from .processing import OpenclProcessing, EventDescription +from .utils import ConvolutionInfos + +class Convolution(OpenclProcessing): + """ + A class for performing convolution on CPU/GPU with OpenCL. 
+ """ + + def __init__(self, shape, kernel, axes=None, mode=None, ctx=None, + devicetype="all", platformid=None, deviceid=None, + profile=False, extra_options=None): + """Constructor of OpenCL Convolution. + + :param shape: shape of the array. + :param kernel: convolution kernel (1D, 2D or 3D). + :param axes: axes along which the convolution is performed, + for batched convolutions. + :param mode: Boundary handling mode. Available modes are: + "reflect": cba|abcd|dcb + "nearest": aaa|abcd|ddd + "wrap": bcd|abcd|abc + "constant": 000|abcd|000 + Default is "reflect". + :param ctx: actual working context, left to None for automatic + initialization from device type or platformid/deviceid + :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL" + :param platformid: integer with the platform_identifier, as given by + clinfo + :param deviceid: Integer with the device identifier, as given by clinfo + :param profile: switch on profiling to be able to profile at the kernel + level, store profiling elements (makes code slightly + slower) + :param extra_options: Advanced options (dict). 
Current options are: + "allocate_input_array": True, + "allocate_output_array": True, + "allocate_tmp_array": True, + "dont_use_textures": False, + """ + OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype, + platformid=platformid, deviceid=deviceid, + profile=profile) + + self._configure_extra_options(extra_options) + self._determine_use_case(shape, kernel, axes) + self._allocate_memory(mode) + self._init_kernels() + + def _configure_extra_options(self, extra_options): + self.extra_options = { + "allocate_input_array": True, + "allocate_output_array": True, + "allocate_tmp_array": True, + "dont_use_textures": False, + } + extra_opts = extra_options or {} + self.extra_options.update(extra_opts) + self.use_textures = not(self.extra_options["dont_use_textures"]) + self.use_textures &= self.check_textures_availability() + + def _get_dimensions(self, shape, kernel): + self.shape = shape + self.data_ndim = self._check_dimensions(shape=shape, name="Data") + self.kernel_ndim = self._check_dimensions(arr=kernel, name="Kernel") + Nx = shape[-1] + if self.data_ndim >= 2: + Ny = shape[-2] + else: + Ny = 1 + if self.data_ndim >= 3: + Nz = shape[-3] + else: + Nz = 1 + self.Nx = np.int32(Nx) + self.Ny = np.int32(Ny) + self.Nz = np.int32(Nz) + + def _determine_use_case(self, shape, kernel, axes): + """ + Determine the convolution use case from the input/kernel shape, and axes. 
+ """ + self._get_dimensions(shape, kernel) + if self.kernel_ndim > self.data_ndim: + raise ValueError("Kernel dimensions cannot exceed data dimensions") + data_ndim = self.data_ndim + kernel_ndim = self.kernel_ndim + self.kernel = kernel.astype("f") + + convol_infos = ConvolutionInfos() + k = (data_ndim, kernel_ndim) + if k not in convol_infos.use_cases: + raise ValueError( + "Cannot find a use case for data ndim = %d and kernel ndim = %d" + % (data_ndim, kernel_ndim) + ) + possible_use_cases = convol_infos.use_cases[k] + + self.use_case_name = None + for uc_name, uc_params in possible_use_cases.items(): + if axes in convol_infos.allowed_axes[uc_name]: + self.use_case_name = uc_name + self.use_case_desc = uc_params["name"] + #~ self.use_case_kernels = uc_params["kernels"].copy() + self.use_case_kernels = copy(uc_params["kernels"]) # TODO use the above line once we get rid of python2 + if self.use_case_name is None: + raise ValueError( + "Cannot find a use case for data ndim = %d, kernel ndim = %d and axes=%s" + % (data_ndim, kernel_ndim, str(axes)) + ) + # TODO implement this use case + if self.use_case_name == "batched_separable_2D_1D_3D": + raise NotImplementedError( + "The use case %s is not implemented" + % self.use_case_name + ) + # + self.axes = axes + # Replace "axes=None" with an actual value (except for ND-ND) + allowed_axes = convol_infos.allowed_axes[self.use_case_name] + if len(allowed_axes) > 1: + # The default choice might impact perfs + self.axes = allowed_axes[0] or allowed_axes[1] + self.separable = self.use_case_name.startswith("separable") + self.batched = self.use_case_name.startswith("batched") + # Update kernel names when using textures + if self.use_textures: + for i, kern_name in enumerate(self.use_case_kernels): + self.use_case_kernels[i] = kern_name + "_tex" + + def _allocate_memory(self, mode): + self.mode = mode or "reflect" + option_array_names = { + "allocate_input_array": "data_in", + "allocate_output_array": "data_out", + 
"allocate_tmp_array": "data_tmp", + } + # Nonseparable transforms do not need tmp array + if not(self.separable): + self.extra_options["allocate_tmp_array"] = False + # Allocate arrays + for option_name, array_name in option_array_names.items(): + if self.extra_options[option_name]: + value = parray.empty(self.queue, self.shape, np.float32) + value.fill(np.float32(0.0)) + else: + value = None + setattr(self, array_name, value) + + if isinstance(self.kernel, np.ndarray): + self.d_kernel = parray.to_device(self.queue, self.kernel) + else: + if not(isinstance(self.kernel, parray.Array)): + raise ValueError("kernel must be either numpy array or pyopencl array") + self.d_kernel = self.kernel + self._old_input_ref = None + self._old_output_ref = None + if self.use_textures: + self._allocate_textures() + self._c_modes_mapping = { + "periodic": 2, + "wrap": 2, + "nearest": 1, + "replicate": 1, + "reflect": 0, + "constant": 3, + } + mp = self._c_modes_mapping + if self.mode.lower() not in mp: + raise ValueError( + """ + Mode %s is not available for textures. 
Available modes are: + %s + """ + % (self.mode, str(mp.keys())) + ) + # TODO + if not(self.use_textures) and self.mode.lower() == "constant": + raise NotImplementedError( + "mode='constant' is not implemented without textures yet" + ) + # + self._c_conv_mode = mp[self.mode] + + def _allocate_textures(self): + self.data_in_tex = self.allocate_texture(self.shape) + self.d_kernel_tex = self.allocate_texture(self.kernel.shape) + self.transfer_to_texture(self.d_kernel, self.d_kernel_tex) + + def _init_kernels(self): + if self.kernel_ndim > 1: + if np.abs(np.diff(self.kernel.shape)).max() > 0: + raise NotImplementedError( + "Non-separable convolution with non-square kernels is not implemented yet" + ) + compile_options = [str("-DUSED_CONV_MODE=%d" % self._c_conv_mode)] + if self.use_textures: + kernel_files = ["convolution_textures.cl"] + compile_options.extend([ + str("-DIMAGE_DIMS=%d" % self.data_ndim), + str("-DFILTER_DIMS=%d" % self.kernel_ndim), + ]) + d_kernel_ref = self.d_kernel_tex + else: + kernel_files = ["convolution.cl"] + d_kernel_ref = self.d_kernel.data + self.compile_kernels( + kernel_files=kernel_files, + compile_options=compile_options + ) + self.ndrange = self.shape[::-1] + self.wg = None + kernel_args = [ + self.queue, + self.ndrange, self.wg, + None, + None, + d_kernel_ref, + np.int32(self.kernel.shape[0]), + self.Nx, self.Ny, self.Nz + ] + if self.kernel_ndim == 2: + kernel_args.insert(6, np.int32(self.kernel.shape[1])) + if self.kernel_ndim == 3: + kernel_args.insert(6, np.int32(self.kernel.shape[2])) + kernel_args.insert(7, np.int32(self.kernel.shape[1])) + self.kernel_args = tuple(kernel_args) + # If self.data_tmp is allocated, separable transforms can be performed + # by a series of batched transforms, without any copy, by swapping refs. 
+ self.swap_pattern = None + if self.separable: + if self.data_tmp is not None: + self.swap_pattern = { + 2: [ + ("data_in", "data_tmp"), + ("data_tmp", "data_out") + ], + 3: [ + ("data_in", "data_out"), + ("data_out", "data_tmp"), + ("data_tmp", "data_out"), + ], + } + else: + # TODO + raise NotImplementedError("For now, data_tmp has to be allocated") + + def _get_swapped_arrays(self, i): + """ + Get the input and output arrays to use when using a "swap pattern". + Swapping refs enables to avoid copies between temp. array and output. + For example, a separable 2D->1D convolution on 2D data reads: + data_tmp = convol(data_input, kernel, axis=1) # step i=0 + data_out = convol(data_tmp, kernel, axis=0) # step i=1 + + :param i: current step number of the separable convolution + """ + if self.use_textures: + # copy is needed when using texture, as data_out is a Buffer + if i > 0: + self.transfer_to_texture(self.data_out, self.data_in_tex) + return self.data_in_tex, self.data_out + n_batchs = len(self.axes) + in_ref, out_ref = self.swap_pattern[n_batchs][i] + d_in = getattr(self, in_ref) + d_out = getattr(self, out_ref) + return d_in, d_out + + def _configure_kernel_args(self, opencl_kernel_args, input_ref, output_ref): + # TODO more elegant + if isinstance(input_ref, parray.Array): + input_ref = input_ref.data + if isinstance(output_ref, parray.Array): + output_ref = output_ref.data + if input_ref is not None or output_ref is not None: + opencl_kernel_args = list(opencl_kernel_args) + if input_ref is not None: + opencl_kernel_args[3] = input_ref + if output_ref is not None: + opencl_kernel_args[4] = output_ref + opencl_kernel_args = tuple(opencl_kernel_args) + return opencl_kernel_args + + @staticmethod + def _check_dimensions(arr=None, shape=None, name="", dim_min=1, dim_max=3): + if shape is not None: + ndim = len(shape) + elif arr is not None: + ndim = arr.ndim + else: + raise ValueError("Please provide either arr= or shape=") + if ndim < dim_min or ndim > dim_max: 
+ raise ValueError("%s dimensions should be between %d and %d" + % (name, dim_min, dim_max) + ) + return ndim + + def _check_array(self, arr): + # TODO allow cl.Buffer + if not(isinstance(arr, parray.Array) or isinstance(arr, np.ndarray)): + raise TypeError("Expected either pyopencl.array.Array or numpy.ndarray") + # TODO composition with ImageProcessing/cast + if arr.dtype != np.float32: + raise TypeError("Data must be float32") + if arr.shape != self.shape: + raise ValueError("Expected data shape = %s" % str(self.shape)) + + def _set_arrays(self, array, output=None): + # When using textures: copy + if self.use_textures: + self.transfer_to_texture(array, self.data_in_tex) + data_in_ref = self.data_in_tex + else: + # Otherwise: copy H->D or update references. + if isinstance(array, np.ndarray): + self.data_in[:] = array[:] + else: + self._old_input_ref = self.data_in + self.data_in = array + data_in_ref = self.data_in + if output is not None: + if not(isinstance(output, np.ndarray)): + self._old_output_ref = self.data_out + self.data_out = output + # Update OpenCL kernel arguments with new array references + self.kernel_args = self._configure_kernel_args( + self.kernel_args, + data_in_ref, + self.data_out + ) + + def _separable_convolution(self): + assert len(self.axes) == len(self.use_case_kernels) + # Separable: one kernel call per data dimension + for i, axis in enumerate(self.axes): + in_ref, out_ref = self._get_swapped_arrays(i) + self._batched_convolution(axis, input_ref=in_ref, output_ref=out_ref) + + def _batched_convolution(self, axis, input_ref=None, output_ref=None): + # Batched: one kernel call in total + opencl_kernel = self.kernels.get_kernel(self.use_case_kernels[axis]) + opencl_kernel_args = self._configure_kernel_args( + self.kernel_args, + input_ref, + output_ref + ) + ev = opencl_kernel(*opencl_kernel_args) + if self.profile: + self.events.append(EventDescription("batched convolution", ev)) + + def _nd_convolution(self): + assert 
len(self.use_case_kernels) == 1 + opencl_kernel = self.kernels.get_kernel(self.use_case_kernels[0]) + ev = opencl_kernel(*self.kernel_args) + if self.profile: + self.events.append(EventDescription("ND convolution", ev)) + + def _recover_arrays_references(self): + if self._old_input_ref is not None: + self.data_in = self._old_input_ref + self._old_input_ref = None + if self._old_output_ref is not None: + self.data_out = self._old_output_ref + self._old_output_ref = None + self.kernel_args = self._configure_kernel_args( + self.kernel_args, + self.data_in, + self.data_out + ) + + def _get_output(self, output): + if output is None: + res = self.data_out.get() + else: + res = output + if isinstance(output, np.ndarray): + output[:] = self.data_out[:] + self._recover_arrays_references() + return res + + def convolve(self, array, output=None): + """ + Convolve an array with the class kernel. + + :param array: Input array. Can be numpy.ndarray or pyopencl.array.Array. + :param output: Output array. Can be numpy.ndarray or pyopencl.array.Array. 
+ """ + self._check_array(array) + self._set_arrays(array, output=output) + if self.axes is not None: + if self.separable: + self._separable_convolution() + elif self.batched: + assert len(self.axes) == 1 + self._batched_convolution(self.axes[0]) + # else: ND-ND convol + else: + # ND-ND convol + self._nd_convolution() + + res = self._get_output(output) + return res + + + __call__ = convolve + + diff --git a/src/silx/opencl/image.py b/src/silx/opencl/image.py new file mode 100644 index 0000000..65e2d5e --- /dev/null +++ b/src/silx/opencl/image.py @@ -0,0 +1,387 @@ +# -*- coding: utf-8 -*- +# +# Project: silx +# https://github.com/silx-kit/silx +# +# Copyright (C) 2012-2017 European Synchrotron Radiation Facility, Grenoble, France +# +# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# . +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# . +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+ +"""A general purpose library for manipulating 2D images in 1 or 3 colors + +""" +from __future__ import absolute_import, print_function, with_statement, division + + +__author__ = "Jerome Kieffer" +__license__ = "MIT" +__date__ = "12/02/2018" +__copyright__ = "2012-2017, ESRF, Grenoble" +__contact__ = "jerome.kieffer@esrf.fr" + +import os +import logging +import numpy +from collections import OrderedDict +from math import floor, ceil, sqrt, log + +from .common import pyopencl, kernel_workgroup_size +from .processing import EventDescription, OpenclProcessing, BufferDescription + +if pyopencl: + mf = pyopencl.mem_flags +logger = logging.getLogger(__name__) + + +class ImageProcessing(OpenclProcessing): + + kernel_files = ["cast", "map", "max_min", "histogram"] + + converter = {numpy.dtype(numpy.uint8): "u8_to_float", + numpy.dtype(numpy.int8): "s8_to_float", + numpy.dtype(numpy.uint16): "u16_to_float", + numpy.dtype(numpy.int16): "s16_to_float", + numpy.dtype(numpy.uint32): "u32_to_float", + numpy.dtype(numpy.int32): "s32_to_float", + } + + def __init__(self, shape=None, ncolors=1, template=None, + ctx=None, devicetype="all", platformid=None, deviceid=None, + block_size=None, memory=None, profile=False): + """Constructor of the ImageProcessing class + + :param ctx: actual working context, left to None for automatic + initialization from device type or platformid/deviceid + :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL" + :param platformid: integer with the platform_identifier, as given by clinfo + :param deviceid: Integer with the device identifier, as given by clinfo + :param block_size: preferred workgroup size, may vary depending on the + out come of the compilation + :param memory: minimum memory available on device + :param profile: switch on profiling to be able to profile at the kernel + level, store profiling elements (makes code slightly slower) + """ + OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype, + 
platformid=platformid, deviceid=deviceid, + block_size=block_size, memory=memory, profile=profile) + if template is not None: + shape = template.shape + if len(shape) > 2: + self.ncolors = shape[2] + self.shape = shape[:2] + else: + self.ncolors = 1 + self.shape = shape + else: + self.ncolors = ncolors + self.shape = shape + assert shape is not None + self.buffer_shape = self.shape if self.ncolors == 1 else self.shape + (self.ncolors,) + kernel_files = [os.path.join("image", i) for i in self.kernel_files] + self.compile_kernels(kernel_files, + compile_options="-DNB_COLOR=%i" % self.ncolors) + if self.ncolors == 1: + img_shape = self.shape + else: + img_shape = self.shape + (self.ncolors,) + + buffers = [BufferDescription("image0_d", img_shape, numpy.float32, None), + BufferDescription("image1_d", img_shape, numpy.float32, None), + BufferDescription("image2_d", img_shape, numpy.float32, None), + BufferDescription("max_min_d", 2, numpy.float32, None), + BufferDescription("cnt_d", 1, numpy.int32, None), ] + # Temporary buffer for max-min reduction + self.wg_red = kernel_workgroup_size(self.program, self.kernels.max_min_reduction_stage1) + if self.wg_red > 1: + self.wg_red = min(self.wg_red, + numpy.int32(1 << int(floor(log(sqrt(numpy.prod(self.shape)), 2))))) + tmp = BufferDescription("tmp_max_min_d", 2 * self.wg_red, numpy.float32, None) + buffers.append(tmp) + self.allocate_buffers(buffers, use_array=True) + self.cl_mem["cnt_d"].fill(0) + + def __repr__(self): + return "ImageProcessing for shape=%s, %i colors initalized on %s" % \ + (self.shape, self.ncolors, self.ctx.devices[0].name) + + def _get_in_out_buffers(self, img=None, copy=True, out=None, + out_dtype=None, out_size=None): + """Internal method used to select the proper buffers before processing. 
+ + :param img: expects a numpy array or a pyopencl.array of dim 2 or 3 + :param copy: set to False to directly re-use a pyopencl array + :param out: provide an output buffer to store the result + :param out_dtype: enforce the type of the output buffer (optional) + :param out_size: enforce the size of the output buffer (optional) + :return: input_buffer, output_buffer + + Nota: this is not locked. + """ + events = [] + if out is not None and isinstance(out, pyopencl.array.Array): + if (out_size or out_dtype) is not None: + if out_size is not None: + assert out.size > out_size + if out_dtype is not None: + assert out_dtype == out.dtype + else: # assume it is same size and type as weoking buffer + assert out.shape == self.buffer_shape + assert out.dtype == numpy.float32 + out.finish() + output_array = out + else: + if out_dtype != numpy.float32 and out_size: + name = "%s_%s_d" % (numpy.dtype(out_dtype), out_size) + if name not in self.cl_mem: + output_array = self.cl_mem[name] = pyopencl.array.empty(self.queue, (out_size,), out_dtype) + else: + output_array = self.cl_mem[name] + else: + output_array = self.cl_mem["image2_d"] + + if img is None: + input_array = self.cl_mem["image1_d"] + if isinstance(img, pyopencl.array.Array): + if copy: + evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["image1_d"].data, img.data) + input_array = self.cl_mem["image1_d"] + events.append(EventDescription("copy D->D", evt)) + else: + img.finish() + input_array = img + evt = None + else: + # assume this is numpy + if img.dtype.itemsize > 4: + logger.warning("Casting to float32 on CPU") + evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["image1_d"].data, numpy.ascontiguousarray(img, numpy.float32)) + input_array = self.cl_mem["image1_d"] + events.append(EventDescription("cast+copy H->D", evt)) + else: + evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["image1_d"].data, numpy.ascontiguousarray(img)) + input_array = self.cl_mem["image1_d"] + events.append(EventDescription("copy 
H->D", evt)) + if self.profile: + self.events += events + return input_array, output_array + + def to_float(self, img, copy=True, out=None): + """ Takes any array and convert it to a float array for ease of processing. + + :param img: expects a numpy array or a pyopencl.array of dim 2 or 3 + :param copy: set to False to directly re-use a pyopencl array + :param out: provide an output buffer to store the result + """ + assert img.shape == self.buffer_shape + + events = [] + with self.sem: + input_array, output_array = self._get_in_out_buffers(img, copy, out) + if (img.dtype.itemsize > 4) or (img.dtype == numpy.float32): + # copy device -> device, already there as float32 + ev = pyopencl.enqueue_copy(self.queue, output_array.data, input_array.data) + events.append(EventDescription("copy D->D", ev)) + else: + # Cast to float: + name = self.converter[img.dtype] + kernel = self.kernels.get_kernel(name) + ev = kernel(self.queue, (self.shape[1], self.shape[0]), None, + input_array.data, output_array.data, + numpy.int32(self.shape[1]), numpy.int32(self.shape[0]) + ) + events.append(EventDescription("cast %s" % name, ev)) + + if self.profile: + self.events += events + if out is None: + res = output_array.get() + return res + else: + output_array.finish() + return output_array + + def normalize(self, img, mini=0.0, maxi=1.0, copy=True, out=None): + """Scale the intensity of the image so that the minimum is 0 and the + maximum is 1.0 (or any value suggested). + + :param img: numpy array or pyopencl array of dim 2 or 3 and of type float + :param mini: Expected minimum value + :param maxi: expected maxiumum value + :param copy: set to False to use directly the input buffer + :param out: provides an output buffer. 
prevents a copy D->H + + This uses a min/max reduction in two stages plus a map operation + """ + assert img.shape == self.buffer_shape + events = [] + with self.sem: + input_array, output_array = self._get_in_out_buffers(img, copy, out) + size = numpy.int32(numpy.prod(self.shape)) + if self.wg_red == 1: + # Probably on MacOS CPU WG==1 --> serial code. + kernel = self.kernels.get_kernel("max_min_serial") + evt = kernel(self.queue, (1,), (1,), + input_array.data, + size, + self.cl_mem["max_min_d"].data) + ed = EventDescription("max_min_serial", evt) + events.append(ed) + else: + stage1 = self.kernels.max_min_reduction_stage1 + stage2 = self.kernels.max_min_reduction_stage2 + local_mem = pyopencl.LocalMemory(int(self.wg_red * 8)) + k1 = stage1(self.queue, (int(self.wg_red ** 2),), (int(self.wg_red),), + input_array.data, + self.cl_mem["tmp_max_min_d"].data, + size, + local_mem) + k2 = stage2(self.queue, (int(self.wg_red),), (int(self.wg_red),), + self.cl_mem["tmp_max_min_d"].data, + self.cl_mem["max_min_d"].data, + local_mem) + + events += [EventDescription("max_min_stage1", k1), + EventDescription("max_min_stage2", k2)] + + evt = self.kernels.normalize_image(self.queue, (self.shape[1], self.shape[0]), None, + input_array.data, output_array.data, + numpy.int32(self.shape[1]), numpy.int32(self.shape[0]), + self.cl_mem["max_min_d"].data, + numpy.float32(mini), numpy.float32(maxi)) + events.append(EventDescription("normalize", evt)) + if self.profile: + self.events += events + + if out is None: + res = output_array.get() + return res + else: + output_array.finish() + return output_array + + def histogram(self, img=None, nbins=255, range=None, + log_scale=False, copy=True, out=None): + """Compute the histogram of a set of data. + + :param img: input image. If None, use the one already on the device + :param nbins: number of bins + :param range: the lower and upper range of the bins. If not provided, + range is simply ``(a.min(), a.max())``. 
Values outside the + range are ignored. The first element of the range must be + less than or equal to the second. + :param log_scale: perform the binning in lograrithmic scale. + Open to extension + :param copy: unset to directly use the input buffer without copy + :param out: use a provided array for offering the result + :return: histogram (size=nbins), edges (size=nbins+1) + API similar to numpy + """ + assert img.shape == self.buffer_shape + + input_array = self.to_float(img, copy=copy, out=self.cl_mem["image0_d"]) + events = [] + with self.sem: + input_array, output_array = self._get_in_out_buffers(input_array, copy=False, + out=out, + out_dtype=numpy.int32, + out_size=nbins) + + if range is None: + # measure actually the bounds + size = numpy.int32(numpy.prod(self.shape)) + if self.wg_red == 1: + # Probably on MacOS CPU WG==1 --> serial code. + kernel = self.kernels.get_kernel("max_min_serial") + + evt = kernel(self.queue, (1,), (1,), + input_array.data, + size, + self.cl_mem["max_min_d"].data) + events.append(EventDescription("max_min_serial", evt)) + else: + stage1 = self.kernels.max_min_reduction_stage1 + stage2 = self.kernels.max_min_reduction_stage2 + local_mem = pyopencl.LocalMemory(int(self.wg_red * 2 * numpy.dtype("float32").itemsize)) + k1 = stage1(self.queue, (int(self.wg_red ** 2),), (int(self.wg_red),), + input_array.data, + self.cl_mem["tmp_max_min_d"].data, + size, + local_mem) + k2 = stage2(self.queue, (int(self.wg_red),), (int(self.wg_red),), + self.cl_mem["tmp_max_min_d"].data, + self.cl_mem["max_min_d"].data, + local_mem) + + events += [EventDescription("max_min_stage1", k1), + EventDescription("max_min_stage2", k2)] + maxi, mini = self.cl_mem["max_min_d"].get() + else: + mini = numpy.float32(min(range)) + maxi = numpy.float32(max(range)) + device = self.ctx.devices[0] + nb_engines = device.max_compute_units + tmp_size = nb_engines * nbins + name = "tmp_int32_%s_d" % (tmp_size) + if name not in self.cl_mem: + tmp_array = self.cl_mem[name] = 
pyopencl.array.empty(self.queue, (tmp_size,), numpy.int32) + else: + tmp_array = self.cl_mem[name] + + edge_name = "tmp_float32_%s_d" % (nbins + 1) + if edge_name not in self.cl_mem: + edges_array = self.cl_mem[edge_name] = pyopencl.array.empty(self.queue, (nbins + 1,), numpy.float32) + else: + edges_array = self.cl_mem[edge_name] + + shared = pyopencl.LocalMemory(numpy.dtype(numpy.int32).itemsize * nbins) + + # Handle log-scale + if log_scale: + map_operation = numpy.int32(1) + else: + map_operation = numpy.int32(0) + kernel = self.kernels.get_kernel("histogram") + wg = min(device.max_work_group_size, + 1 << (int(ceil(log(nbins, 2)))), + self.kernels.max_workgroup_size(kernel)) + evt = kernel(self.queue, (wg * nb_engines,), (wg,), + input_array.data, + numpy.int32(input_array.size), + mini, + maxi, + map_operation, + output_array.data, + edges_array.data, + numpy.int32(nbins), + tmp_array.data, + self.cl_mem["cnt_d"].data, + shared) + events.append(EventDescription("histogram", evt)) + + if self.profile: + self.events += events + + if out is None: + res = output_array.get() + return res, edges_array.get() + else: + output_array.finish() + return output_array, edges_array diff --git a/src/silx/opencl/linalg.py b/src/silx/opencl/linalg.py new file mode 100644 index 0000000..a64122a --- /dev/null +++ b/src/silx/opencl/linalg.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python +# coding: utf-8 +# /*########################################################################## +# +# Copyright (c) 2016 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following 
conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# ###########################################################################*/ +"""Module for basic linear algebra in OpenCL""" + +from __future__ import absolute_import, print_function, with_statement, division + +__authors__ = ["P. Paleo"] +__license__ = "MIT" +__date__ = "01/08/2019" + +import numpy as np + +from .common import pyopencl +from .processing import EventDescription, OpenclProcessing + +import pyopencl.array as parray +cl = pyopencl + + +class LinAlg(OpenclProcessing): + + kernel_files = ["linalg.cl"] + + def __init__(self, shape, do_checks=False, ctx=None, devicetype="all", platformid=None, deviceid=None, profile=False): + """ + Create a "Linear Algebra" plan for a given image shape. + + :param shape: shape of the image (num_rows, num_columns) + :param do_checks (optional): if True, memory and data type checks are performed when possible. 
+ :param ctx: actual working context, left to None for automatic + initialization from device type or platformid/deviceid + :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL" + :param platformid: integer with the platform_identifier, as given by clinfo + :param deviceid: Integer with the device identifier, as given by clinfo + :param profile: switch on profiling to be able to profile at the kernel level, + store profiling elements (makes code slightly slower) + + """ + OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype, + platformid=platformid, deviceid=deviceid, + profile=profile) + + self.d_gradient = parray.empty(self.queue, shape, np.complex64) + self.d_gradient.fill(np.complex64(0.0)) + self.d_image = parray.empty(self.queue, shape, np.float32) + self.d_image.fill(np.float32(0.0)) + self.add_to_cl_mem({ + "d_gradient": self.d_gradient, + "d_image": self.d_image + }) + + self.wg2D = None + self.shape = shape + self.ndrange2D = ( + int(self.shape[1]), + int(self.shape[0]) + ) + self.do_checks = bool(do_checks) + OpenclProcessing.compile_kernels(self, self.kernel_files) + + @staticmethod + def check_array(array, dtype, shape, arg_name): + if array.shape != shape or array.dtype != dtype: + raise ValueError("%s should be a %s array of type %s" %(arg_name, str(shape), str(dtype))) + + def get_data_references(self, src, dst, default_src_ref, default_dst_ref): + """ + From various types of src and dst arrays, + returns the references to the underlying data (Buffer) that will be used by the OpenCL kernels. + # TODO documentation + + This function will make a copy host->device if the input is on host (eg. 
numpy array) + """ + if dst is not None: + if isinstance(dst, cl.array.Array): + dst_ref = dst.data + elif isinstance(dst, cl.Buffer): + dst_ref = dst + else: + raise ValueError("dst should be either pyopencl.array.Array or pyopencl.Buffer") + else: + dst_ref = default_dst_ref + + if isinstance(src, cl.array.Array): + src_ref = src.data + elif isinstance(src, cl.Buffer): + src_ref = src + else: # assuming numpy.ndarray + evt = cl.enqueue_copy(self.queue, default_src_ref, src) + self.events.append(EventDescription("copy H->D", evt)) + src_ref = default_src_ref + return src_ref, dst_ref + + def gradient(self, image, dst=None, return_to_host=False): + """ + Compute the spatial gradient of an image. + The gradient is computed with first-order difference (not central difference). + + :param image: image to compute the gradient from. It can be either a numpy.ndarray, a pyopencl Array or Buffer. + :param dst: optional, reference to a destination pyopencl Array or Buffer. It must be of complex64 data type. + :param return_to_host: optional, set to True if you want the result to be transferred back to host. + + if dst is provided, it should be of type numpy.complex64 ! + """ + n_y, n_x = np.int32(self.shape) + if self.do_checks: + self.check_array(image, np.float32, self.shape, "image") + if dst is not None: + self.check_array(dst, np.complex64, self.shape, "dst") + img_ref, grad_ref = self.get_data_references(image, dst, self.d_image.data, self.d_gradient.data) + + # Prepare the kernel call + kernel_args = [ + img_ref, + grad_ref, + n_x, + n_y + ] + # Call the gradient kernel + evt = self.kernels.kern_gradient2D( + self.queue, + self.ndrange2D, + self.wg2D, + *kernel_args + ) + self.events.append(EventDescription("gradient2D", evt)) + # TODO: should the wait be done in any case ? 
+ # In the case where dst=None, the wait() is mandatory since a user will be doing arithmetic on dst afterwards + if dst is None: + evt.wait() + + if return_to_host: + if dst is not None: + res_tmp = self.d_gradient.get() + else: + res_tmp = np.zeros(self.shape, dtype=np.complex64) + cl.enqueue_copy(self.queue, res_tmp, grad_ref) + res = np.zeros((2,) + self.shape, dtype=np.float32) + res[0] = np.copy(res_tmp.real) + res[1] = np.copy(res_tmp.imag) + return res + else: + return dst + + def divergence(self, gradient, dst=None, return_to_host=False): + """ + Compute the spatial divergence of an image. + The divergence is designed to be the (negative) adjoint of the gradient. + + :param gradient: gradient-like array to compute the divergence from. It can be either a numpy.ndarray, a pyopencl Array or Buffer. + :param dst: optional, reference to a destination pyopencl Array or Buffer. It must be of complex64 data type. + :param return_to_host: optional, set to True if you want the result to be transferred back to host. + + if dst is provided, it should be of type numpy.complex64 ! + """ + n_y, n_x = np.int32(self.shape) + # numpy.ndarray gradients are expected to be (2, n_y, n_x) + if isinstance(gradient, np.ndarray): + gradient2 = np.zeros(self.shape, dtype=np.complex64) + gradient2.real = np.copy(gradient[0]) + gradient2.imag = np.copy(gradient[1]) + gradient = gradient2 + elif self.do_checks: + self.check_array(gradient, np.complex64, self.shape, "gradient") + if dst is not None: + self.check_array(dst, np.float32, self.shape, "dst") + grad_ref, img_ref = self.get_data_references(gradient, dst, self.d_gradient.data, self.d_image.data) + + # Prepare the kernel call + kernel_args = [ + grad_ref, + img_ref, + n_x, + n_y + ] + # Call the gradient kernel + evt = self.kernels.kern_divergence2D( + self.queue, + self.ndrange2D, + self.wg2D, + *kernel_args + ) + self.events.append(EventDescription("divergence2D", evt)) + # TODO: should the wait be done in any case ? 
+ # In the case where dst=None, the wait() is mandatory since a user will be doing arithmetic on dst afterwards + if dst is None: + evt.wait() + + if return_to_host: + if dst is not None: + res = self.d_image.get() + else: + res = np.zeros(self.shape, dtype=np.float32) + cl.enqueue_copy(self.queue, res, img_ref) + return res + else: + return dst diff --git a/src/silx/opencl/medfilt.py b/src/silx/opencl/medfilt.py new file mode 100644 index 0000000..d4e425b --- /dev/null +++ b/src/silx/opencl/medfilt.py @@ -0,0 +1,269 @@ +# -*- coding: utf-8 -*- +# +# Project: Azimuthal integration +# https://github.com/silx-kit/pyFAI +# +# Copyright (C) 2012-2017 European Synchrotron Radiation Facility, Grenoble, France +# +# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# . +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# . +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +"""A module for performing the 1d, 2d and 3d median filter ... 
+ +The target is to mimic the signature of scipy.signal.medfilt and scipy.medfilt2 + +The first implementation targets 2D implementation where this operation is costly (~10s/2kx2k image) +""" +from __future__ import absolute_import, print_function, with_statement, division + + +__author__ = "Jerome Kieffer" +__license__ = "MIT" +__date__ = "12/09/2017" +__copyright__ = "2012-2017, ESRF, Grenoble" +__contact__ = "jerome.kieffer@esrf.fr" + +import logging +import numpy +from collections import OrderedDict + +from .common import pyopencl, kernel_workgroup_size +from .processing import EventDescription, OpenclProcessing, BufferDescription + +if pyopencl: + mf = pyopencl.mem_flags +else: + raise ImportError("pyopencl is not installed") +logger = logging.getLogger(__name__) + + +class MedianFilter2D(OpenclProcessing): + """A class for doing median filtering using OpenCL""" + buffers = [ + BufferDescription("result", 1, numpy.float32, mf.WRITE_ONLY), + BufferDescription("image_raw", 1, numpy.float32, mf.READ_ONLY), + BufferDescription("image", 1, numpy.float32, mf.READ_WRITE), + ] + kernel_files = ["preprocess.cl", "bitonic.cl", "medfilt.cl"] + mapping = {numpy.int8: "s8_to_float", + numpy.uint8: "u8_to_float", + numpy.int16: "s16_to_float", + numpy.uint16: "u16_to_float", + numpy.uint32: "u32_to_float", + numpy.int32: "s32_to_float"} + + def __init__(self, shape, kernel_size=(3, 3), + ctx=None, devicetype="all", platformid=None, deviceid=None, + block_size=None, profile=False + ): + """Constructor of the OpenCL 2D median filtering class + + :param shape: shape of the images to treat + :param kernel size: 2-tuple of odd values + :param ctx: actual working context, left to None for automatic + initialization from device type or platformid/deviceid + :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL" + :param platformid: integer with the platform_identifier, as given by clinfo + :param deviceid: Integer with the device identifier, as given by clinfo + 
:param block_size: preferred workgroup size, may vary depending on the outpcome of the compilation + :param profile: switch on profiling to be able to profile at the kernel level, + store profiling elements (makes code slightly slower) + """ + OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype, + platformid=platformid, deviceid=deviceid, + block_size=block_size, profile=profile) + self.shape = shape + self.size = self.shape[0] * self.shape[1] + self.kernel_size = self.calc_kernel_size(kernel_size) + self.workgroup_size = (self.calc_wg(self.kernel_size), 1) # 3D kernel + self.buffers = [BufferDescription(i.name, i.size * self.size, i.dtype, i.flags) + for i in self.__class__.buffers] + + self.allocate_buffers() + self.local_mem = self._get_local_mem(self.workgroup_size[0]) + OpenclProcessing.compile_kernels(self, self.kernel_files, "-D NIMAGE=%i" % self.size) + self.set_kernel_arguments() + + def set_kernel_arguments(self): + """Parametrize all kernel arguments + """ + for val in self.mapping.values(): + self.cl_kernel_args[val] = OrderedDict(((i, self.cl_mem[i]) for i in ("image_raw", "image"))) + self.cl_kernel_args["medfilt2d"] = OrderedDict((("image", self.cl_mem["image"]), + ("result", self.cl_mem["result"]), + ("local", self.local_mem), + ("khs1", numpy.int32(self.kernel_size[0] // 2)), # Kernel half-size along dim1 (lines) + ("khs2", numpy.int32(self.kernel_size[1] // 2)), # Kernel half-size along dim2 (columns) + ("height", numpy.int32(self.shape[0])), # Image size along dim1 (lines) + ("width", numpy.int32(self.shape[1])))) +# ('debug', self.cl_mem["debug"]))) # Image size along dim2 (columns)) + + def _get_local_mem(self, wg): + return pyopencl.LocalMemory(wg * 32) # 4byte per float, 8 element per thread + + def send_buffer(self, data, dest): + """Send a numpy array to the device, including the cast on the device if possible + + :param data: numpy array with data + :param dest: name of the buffer as registered in the class + """ + + dest_type = 
numpy.dtype([i.dtype for i in self.buffers if i.name == dest][0]) + events = [] + if (data.dtype == dest_type) or (data.dtype.itemsize > dest_type.itemsize): + copy_image = pyopencl.enqueue_copy(self.queue, self.cl_mem[dest], numpy.ascontiguousarray(data, dest_type)) + events.append(EventDescription("copy H->D %s" % dest, copy_image)) + else: + copy_image = pyopencl.enqueue_copy(self.queue, self.cl_mem["image_raw"], numpy.ascontiguousarray(data)) + kernel = getattr(self.program, self.mapping[data.dtype.type]) + cast_to_float = kernel(self.queue, (self.size,), None, self.cl_mem["image_raw"], self.cl_mem[dest]) + events += [EventDescription("copy H->D %s" % dest, copy_image), EventDescription("cast to float", cast_to_float)] + if self.profile: + self.events += events + + def calc_wg(self, kernel_size): + """calculate and return the optimal workgroup size for the first dimension, taking into account + the 8-height band + + :param kernel_size: 2-tuple of int, shape of the median window + :return: optimal workgroup size + """ + needed_threads = ((kernel_size[0] + 7) // 8) * kernel_size[1] + if needed_threads < 8: + wg = 8 + elif needed_threads < 32: + wg = 32 + else: + wg = 1 << (int(needed_threads).bit_length()) + return wg + + def medfilt2d(self, image, kernel_size=None): + """Actually apply the median filtering on the image + + :param image: numpy array with the image + :param kernel_size: 2-tuple if + :return: median-filtered 2D image + + + Nota: for window size 1x1 -> 7x7 up to 49 / 64 elements in 8 threads, 8elt/th + 9x9 -> 15x15 up to 225 / 256 elements in 32 threads, 8elt/th + 17x17 -> 21x21 up to 441 / 512 elements in 64 threads, 8elt/th + + TODO: change window size on the fly, + + + """ + events = [] + if kernel_size is None: + kernel_size = self.kernel_size + else: + kernel_size = self.calc_kernel_size(kernel_size) + kernel_half_size = kernel_size // numpy.int32(2) + # this is the workgroup size + wg = self.calc_wg(kernel_size) + + # check for valid work 
group size: + amws = kernel_workgroup_size(self.program, "medfilt2d") + logger.warning("max actual workgroup size: %s, expected: %s", amws, wg) + if wg > amws: + raise RuntimeError("Workgroup size is too big for medfilt2d: %s>%s" % (wg, amws)) + + localmem = self._get_local_mem(wg) + + assert image.ndim == 2, "Treat only 2D images" + assert image.shape[0] <= self.shape[0], "height is OK" + assert image.shape[1] <= self.shape[1], "width is OK" + + with self.sem: + self.send_buffer(image, "image") + + kwargs = self.cl_kernel_args["medfilt2d"] + kwargs["local"] = localmem + kwargs["khs1"] = kernel_half_size[0] + kwargs["khs2"] = kernel_half_size[1] + kwargs["height"] = numpy.int32(image.shape[0]) + kwargs["width"] = numpy.int32(image.shape[1]) +# for k, v in kwargs.items(): +# print("%s: %s (%s)" % (k, v, type(v))) + mf2d = self.kernels.medfilt2d(self.queue, + (wg, image.shape[1]), + (wg, 1), *list(kwargs.values())) + events.append(EventDescription("median filter 2d", mf2d)) + + result = numpy.empty(image.shape, numpy.float32) + ev = pyopencl.enqueue_copy(self.queue, result, self.cl_mem["result"]) + events.append(EventDescription("copy D->H result", ev)) + ev.wait() + if self.profile: + self.events += events + return result + __call__ = medfilt2d + + @staticmethod + def calc_kernel_size(kernel_size): + """format the kernel size to be a 2-length numpy array of int32 + """ + kernel_size = numpy.asarray(kernel_size, dtype=numpy.int32) + if kernel_size.shape == (): + kernel_size = numpy.repeat(kernel_size.item(), 2).astype(numpy.int32) + for size in kernel_size: + if (size % 2) != 1: + raise ValueError("Each element of kernel_size should be odd.") + return kernel_size + + +class _MedFilt2d(object): + median_filter = None + + @classmethod + def medfilt2d(cls, ary, kernel_size=3): + """Median filter a 2-dimensional array. + + Apply a median filter to the `input` array using a local window-size + given by `kernel_size` (must be odd). 
+ + :param ary: A 2-dimensional input array. + :param kernel_size: A scalar or a list of length 2, giving the size of the + median filter window in each dimension. Elements of + `kernel_size` should be odd. If `kernel_size` is a scalar, + then this scalar is used as the size in each dimension. + Default is a kernel of size (3, 3). + :return: An array the same size as input containing the median filtered + result. always work on float32 values + + About the padding: + + * The filling mode in scipy.signal.medfilt2d is zero-padding + * This implementation is equivalent to: + scipy.ndimage.filters.median_filter(ary, kernel_size, mode="nearest") + + """ + image = numpy.atleast_2d(ary) + shape = numpy.array(image.shape) + if cls.median_filter is None: + cls.median_filter = MedianFilter2D(image.shape, kernel_size) + elif (numpy.array(cls.median_filter.shape) < shape).any(): + # enlarger the buffer size + new_shape = numpy.maximum(numpy.array(cls.median_filter.shape), shape) + ctx = cls.median_filter.ctx + cls.median_filter = MedianFilter2D(new_shape, kernel_size, ctx=ctx) + return cls.median_filter.medfilt2d(image, kernel_size=kernel_size) + +medfilt2d = _MedFilt2d.medfilt2d diff --git a/src/silx/opencl/processing.py b/src/silx/opencl/processing.py new file mode 100644 index 0000000..8b81f7f --- /dev/null +++ b/src/silx/opencl/processing.py @@ -0,0 +1,447 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Project: S I L X project +# https://github.com/silx-kit/silx +# +# Copyright (C) 2012-2018 European Synchrotron Radiation Facility, Grenoble, France +# +# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu) +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to 
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#

"""
Common OpenCL abstract base classes for different processing
"""

__author__ = "Jerome Kieffer"
__contact__ = "Jerome.Kieffer@ESRF.eu"
__license__ = "MIT"
__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
__date__ = "02/03/2021"
__status__ = "stable"

import sys
import os
import logging
import gc
from collections import namedtuple, OrderedDict
import numpy
import threading
from .common import ocl, pyopencl, release_cl_buffers, query_kernel_info, allocate_texture, check_textures_availability
from .utils import concatenate_cl_kernel
import platform

# name/size/dtype/flags of a device buffer to allocate
BufferDescription = namedtuple("BufferDescription", ["name", "size", "dtype", "flags"])
# (label, pyopencl event) pair kept for profiling
EventDescription = namedtuple("EventDescription", ["name", "event"])

logger = logging.getLogger(__name__)


class KernelContainer(object):
    """This object holds a copy of all kernels of a program, accessible as attributes"""

    def __init__(self, program):
        """Constructor of the class

        :param program: the OpenCL program as generated by PyOpenCL
        """
        self._program = program
        for kernel in program.all_kernels():
            self.__setattr__(kernel.function_name, kernel)

    def get_kernels(self):
        "return the dictionary with all kernels"
        # underscore-prefixed attributes (like _program) are not kernels
        return dict(item for item in self.__dict__.items()
                    if not item[0].startswith("_"))

    def get_kernel(self, name):
        "get a kernel from its name"
        logger.debug("KernelContainer.get_kernel(%s)", name)
        return self.__dict__.get(name)

    def max_workgroup_size(self, kernel_name):
        "Retrieve the compile time WORK_GROUP_SIZE for a given kernel"
        if isinstance(kernel_name, pyopencl.Kernel):
            kernel = kernel_name
        else:
            kernel = self.get_kernel(kernel_name)
        return query_kernel_info(self._program, kernel, "WORK_GROUP_SIZE")

    def min_workgroup_size(self, kernel_name):
        "Retrieve the compile time PREFERRED_WORK_GROUP_SIZE_MULTIPLE for a given kernel"
        if isinstance(kernel_name, pyopencl.Kernel):
            kernel = kernel_name
        else:
            kernel = self.get_kernel(kernel_name)
        return query_kernel_info(self._program, kernel, "PREFERRED_WORK_GROUP_SIZE_MULTIPLE")


class OpenclProcessing(object):
    """Abstract class for different types of OpenCL processing.

    This class provides:
    * Generation of the context, queues, profiling mode
    * Additional function to allocate/free all buffers declared as static attributes of the class
    * Functions to compile kernels, cache them and clean them
    * helper functions to clone the object
    """
    # Example of how to create an output buffer of 10 floats
    buffers = [BufferDescription("output", 10, numpy.float32, None),
               ]
    # list of kernel source files to be concatenated before compilation of the program
    kernel_files = []

    def __init__(self, ctx=None, devicetype="all", platformid=None, deviceid=None,
                 block_size=None, memory=None, profile=False):
        """Constructor of the abstract OpenCL processing class

        :param ctx: actual working context, left to None for automatic
                    initialization from device type or platformid/deviceid
        :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL"
        :param platformid: integer with the platform_identifier, as given by clinfo
        :param deviceid: Integer with the device identifier, as given by clinfo
        :param block_size: preferred workgroup size, may vary depending on the
                           outcome of the compilation
        :param memory: minimum memory available on device
        :param profile: switch on profiling to be able to profile at the kernel
                        level, store profiling elements (makes code slightly slower)
        """
        self.sem = threading.Semaphore()
        self._X87_VOLATILE = None
        self.profile = None  # actual value set by set_profiling() below
        self.events = []  # list of EventDescription, kept for profiling
        self.cl_mem = {}  # dict with all buffers allocated
        self.cl_program = None  # The actual OpenCL program
        self.cl_kernel_args = {}  # dict with all kernel arguments
        self.queue = None
        if ctx:
            self.ctx = ctx
        else:
            self.ctx = ocl.create_context(devicetype=devicetype,
                                          platformid=platformid, deviceid=deviceid,
                                          memory=memory)
        device_name = self.ctx.devices[0].name.strip()
        platform_name = self.ctx.devices[0].platform.name.strip()
        # local renamed from `platform` to avoid shadowing the platform module
        ocl_platform = ocl.get_platform(platform_name)
        self.device = ocl_platform.get_device(device_name)

        self.set_profiling(profile)
        self.block_size = block_size
        self.program = None
        self.kernels = None

    def check_textures_availability(self):
        "Return True if the device associated to the context supports textures"
        return check_textures_availability(self.ctx)

    def __del__(self):
        """Destructor: release all buffers and programs
        """
        try:
            self.reset_log()
            self.free_kernels()
            self.free_buffers()
            if self.queue is not None:
                self.queue.finish()
        except Exception as err:
            # interpreter may be shutting down: log, never raise from __del__
            logger.warning("%s: %s", type(err), err)
        self.queue = None
        self.device = None
        self.ctx = None
        gc.collect()

    def allocate_buffers(self, buffers=None, use_array=False):
        """
        Allocate OpenCL buffers required for a specific configuration

        :param buffers: a list of BufferDescriptions, leave to None for
                        parametrized buffers.
        :param use_array: allocate memory as pyopencl.array.Array
                          instead of pyopencl.Buffer

        Note that an OpenCL context also requires some memory, as well
        as Event and other OpenCL functionalities which cannot and are
        not taken into account here. The memory required by a context
        varies depending on the device. Typical for GTX580 is 65Mb but
        for a 9300m is ~15Mb In addition, a GPU will always have at
        least 3-5Mb of memory in use. Unfortunately, OpenCL does NOT
        have a built-in way to check the actual free memory on a
        device, only the total memory.
        """
        if buffers is None:
            buffers = self.buffers

        with self.sem:
            mem = {}

            # check if enough memory is available on the device
            ualloc = 0
            for buf in buffers:
                ualloc += numpy.dtype(buf.dtype).itemsize * numpy.prod(buf.size)
            logger.info("%.3fMB are needed on device: %s, which has %.3fMB",
                        ualloc / 1.0e6, self.device, self.device.memory / 1.0e6)

            if ualloc >= self.device.memory:
                raise MemoryError("Fatal error in allocate_buffers. Not enough "
                                  " device memory for buffers (%lu requested, %lu available)"
                                  % (ualloc, self.device.memory))

            # do the allocation
            try:
                if use_array:
                    for buf in buffers:
                        mem[buf.name] = pyopencl.array.empty(self.queue, buf.size, buf.dtype)
                else:
                    for buf in buffers:
                        size = numpy.dtype(buf.dtype).itemsize * numpy.prod(buf.size)
                        mem[buf.name] = pyopencl.Buffer(self.ctx, buf.flags, int(size))
            except pyopencl.MemoryError as error:
                # release the partially-allocated set before propagating
                release_cl_buffers(mem)
                raise MemoryError(error)

        self.cl_mem.update(mem)

    def add_to_cl_mem(self, parrays):
        """
        Add pyopencl.array, which are allocated by pyopencl, to self.cl_mem.
        This should be used before calling allocate_buffers().

        :param parrays: a dictionary of `pyopencl.array.Array` or `pyopencl.Buffer`
        """
        self.cl_mem.update(parrays)

    def check_workgroup_size(self, kernel_name):
        "Calculate the maximum workgroup size from given kernel after compilation"
        return self.kernels.max_workgroup_size(kernel_name)

    def free_buffers(self):
        """free all device memory allocated on the device
        """
        with self.sem:
            for key, buf in list(self.cl_mem.items()):
                if buf is not None:
                    if isinstance(buf, pyopencl.array.Array):
                        try:
                            buf.data.release()
                        except pyopencl.LogicError:
                            logger.error("Error while freeing buffer %s", key)
                    else:
                        try:
                            buf.release()
                        except pyopencl.LogicError:
                            logger.error("Error while freeing buffer %s", key)
                    self.cl_mem[key] = None

    def compile_kernels(self, kernel_files=None, compile_options=None):
        """Call the OpenCL compiler

        :param kernel_files: list of path to the kernel
                             (by default use the one declared in the class)
        :param compile_options: string of compile options
        """
        # concatenate all needed source files into a single openCL module
        kernel_files = kernel_files or self.kernel_files
        kernel_src = concatenate_cl_kernel(kernel_files)

        compile_options = compile_options or self.get_compiler_options()
        logger.info("Compiling file %s with options %s", kernel_files, compile_options)
        try:
            self.program = pyopencl.Program(self.ctx, kernel_src).build(options=compile_options)
        except (pyopencl.MemoryError, pyopencl.LogicError) as error:
            raise MemoryError(error)
        else:
            self.kernels = KernelContainer(self.program)

    def free_kernels(self):
        """Free all kernels
        """
        for kernel in self.cl_kernel_args:
            self.cl_kernel_args[kernel] = []
        self.kernels = None
        self.program = None

    def set_profiling(self, value=True):
        """Switch On/Off the profiling flag of the command queue to allow debugging

        :param value: set to True to enable profiling, or to False to disable it.
                      Without profiling, the processing is marginally faster

        Profiling information can then be retrieved with the 'log_profile' method
        """
        if bool(value) != self.profile:
            with self.sem:
                self.profile = bool(value)
                if self.queue is not None:
                    # finish the pending work before replacing the queue
                    self.queue.finish()
                if self.profile:
                    self.queue = pyopencl.CommandQueue(self.ctx,
                                                       properties=pyopencl.command_queue_properties.PROFILING_ENABLE)
                else:
                    self.queue = pyopencl.CommandQueue(self.ctx)

    def profile_add(self, event, desc):
        """
        Add an OpenCL event to the events lists, if profiling is enabled.

        :param event: pyopencl event to be recorded
        :param desc: event description (label shown by log_profile)
        """
        if self.profile:
            self.events.append(EventDescription(desc, event))

    def allocate_texture(self, shape, hostbuf=None, support_1D=False):
        "Allocate an OpenCL image ('texture') of the given shape on self.ctx"
        return allocate_texture(self.ctx, shape, hostbuf=hostbuf, support_1D=support_1D)

    def transfer_to_texture(self, arr, tex_ref):
        """
        Transfer an array to a texture.

        :param arr: Input array. Can be a numpy array or a pyopencl array.
        :param tex_ref: texture reference (pyopencl._cl.Image).
        """
        copy_args = [self.queue, tex_ref, arr]
        shp = arr.shape
        if arr.ndim == 1:
            # pyopencl and OpenCL < 1.2 do not support image1d_t:
            # force a 2D image with one row in this case
            shp = (1,) + shp
        # BUGFIX: origin must have the same dimensionality as region; the
        # previous code used the pre-promotion ndim, yielding a 1-element
        # origin with a 2-element region for 1D inputs.
        copy_kwargs = {"origin": (0,) * len(shp), "region": shp[::-1]}
        if not isinstance(arr, numpy.ndarray):  # assuming pyopencl.array.Array
            # D->D copy
            copy_args[2] = arr.data
            copy_kwargs["offset"] = 0
        ev = pyopencl.enqueue_copy(*copy_args, **copy_kwargs)
        self.profile_add(ev, "Transfer to texture")

    def log_profile(self, stats=False):
        """If we are in profiling mode, prints out all timing for every single OpenCL call

        :param stats: if True, prints the statistics on each kernel instead of all execution timings
        :return: list of lines to print
        """
        total_time = 0.0
        out = [""]
        if stats:
            stats = OrderedDict()
            out.append(f"OpenCL kernel profiling statistics in milliseconds for: {self.__class__.__name__}")
            out.append(f"{'Kernel name':>50} (count): min   median   max     mean     std")
        else:
            stats = None
            out.append(f"Profiling info for OpenCL: {self.__class__.__name__}")

        if self.profile:
            for e in self.events:
                if "__len__" in dir(e) and len(e) >= 2:
                    name = e[0]
                    pr = e[1].profile
                    t0 = pr.start
                    t1 = pr.end
                    et = 1e-6 * (t1 - t0)  # ns -> ms
                    total_time += et
                    if stats is None:
                        out.append(f"{name:>50} : {et:.3f}ms")
                    else:
                        if name in stats:
                            stats[name].append(et)
                        else:
                            stats[name] = [et]
        if stats is not None:
            for k, v in stats.items():
                n = numpy.array(v)
                out.append(f"{k:>50} ({len(v):5}): {n.min():8.3f} {numpy.median(n):8.3f} {n.max():8.3f} {n.mean():8.3f} {n.std():8.3f}")
        out.append("_" * 80)
        out.append(f"{'Total OpenCL execution time':>50} : {total_time:.3f}ms")

        logger.info(os.linesep.join(out))
        return out

    def reset_log(self):
        """
        Resets the profiling timers
        """
        with self.sem:
            self.events = []

    @property
    def x87_volatile_option(self):
        """Extra compiler flag needed when running 32-bit OpenCL with PoCL
        (x87 FPU), where Kahan summation requires `volatile`. Lazily computed
        and cached in self._X87_VOLATILE; empty string when not applicable."""
        if self._X87_VOLATILE is None:
            if (platform.machine() in ("i386", "i686", "x86_64", "AMD64") and
                    (tuple.__itemsize__ == 4) and
                    self.ctx.devices[0].platform.name == 'Portable Computing Language'):
                self._X87_VOLATILE = "-DX87_VOLATILE=volatile"
            else:
                self._X87_VOLATILE = ""
        return self._X87_VOLATILE

    def get_compiler_options(self, x87_volatile=False):
        """Provide the default OpenCL compiler options

        :param x87_volatile: needed for Kahan summation
        :return: string with compiler option
        """
        option_list = []
        if x87_volatile:
            option_list.append(self.x87_volatile_option)
        return " ".join(i for i in option_list if i)

# This should be implemented by concrete class
# def __copy__(self):
#     """Shallow copy of the object
#
#     :return: copy of the object
#     """
#     return self.__class__((self._data, self._indices, self._indptr),
#                           self.size, block_size=self.BLOCK_SIZE,
#                           platformid=self.platform.id,
#                           deviceid=self.device.id,
#                           checksum=self.on_device.get("data"),
#                           profile=self.profile, empty=self.empty)
#
# def __deepcopy__(self, memo=None):
#     """deep copy of the object
#
#     :return: deepcopy of the object
#     """
#     if memo is None:
#         memo = {}
#     new_csr = self._data.copy(), self._indices.copy(), self._indptr.copy()
#     memo[id(self._data)] = new_csr[0]
#     memo[id(self._indices)] = new_csr[1]
#     memo[id(self._indptr)] = new_csr[2]
#     new_obj = self.__class__(new_csr, self.size,
#                              block_size=self.BLOCK_SIZE,
#                              platformid=self.platform.id,
#                              deviceid=self.device.id,
#                              checksum=self.on_device.get("data"),
#                              profile=self.profile, empty=self.empty)
#     memo[id(self)] = new_obj
#     return new_obj
European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# ###########################################################################*/ +"""Module for tomographic projector on the GPU""" + +from __future__ import absolute_import, print_function, with_statement, division + +__authors__ = ["A. Mirone, P. 
class Projection(OpenclProcessing):
    """
    A class for performing a tomographic projection (Radon Transform) using
    OpenCL
    """
    kernel_files = ["proj.cl", "array_utils.cl"]
    # Emitted at import time: this class is experimental
    logger.warning("Forward projector is untested and unsupported for now")

    def __init__(self, slice_shape, angles, axis_position=None,
                 detector_width=None, normalize=False, ctx=None,
                 devicetype="all", platformid=None, deviceid=None,
                 profile=False):
        """Constructor of the OpenCL projector.

        :param slice_shape: shape of the slice: (num_rows, num_columns).
        :param angles: Either an integer number of angles, or a list of custom
                       angles values in radian.
        :param axis_position: Optional, axis position. Default is
                              `(shape[1]-1)/2.0`.
        :param detector_width: Optional, detector width in pixels.
                               If detector_width > slice_shape[1], the
                               projection data will be surrounded with zeros.
                               Using detector_width < slice_shape[1] might
                               result in a local tomography setup.
        :param normalize: Optional, normalization. If set, the sinograms are
                          multiplied by the factor pi/(2*nprojs).
        :param ctx: actual working context, left to None for automatic
                    initialization from device type or platformid/deviceid
        :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL"
        :param platformid: integer with the platform_identifier, as given by
                           clinfo
        :param deviceid: Integer with the device identifier, as given by clinfo
        :param profile: switch on profiling to be able to profile at the kernel
                        level, store profiling elements (makes code slightly
                        slower)
        """
        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
                                  platformid=platformid, deviceid=deviceid,
                                  profile=profile)
        self.shape = slice_shape
        self.axis_pos = axis_position
        self.angles = angles
        self.dwidth = detector_width
        self.normalize = normalize

        # Default values
        if self.axis_pos is None:
            self.axis_pos = (self.shape[1] - 1) / 2.
        if self.dwidth is None:
            self.dwidth = self.shape[1]
        if not np.iterable(self.angles):
            # integer (or None) -> generate evenly spaced angles on [0, pi)
            if self.angles is None:
                self.nprojs = self.shape[0]
            else:
                self.nprojs = self.angles
            self.angles = np.linspace(start=0,
                                      stop=np.pi,
                                      num=self.nprojs,
                                      endpoint=False).astype(dtype=np.float32)
        else:
            self.nprojs = len(self.angles)
        self.offset_x = -np.float32((self.shape[1] - 1) / 2. - self.axis_pos)  # TODO: custom
        self.offset_y = -np.float32((self.shape[0] - 1) / 2. - self.axis_pos)  # TODO: custom
        # Reset axis_pos once offsets are computed
        self.axis_pos0 = np.float64((self.shape[1] - 1) / 2.)

        # Workgroup, ndrange and shared size
        self.dimgrid_x = _idivup(self.dwidth, 16)
        self.dimgrid_y = _idivup(self.nprojs, 16)
        self._dimrecx = np.int32(self.dimgrid_x * 16)
        self._dimrecy = np.int32(self.dimgrid_y * 16)
        self.local_mem = 16 * 7 * _sizeof(np.float32)
        self.wg = (16, 16)
        self.ndrange = (
            int(self.dimgrid_x) * self.wg[0],  # int(): pyopencl <= 2015.1
            int(self.dimgrid_y) * self.wg[1]  # int(): pyopencl <= 2015.1
        )

        self._use_textures = self.check_textures_availability()

        # Allocate memory
        self.buffers = [
            BufferDescription("_d_sino", self._dimrecx * self._dimrecy, np.float32, mf.READ_WRITE),
            BufferDescription("d_angles", self._dimrecy, np.float32, mf.READ_ONLY),
            BufferDescription("d_beginPos", self._dimrecy * 2, np.int32, mf.READ_ONLY),
            BufferDescription("d_strideJoseph", self._dimrecy * 2, np.int32, mf.READ_ONLY),
            BufferDescription("d_strideLine", self._dimrecy * 2, np.int32, mf.READ_ONLY),
        ]
        d_axis_corrections = parray.empty(self.queue, self.nprojs, np.float32)
        d_axis_corrections.fill(np.float32(0.0))
        self.add_to_cl_mem({"d_axis_corrections": d_axis_corrections})
        # The slice is padded with a one-pixel border on each side
        self._tmp_extended_img = np.zeros((self.shape[0] + 2, self.shape[1] + 2),
                                          dtype=np.float32)
        if not self._use_textures:
            self.allocate_slice()
        else:
            self.allocate_textures()
        self.allocate_buffers()
        self._ex_sino = np.zeros((self._dimrecy, self._dimrecx),
                                 dtype=np.float32)
        if not self._use_textures:
            # enqueue_fill_buffer has issues if OpenCL 1.2 is not present,
            # hence the pyopencl.array fill
            self.cl_mem["d_slice"].fill(0.)
        # Precomputations
        self.compute_angles()
        self.proj_precomputations()
        self.cl_mem["d_axis_corrections"].fill(0.)
        # Shorthand
        self._d_sino = self.cl_mem["_d_sino"]

        compile_options = None
        if not self._use_textures:
            compile_options = "-DDONT_USE_TEXTURES"
        OpenclProcessing.compile_kernels(
            self,
            self.kernel_files,
            compile_options=compile_options
        )
        # check that workgroup can actually be (16, 16)
        self.compiletime_workgroup_size = self.kernels.max_workgroup_size("forward_kernel_cpu")

    def compute_angles(self):
        """Pad the angles to the gridded size and upload them to the device.

        The last valid angle is replicated in the padding region so the
        extra rows produce harmless projections.
        """
        angles2 = np.zeros(self._dimrecy, dtype=np.float32)  # dimrecy != num_projs
        angles2[:self.nprojs] = np.copy(self.angles)
        angles2[self.nprojs:] = angles2[self.nprojs - 1]
        self.angles2 = angles2
        pyopencl.enqueue_copy(self.queue, self.cl_mem["d_angles"], angles2)

    def allocate_slice(self):
        """Allocate the device buffer holding the zero-padded slice
        (used when textures are not available)."""
        ary = parray.empty(self.queue, (self.shape[1] + 2, self.shape[1] + 2), np.float32)
        ary.fill(0)
        self.add_to_cl_mem({"d_slice": ary})

    def allocate_textures(self):
        """Allocate the 2D image (texture) holding the zero-padded slice."""
        self.d_image_tex = pyopencl.Image(
            self.ctx,
            mf.READ_ONLY | mf.USE_HOST_PTR,
            pyopencl.ImageFormat(
                pyopencl.channel_order.INTENSITY,
                pyopencl.channel_type.FLOAT
            ), hostbuf=np.ascontiguousarray(self._tmp_extended_img.T),
        )

    def transfer_to_texture(self, image):
        """Transfer a host image to the device slice storage.

        :param image: 2D numpy array; converted to a C-contiguous float32
                      array if it is not one already.
        :return: the copy event (or the result of :meth:`transfer_to_slice`
                 when textures are not used)
        """
        image2 = image
        if not (image.flags["C_CONTIGUOUS"] and image.dtype == np.float32):
            # force the dtype as well: the device storage is float32, so
            # copying e.g. float64 bytes verbatim would corrupt the data
            image2 = np.ascontiguousarray(image, dtype=np.float32)
        if not self._use_textures:
            # TODO: create NoneEvent
            return self.transfer_to_slice(image2)
        else:
            # (1, 1) origin: skip the one-pixel zero border
            return pyopencl.enqueue_copy(
                self.queue,
                self.d_image_tex,
                image2,
                origin=(1, 1),
                region=image.shape[::-1]
            )

    def transfer_device_to_texture(self, d_image):
        """Copy a device buffer into the device slice storage.

        :param d_image: device buffer holding the slice data
        """
        if not self._use_textures:
            # TODO this copy should not be necessary
            return self.cpy2d_to_slice(d_image)
        else:
            return pyopencl.enqueue_copy(
                self.queue,
                self.d_image_tex,
                d_image,
                offset=0,
                origin=(1, 1),
                region=(int(self.shape[1]), int(self.shape[0]))  # self.shape[::-1] # pyopencl <= 2015.2
            )

    def transfer_to_slice(self, image):
        """Upload a host image into ``d_slice``, adding the one-pixel
        zero border expected by the kernels."""
        image2 = np.zeros((image.shape[0] + 2, image.shape[1] + 2), dtype=np.float32)
        image2[1:-1, 1:-1] = image.astype(np.float32)
        self.cl_mem["d_slice"].set(image2)

    def proj_precomputations(self):
        """Precompute, per projection angle, the ray traversal start
        position and strides used by the Joseph projection kernel, and
        upload them to the device.

        The four cases select the dominant traversal direction per
        angular quadrant (0.70710678 ~= cos(pi/4)).
        """
        beginPos = np.zeros((2, self._dimrecy), dtype=np.int32)
        strideJoseph = np.zeros((2, self._dimrecy), dtype=np.int32)
        strideLine = np.zeros((2, self._dimrecy), dtype=np.int32)
        cos_angles = np.cos(self.angles2)
        sin_angles = np.sin(self.angles2)
        dimslice = self.shape[1]

        M1 = np.abs(cos_angles) > 0.70710678
        M1b = np.logical_not(M1)
        M2 = cos_angles > 0
        M2b = np.logical_not(M2)
        M3 = sin_angles > 0
        M3b = np.logical_not(M3)
        case1 = M1 * M2
        case2 = M1 * M2b
        case3 = M1b * M3
        case4 = M1b * M3b

        beginPos[0][case1] = 0
        beginPos[1][case1] = 0
        strideJoseph[0][case1] = 1
        strideJoseph[1][case1] = 0
        strideLine[0][case1] = 0
        strideLine[1][case1] = 1

        beginPos[0][case2] = dimslice - 1
        beginPos[1][case2] = dimslice - 1
        strideJoseph[0][case2] = -1
        strideJoseph[1][case2] = 0
        strideLine[0][case2] = 0
        strideLine[1][case2] = -1

        beginPos[0][case3] = dimslice - 1
        beginPos[1][case3] = 0
        strideJoseph[0][case3] = 0
        strideJoseph[1][case3] = 1
        strideLine[0][case3] = -1
        strideLine[1][case3] = 0

        beginPos[0][case4] = 0
        beginPos[1][case4] = dimslice - 1
        strideJoseph[0][case4] = 0
        strideJoseph[1][case4] = -1
        strideLine[0][case4] = 1
        strideLine[1][case4] = 0

        pyopencl.enqueue_copy(self.queue, self.cl_mem["d_beginPos"], beginPos)
        pyopencl.enqueue_copy(self.queue, self.cl_mem["d_strideJoseph"], strideJoseph)
        pyopencl.enqueue_copy(self.queue, self.cl_mem["d_strideLine"], strideLine)

    def _get_local_mem(self):
        """Return the kernel's local memory block (constant for all image sizes)."""
        return pyopencl.LocalMemory(self.local_mem)

    def cpy2d_to_sino(self, dst):
        """Rectangular device-to-device copy of the sinogram into ``dst``.

        :param dst: destination pyopencl array of shape (nprojs, dwidth)
        :return: the copy event
        """
        ndrange = (int(self.dwidth), int(self.nprojs))  # pyopencl < 2015.2
        sino_shape_ocl = np.int32(ndrange)
        wg = None
        kernel_args = (
            dst.data,
            self._d_sino,
            np.int32(self.dwidth),
            np.int32(self._dimrecx),
            np.int32((0, 0)),
            np.int32((0, 0)),
            sino_shape_ocl
        )
        return self.kernels.cpy2d(self.queue, ndrange, wg, *kernel_args)

    def cpy2d_to_slice(self, src):
        """Copy a Nx * Ny slice to self.d_slice which is (Nx+2)*(Ny+2),
        leaving the one-pixel zero border untouched.

        :param src: source device buffer
        :return: the copy event
        """
        ndrange = (int(self.shape[1]), int(self.shape[0]))  # self.shape[::-1] # pyopencl < 2015.2
        wg = None
        slice_shape_ocl = np.int32(ndrange)
        kernel_args = (
            self.cl_mem["d_slice"].data,
            src,
            np.int32(self.shape[1] + 2),
            np.int32(self.shape[1]),
            np.int32((1, 1)),
            np.int32((0, 0)),
            slice_shape_ocl
        )
        return self.kernels.cpy2d(self.queue, ndrange, wg, *kernel_args)

    def projection(self, image=None, dst=None):
        """Perform the projection on an input image

        :param image: Image to project; if None, use the slice already on
                      the device.
        :param dst: Optional destination device array; if None, the result
                    is copied back to the host.
        :return: A sinogram (numpy array if dst is None, else dst)
        """
        events = []
        with self.sem:
            if image is not None:
                assert image.ndim == 2, "Treat only 2D images"
                assert image.shape[0] == self.shape[0], \
                    "image has %d rows, expected %d" % (image.shape[0], self.shape[0])
                assert image.shape[1] == self.shape[1], \
                    "image has %d columns, expected %d" % (image.shape[1], self.shape[1])
                if self._use_textures:
                    self.transfer_to_texture(image)
                    slice_ref = self.d_image_tex
                else:
                    self.transfer_to_slice(image)
                    slice_ref = self.cl_mem["d_slice"].data
            else:
                if not self._use_textures:
                    slice_ref = self.cl_mem["d_slice"].data
                else:
                    slice_ref = self.d_image_tex

            kernel_args = (
                self._d_sino,
                slice_ref,
                np.int32(self.shape[1]),
                np.int32(self.dwidth),
                self.cl_mem["d_angles"],
                np.float32(self.axis_pos0),
                self.cl_mem["d_axis_corrections"].data,  # TODO custom
                self.cl_mem["d_beginPos"],
                self.cl_mem["d_strideJoseph"],
                self.cl_mem["d_strideLine"],
                np.int32(self.nprojs),
                self._dimrecx,
                self._dimrecy,
                self.offset_x,
                self.offset_y,
                np.int32(1),  # josephnoclip, 1 by default
                np.int32(self.normalize)
            )

            # Call the kernel
            if not self._use_textures:
                event_pj = self.kernels.forward_kernel_cpu(
                    self.queue,
                    self.ndrange,
                    self.wg,
                    *kernel_args
                )
            else:
                event_pj = self.kernels.forward_kernel(
                    self.queue,
                    self.ndrange,
                    self.wg,
                    *kernel_args
                )
            events.append(EventDescription("projection", event_pj))
            if dst is None:
                self._ex_sino[:] = 0
                ev = pyopencl.enqueue_copy(self.queue, self._ex_sino, self._d_sino)
                events.append(EventDescription("copy D->H result", ev))
                ev.wait()
                # crop the gridded sinogram to its meaningful region
                res = np.copy(self._ex_sino[:self.nprojs, :self.dwidth])
            else:
                ev = self.cpy2d_to_sino(dst)
                events.append(EventDescription("copy D->D result", ev))
                ev.wait()
                res = dst
        # /with self.sem
        if self.profile:
            self.events += events
        return res

    __call__ = projection
class ReconstructionAlgorithm(OpenclProcessing):
    """Common base of the iterative tomographic reconstruction algorithms.

    :param sino_shape: shape of the sinogram. The sinogram is in the format
                       (n_b, n_a) where n_b is the number of detector bins
                       and n_a is the number of angles.
    :param slice_shape: Optional, shape of the reconstructed slice.
                        By default, it is a square slice where the dimension
                        is the "x dimension" of the sinogram (number of bins).
    :param axis_position: Optional, axis position. Default is
                          `(shape[1]-1)/2.0`.
    :param angles: Optional, a list of custom angles in radian.
    :param ctx: actual working context, left to None for automatic
                initialization from device type or platformid/deviceid
    :param platformid: integer with the platform_identifier, as given by clinfo
    :param deviceid: Integer with the device identifier, as given by clinfo
    :param profile: switch on profiling to be able to profile at the kernel
                    level, store profiling elements (makes code slightly slower)
    """

    def __init__(self, sino_shape, slice_shape=None, axis_position=None, angles=None,
                 ctx=None, devicetype="all", platformid=None, deviceid=None,
                 profile=False):
        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
                                  platformid=platformid, deviceid=deviceid,
                                  profile=profile)

        # The backprojector is built first: it determines the slice
        # geometry (slice_shape, angles, num_bins) that the projector
        # is then configured with.
        self.backprojector = Backprojection(
            sino_shape,
            slice_shape=slice_shape,
            axis_position=axis_position,
            angles=angles,
            ctx=self.ctx,
            profile=profile
        )
        self.projector = Projection(
            self.backprojector.slice_shape,
            self.backprojector.angles,
            axis_position=axis_position,
            detector_width=self.backprojector.num_bins,
            normalize=False,
            ctx=self.ctx,
            profile=profile
        )
        self.sino_shape = sino_shape
        self.is_cpu = self.backprojector.is_cpu

        def _device_zeros(shape):
            # zero-initialized float32 device array
            arr = parray.empty(self.queue, shape, dtype=np.float32)
            arr.fill(0.0)
            return arr

        # Device arrays: input data, current sinogram estimate,
        # current slice iterate and previous slice iterate
        self.d_data = _device_zeros(sino_shape)
        self.d_sino = _device_zeros(sino_shape)
        self.d_x = _device_zeros(self.backprojector.slice_shape)
        self.d_x_old = _device_zeros(self.backprojector.slice_shape)

        self.add_to_cl_mem({
            "d_data": self.d_data,
            "d_sino": self.d_sino,
            "d_x": self.d_x,
            "d_x_old": self.d_x_old,
        })

    def proj(self, d_slice, d_sino):
        """Forward-project device array ``d_slice`` into ``d_sino``."""
        self.projector.transfer_device_to_texture(d_slice.data)  # .wait()
        self.projector.projection(dst=d_sino)

    def backproj(self, d_sino, d_slice):
        """Backproject device array ``d_sino`` into ``d_slice``."""
        self.backprojector.transfer_device_to_texture(d_sino.data)  # .wait()
        self.backprojector.backprojection(dst=d_slice)
class TV(ReconstructionAlgorithm):
    """Total-Variation regularized reconstruction with the Chambolle-Pock
    primal-dual algorithm.

    :param sino_shape: shape of the sinogram. The sinogram is in the format
                       (n_b, n_a) where n_b is the number of detector bins
                       and n_a is the number of angles.
    :param slice_shape: Optional, shape of the reconstructed slice. By
                        default, it is a square slice where the dimension is
                        the "x dimension" of the sinogram (number of bins).
    :param axis_position: Optional, axis position. Default is
                          `(shape[1]-1)/2.0`.
    :param angles: Optional, a list of custom angles in radian.
    :param ctx: actual working context, left to None for automatic
                initialization from device type or platformid/deviceid
    :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL"
    :param platformid: integer with the platform_identifier, as given by clinfo
    :param deviceid: Integer with the device identifier, as given by clinfo
    :param profile: switch on profiling to be able to profile at the kernel
                    level, store profiling elements (makes code slightly slower)

    .. warning:: This is a beta version of the Chambolle-Pock TV algorithm.
                 Reconstruction fails for at least on CPU (Xeon E3-1245 v5)
                 using the AMD opencl implementation.
    """

    def __init__(self, sino_shape, slice_shape=None, axis_position=None, angles=None,
                 ctx=None, devicetype="all", platformid=None, deviceid=None,
                 profile=False):
        ReconstructionAlgorithm.__init__(self, sino_shape, slice_shape=slice_shape,
                                         axis_position=axis_position, angles=angles,
                                         ctx=ctx, devicetype=devicetype, platformid=platformid,
                                         deviceid=deviceid, profile=profile)
        self.compute_preconditioners()

        # Gradient/divergence operators live in a dedicated LinAlg instance
        self.linalg = LinAlg(self.backprojector.slice_shape, ctx=self.ctx)
        # Positivity constraint on the slice
        self.elwise_clamp = ElementwiseKernel(self.ctx, "float *a", "a[i] = max(a[i], 0.0f);")
        # Projection onto the L-infinity ball of radius Lambda
        self.elwise_proj_linf = ElementwiseKernel(
            self.ctx,
            "float2* a, float Lambda",
            "a[i].x = copysign(min(fabs(a[i].x), Lambda), a[i].x); a[i].y = copysign(min(fabs(a[i].y), Lambda), a[i].y);",
            "elwise_proj_linf"
        )
        # Run gradient once so that linalg.cl_mem["d_gradient"] exists and
        # can serve as a template for the dual variable p
        self.linalg.gradient(self.d_x)
        self.d_p = parray.empty_like(self.linalg.cl_mem["d_gradient"])
        self.d_q = parray.empty_like(self.d_data)
        self.d_g = self.linalg.d_image
        self.d_tmp = parray.empty_like(self.d_x)
        for dual_arr in (self.d_p, self.d_q, self.d_tmp):
            dual_arr.fill(0)
        self.add_to_cl_mem({
            "d_p": self.d_p,
            "d_q": self.d_q,
            "d_tmp": self.d_tmp,
        })

        # Over-relaxation parameter of Chambolle-Pock
        self.theta = 1.0

    def compute_preconditioners(self):
        """Build the diagonal preconditioners "Sigma" and "Tau".

        Each diagonal term is derived from the row/column sums of the
        projector/backprojector, i.e. the projection/backprojection of an
        array of ones [2].

        [2] T. Pock, A. Chambolle,
            Diagonal preconditioning for first order primal-dual algorithms in
            convex optimization,
            International Conference on Computer Vision, 2011
        """
        # Diagonal preconditioner "Sigma" (dual step sizes)
        ones_slice = np.ones(self.backprojector.slice_shape, dtype=np.float32)
        Sigma_k = 1. / self.projector.projection(ones_slice)
        # Guard against division by zero when the rotation axis is excentred
        Sigma_k[np.logical_not(np.isfinite(Sigma_k))] = 1.
        self.d_Sigma_k = parray.to_device(self.queue, Sigma_k)
        self.d_Sigma_kp1 = self.d_Sigma_k + 1  # TODO: memory vs computation
        # For the discrete gradient, sum|D_i,j| = 2 along lines or cols
        self.Sigma_grad = 1 / 2.0

        # Diagonal preconditioner "Tau" (primal step sizes)
        ones_sino = np.ones(self.sino_shape, dtype=np.float32)
        backproj_of_ones = self.backprojector.backprojection(ones_sino)
        Tau = 1. / (backproj_of_ones + 2.)
        self.d_Tau = parray.to_device(self.queue, Tau)

        self.add_to_cl_mem({
            "d_Sigma_k": self.d_Sigma_k,
            "d_Sigma_kp1": self.d_Sigma_kp1,
            "d_Tau": self.d_Tau
        })

    def run(self, data, n_it, Lambda, pos_constraint=False):
        """
        Run n_it iterations of the TV-regularized reconstruction,
        with the regularization parameter Lambda.
        """
        cl.enqueue_copy(self.queue, self.d_data.data, np.ascontiguousarray(data.astype(np.float32)))

        # Short aliases on the device arrays
        d_x = self.d_x
        d_x_old = self.d_x_old
        d_tmp = self.d_tmp
        d_sino = self.d_sino
        d_p = self.d_p
        d_q = self.d_q
        d_g = self.d_g  # aliases self.linalg.d_image

        d_x *= 0
        d_p *= 0
        d_q *= 0

        for _ in range(n_it):
            # --- Primal update: x = x + Tau*div(p) - Tau*Kadj(q)
            d_x_old[:] = d_x[:]
            self.backproj(d_q, d_tmp)
            self.linalg.divergence(d_p)
            # TODO: this in less than three ops (one kernel ?)
            d_g -= d_tmp  # d_g -> L.d_image
            d_g *= self.d_Tau
            d_x += d_g

            if pos_constraint:
                self.elwise_clamp(d_x)

            # --- Dual update (p):
            # p = proj_linf(p + Sigma_grad*gradient(x + theta*(x - x_old)), Lambda)
            d_tmp[:] = d_x[:]
            # FIXME: mul_add is out of place, put an equivalent thing in linalg...
            # d_tmp.mul_add(1 + theta, d_x_old, -theta)
            d_tmp *= 1 + self.theta
            d_tmp -= self.theta * d_x_old
            self.linalg.gradient(d_tmp)
            # TODO: out of place mul_add
            # d_p.mul_add(1, L.cl_mem["d_gradient"], Sigma_grad)
            self.linalg.cl_mem["d_gradient"] *= self.Sigma_grad
            d_p += self.linalg.cl_mem["d_gradient"]
            self.elwise_proj_linf(d_p, Lambda)

            # --- Dual update (q):
            # q = (q + Sigma_k*K(x + theta*(x - x_old)) - Sigma_k*data)/(1.0 + Sigma_k)
            self.proj(d_tmp, d_sino)
            # TODO: this in less instructions
            d_sino -= self.d_data
            d_sino *= self.d_Sigma_k
            d_q += d_sino
            d_q /= self.d_Sigma_kp1
        return d_x

    __call__ = run
def configuration(parent_package='', top_path=None):
    """Return the numpy.distutils configuration of the ``opencl`` package.

    :param parent_package: name of the parent package
    :param top_path: path to the project root
    :return: a ``Configuration`` instance declaring the sub-packages
    """
    config = Configuration('opencl', parent_package, top_path)
    # 'sift' is optional: declare it only when it is present on disk
    # (it may be stripped from some source distributions)
    package_dir = os.path.dirname(os.path.abspath(__file__))
    if os.path.exists(os.path.join(package_dir, 'sift')):
        config.add_subpackage('sift')
    config.add_subpackage('codec')
    config.add_subpackage('test')
    return config


if __name__ == "__main__":
    from numpy.distutils.core import setup
    setup(configuration=configuration)
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# ###########################################################################*/ +"""Module for sinogram filtering on CPU/GPU.""" + +from __future__ import absolute_import, print_function, with_statement, division + +__authors__ = ["P. Paleo"] +__license__ = "MIT" +__date__ = "07/06/2019" + +import numpy as np +from math import pi + + +import pyopencl.array as parray +from .common import pyopencl as cl +from .processing import OpenclProcessing +from ..math.fft.clfft import CLFFT, __have_clfft__ +from ..math.fft.npfft import NPFFT +from ..image.tomography import generate_powers, get_next_power, compute_fourier_filter +from ..utils.deprecation import deprecated + + + +class SinoFilter(OpenclProcessing): + """A class for performing sinogram filtering on GPU using OpenCL. + + This is a convolution in the Fourier space, along one dimension: + + - In 2D: (n_a, d_x): n_a filterings (1D FFT of size d_x) + - In 3D: (n_z, n_a, d_x): n_z*n_a filterings (1D FFT of size d_x) + """ + kernel_files = ["array_utils.cl"] + powers = generate_powers() + + def __init__(self, sino_shape, filter_name=None, ctx=None, + devicetype="all", platformid=None, deviceid=None, + profile=False, extra_options=None): + """Constructor of OpenCL FFT-Convolve. + + :param sino_shape: shape of the sinogram. + :param filter_name: Name of the filter. Defaut is "ram-lak". 
+ :param ctx: actual working context, left to None for automatic + initialization from device type or platformid/deviceid + :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL" + :param platformid: integer with the platform_identifier, as given by + clinfo + :param deviceid: Integer with the device identifier, as given by clinfo + :param profile: switch on profiling to be able to profile at the kernel + level, store profiling elements (makes code slightly + slower) + :param dict extra_options: Advanced extra options. + Current options are: cutoff, use_numpy_fft + """ + OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype, + platformid=platformid, deviceid=deviceid, + profile=profile) + + self._init_extra_options(extra_options) + self._calculate_shapes(sino_shape) + self._init_fft() + self._allocate_memory() + self._compute_filter(filter_name) + self._init_kernels() + + def _calculate_shapes(self, sino_shape): + """ + + :param sino_shape: shape of the sinogram. + """ + self.ndim = len(sino_shape) + if self.ndim == 2: + n_angles, dwidth = sino_shape + else: + raise ValueError("Invalid sinogram number of dimensions: " + "expected 2 dimensions") + self.sino_shape = sino_shape + self.n_angles = n_angles + self.dwidth = dwidth + self.dwidth_padded = get_next_power(2 * self.dwidth, powers=self.powers) + self.sino_padded_shape = (n_angles, self.dwidth_padded) + sino_f_shape = list(self.sino_padded_shape) + sino_f_shape[-1] = sino_f_shape[-1] // 2 + 1 + self.sino_f_shape = tuple(sino_f_shape) + + def _init_extra_options(self, extra_options): + """ + + :param dict extra_options: Advanced extra options. 
+ Current options are: cutoff, + """ + self.extra_options = { + "cutoff": 1., + "use_numpy_fft": False, + } + if extra_options is not None: + self.extra_options.update(extra_options) + + def _init_fft(self): + if __have_clfft__ and not(self.extra_options["use_numpy_fft"]): + self.fft_backend = "opencl" + self.fft = CLFFT( + self.sino_padded_shape, + dtype=np.float32, + axes=(-1,), + ctx=self.ctx, + ) + else: + self.fft_backend = "numpy" + print("The gpyfft module was not found. The Fourier transforms " + "will be done on CPU. For more performances, it is advised " + "to install gpyfft.""") + self.fft = NPFFT( + template=np.zeros(self.sino_padded_shape, "f"), + axes=(-1,), + ) + + def _allocate_memory(self): + self.d_filter_f = parray.zeros(self.queue, (self.sino_f_shape[-1],), np.complex64) + self.is_cpu = (self.device.type == "CPU") + # These are already allocated by FFT() if using the opencl backend + if self.fft_backend == "opencl": + self.d_sino_padded = self.fft.data_in + self.d_sino_f = self.fft.data_out + else: + # When using the numpy backend, arrays are not pre-allocated + self.d_sino_padded = np.zeros(self.sino_padded_shape, "f") + self.d_sino_f = np.zeros(self.sino_f_shape, np.complex64) + # These are needed for rectangular memcpy in certain cases (see below). + self.tmp_sino_device = parray.zeros(self.queue, self.sino_shape, "f") + self.tmp_sino_host = np.zeros(self.sino_shape, "f") + + def _compute_filter(self, filter_name): + """ + + :param str filter_name: filter name + """ + self.filter_name = filter_name or "ram-lak" + filter_f = compute_fourier_filter( + self.dwidth_padded, + self.filter_name, + cutoff=self.extra_options["cutoff"], + )[:self.dwidth_padded // 2 + 1] # R2C + self.set_filter(filter_f, normalize=True) + + def set_filter(self, h_filt, normalize=True): + """ + Set a filter for sinogram filtering. + + :param h_filt: Filter. Each line of the sinogram will be filtered with + this filter. 
It has to be the Real-to-Complex Fourier Transform + of some real filter, padded to 2*sinogram_width. + :param normalize: Whether to normalize the filter with pi/num_angles. + """ + if h_filt.size != self.sino_f_shape[-1]: + raise ValueError( + """ + Invalid filter size: expected %d, got %d. + Please check that the filter is the Fourier R2C transform of + some real 1D filter. + """ + % (self.sino_f_shape[-1], h_filt.size) + ) + if not(np.iscomplexobj(h_filt)): + print("Warning: expected a complex Fourier filter") + self.filter_f = h_filt + if normalize: + self.filter_f *= pi / self.n_angles + self.filter_f = self.filter_f.astype(np.complex64) + self.d_filter_f[:] = self.filter_f[:] + + def _init_kernels(self): + OpenclProcessing.compile_kernels(self, self.kernel_files) + h, w = self.d_sino_f.shape + self.mult_kern_args = (self.queue, (int(w), (int(h))), None, + self.d_sino_f.data, + self.d_filter_f.data, + np.int32(w), + np.int32(h)) + + def check_array(self, arr): + if arr.dtype != np.float32: + raise ValueError("Expected data type = numpy.float32") + if arr.shape != self.sino_shape: + raise ValueError("Expected sinogram shape %s, got %s" % + (self.sino_shape, arr.shape)) + if not(isinstance(arr, np.ndarray) or isinstance(arr, parray.Array)): + raise ValueError("Expected either numpy.ndarray or " + "pyopencl.array.Array") + + def copy2d(self, dst, src, transfer_shape, dst_offset=(0, 0), + src_offset=(0, 0)): + """ + + :param dst: + :param src: + :param transfer_shape: + :param dst_offset: + :param src_offset: + """ + shape = tuple(int(i) for i in transfer_shape[::-1]) + ev = self.kernels.cpy2d(self.queue, shape, None, + dst.data, + src.data, + np.int32(dst.shape[1]), + np.int32(src.shape[1]), + np.int32(dst_offset), + np.int32(src_offset), + np.int32(transfer_shape[::-1])) + ev.wait() + + def copy2d_host(self, dst, src, transfer_shape, dst_offset=(0, 0), + src_offset=(0, 0)): + """ + + :param dst: + :param src: + :param transfer_shape: + :param dst_offset: + 
:param src_offset: + """ + s = transfer_shape + do = dst_offset + so = src_offset + dst[do[0]:do[0] + s[0], do[1]:do[1] + s[1]] = src[so[0]:so[0] + s[0], so[1]:so[1] + s[1]] + + def _prepare_input_sino(self, sino): + """ + :param sino: sinogram + """ + self.check_array(sino) + self.d_sino_padded.fill(0) + if self.fft_backend == "opencl": + # OpenCL backend: FFT/mult/IFFT are done on device. + if isinstance(sino, np.ndarray): + # OpenCL backend + numpy input: copy H->D. + # As pyopencl does not support rectangular copies, we have to + # do a copy H->D in a temporary device buffer, and then call a + # kernel doing the rectangular D-D copy. + self.tmp_sino_device[:] = sino[:] + if self.is_cpu: + self.tmp_sino_device.finish() + d_sino_ref = self.tmp_sino_device + else: + d_sino_ref = sino + # Rectangular copy D->D + self.copy2d(self.d_sino_padded, d_sino_ref, self.sino_shape) + if self.is_cpu: + self.d_sino_padded.finish() # should not be required here + else: + # Numpy backend: FFT/mult/IFFT are done on host. + if not(isinstance(sino, np.ndarray)): + # Numpy backend + pyopencl input: need to copy D->H + self.tmp_sino_host[:] = sino[:] + h_sino_ref = self.tmp_sino_host + else: + h_sino_ref = sino + # Rectangular copy H->H + self.copy2d_host(self.d_sino_padded, h_sino_ref, self.sino_shape) + + def _get_output_sino(self, output): + """ + :param Union[numpy.dtype,None] output: sinogram output. + :return: sinogram + """ + if output is None: + res = np.zeros(self.sino_shape, dtype=np.float32) + else: + res = output + if self.fft_backend == "opencl": + if isinstance(res, np.ndarray): + # OpenCL backend + numpy output: copy D->H + # As pyopencl does not support rectangular copies, we first have + # to call a kernel doing rectangular copy D->D, then do a copy + # D->H. 
+ self.copy2d(dst=self.tmp_sino_device, + src=self.d_sino_padded, + transfer_shape=self.sino_shape) + if self.is_cpu: + self.tmp_sino_device.finish() # should not be required here + res[:] = self.tmp_sino_device.get()[:] + else: + if self.is_cpu: + self.d_sino_padded.finish() + self.copy2d(res, self.d_sino_padded, self.sino_shape) + if self.is_cpu: + res.finish() # should not be required here + else: + if not(isinstance(res, np.ndarray)): + # Numpy backend + pyopencl output: rect copy H->H + copy H->D + self.copy2d_host(dst=self.tmp_sino_host, + src=self.d_sino_padded, + transfer_shape=self.sino_shape) + res[:] = self.tmp_sino_host[:] + else: + # Numpy backend + numpy output: rect copy H->H + self.copy2d_host(res, self.d_sino_padded, self.sino_shape) + return res + + def _do_fft(self): + if self.fft_backend == "opencl": + self.fft.fft(self.d_sino_padded, output=self.d_sino_f) + if self.is_cpu: + self.d_sino_f.finish() + else: + # numpy backend does not support "output=" argument, + # and rfft always return a complex128 result. + res = self.fft.fft(self.d_sino_padded).astype(np.complex64) + self.d_sino_f[:] = res[:] + + def _multiply_fourier(self): + if self.fft_backend == "opencl": + # Everything is on device. Call the multiplication kernel. + ev = self.kernels.inplace_complex_mul_2Dby1D( + *self.mult_kern_args + ) + ev.wait() + if self.is_cpu: + self.d_sino_f.finish() # should not be required here + else: + # Everything is on host. + self.d_sino_f *= self.filter_f + + def _do_ifft(self): + if self.fft_backend == "opencl": + if self.is_cpu: + self.d_sino_padded.fill(0) + self.d_sino_padded.finish() + self.fft.ifft(self.d_sino_f, output=self.d_sino_padded) + if self.is_cpu: + self.d_sino_padded.finish() + else: + # numpy backend does not support "output=" argument, + # and irfft always return a float64 result. 
+ res = self.fft.ifft(self.d_sino_f).astype("f") + self.d_sino_padded[:] = res[:] + + def filter_sino(self, sino, output=None): + """ + + :param sino: sinogram + :param output: + :return: filtered sinogram + """ + # Handle input sinogram + self._prepare_input_sino(sino) + # FFT + self._do_fft() + # multiply with filter in the Fourier domain + self._multiply_fourier() + # iFFT + self._do_ifft() + # return + res = self._get_output_sino(output) + return res + # ~ return output + + __call__ = filter_sino + + + + +# ------------------- +# - Compatibility - +# ------------------- + + +def nextpow2(N): + p = 1 + while p < N: + p *= 2 + return p + + +@deprecated(replacement="Backprojection.sino_filter", since_version="0.10") +def fourier_filter(sino, filter_=None, fft_size=None): + """Simple np based implementation of fourier space filter. + This function is deprecated, please use silx.opencl.sinofilter.SinoFilter. + + :param sino: of shape shape = (num_projs, num_bins) + :param filter: filter function to apply in fourier space + :fft_size: size on which perform the fft. May be larger than the sino array + :return: filtered sinogram + """ + assert sino.ndim == 2 + num_projs, num_bins = sino.shape + if fft_size is None: + fft_size = nextpow2(num_bins * 2 - 1) + else: + assert fft_size >= num_bins + if fft_size == num_bins: + sino_zeropadded = sino.astype(np.float32) + else: + sino_zeropadded = np.zeros((num_projs, fft_size), + dtype=np.complex64) + sino_zeropadded[:, :num_bins] = sino.astype(np.float32) + + if filter_ is None: + h = np.zeros(fft_size, dtype=np.float32) + L2 = fft_size // 2 + 1 + h[0] = 1 / 4. + j = np.linspace(1, L2, L2 // 2, False) + h[1:L2:2] = -1. 
/ (np.pi ** 2 * j ** 2) + h[L2:] = np.copy(h[1:L2 - 1][::-1]) + filter_ = np.fft.fft(h).astype(np.complex64) + + # Linear convolution + sino_f = np.fft.fft(sino, fft_size) + sino_f = sino_f * filter_ + sino_filtered = np.fft.ifft(sino_f)[:, :num_bins].real + + return np.ascontiguousarray(sino_filtered.real, dtype=np.float32) diff --git a/src/silx/opencl/sparse.py b/src/silx/opencl/sparse.py new file mode 100644 index 0000000..514589a --- /dev/null +++ b/src/silx/opencl/sparse.py @@ -0,0 +1,377 @@ +#!/usr/bin/env python +# coding: utf-8 +# /*########################################################################## +# +# Copyright (c) 2019 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+# +# ###########################################################################*/ +"""Module for data sparsification on CPU/GPU.""" + +from __future__ import absolute_import, print_function, with_statement, division + +__authors__ = ["P. Paleo"] +__license__ = "MIT" +__date__ = "07/06/2019" + +import numpy +import pyopencl.array as parray +from collections import namedtuple +from pyopencl.scan import GenericScanKernel +from pyopencl.tools import dtype_to_ctype +from .common import pyopencl as cl +from .processing import OpenclProcessing, EventDescription, BufferDescription +mf = cl.mem_flags + + +CSRData = namedtuple("CSRData", ["data", "indices", "indptr"]) + +def tuple_to_csrdata(arrs): + """ + Converts a 3-tuple to a CSRData namedtuple. + """ + if arrs is None: + return None + return CSRData(data=arrs[0], indices=arrs[1], indptr=arrs[2]) + + + +class CSR(OpenclProcessing): + kernel_files = ["sparse.cl"] + + def __init__(self, shape, dtype="f", max_nnz=None, idx_dtype=numpy.int32, + ctx=None, devicetype="all", platformid=None, deviceid=None, + block_size=None, memory=None, profile=False): + """ + Compute Compressed Sparse Row format of an image (2D matrix). + It is designed to be compatible with scipy.sparse.csr_matrix. + + :param shape: tuple + Matrix shape. + :param dtype: str or numpy.dtype, optional + Numeric data type. By default, sparse matrix data will be float32. + :param max_nnz: int, optional + Maximum number of non-zero elements. By default, the arrays "data" + and "indices" are allocated with prod(shape) elements, but + in practice a much lesser space is needed. + The number of non-zero items cannot be known in advance, but one can + estimate an upper-bound with this parameter to save memory. + + Opencl processing parameters + ----------------------------- + Please refer to the documentation of silx.opencl.processing.OpenclProcessing + for information on the other parameters. 
+ """ + + OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype, + platformid=platformid, deviceid=deviceid, + block_size=block_size, memory=memory, + profile=profile) + self._set_parameters(shape, dtype, max_nnz, idx_dtype) + self._allocate_memory() + self._setup_kernels() + + # -------------------------------------------------------------------------- + # -------------------------- Initialization -------------------------------- + # -------------------------------------------------------------------------- + + def _set_parameters(self, shape, dtype, max_nnz, idx_dtype): + self.shape = shape + self.size = numpy.prod(shape) + self._set_idx_dtype(idx_dtype) + assert len(shape) == 2 # + if max_nnz is None: + self.max_nnz = numpy.prod(shape) # worst case + else: + self.max_nnz = int(max_nnz) + self._set_dtype(dtype) + + + def _set_idx_dtype(self, idx_dtype): + idx_dtype = numpy.dtype(idx_dtype) + if idx_dtype.kind not in ["i", "u"]: + raise ValueError("Not an integer type: %s" % idx_dtype) + # scan value type must have size divisible by 4 bytes + if idx_dtype.itemsize % 4 != 0: + raise ValueError("Due to an internal pyopencl limitation, idx_dtype type must have size divisible by 4 bytes") + self.indice_dtype = idx_dtype # + + + def _set_dtype(self, dtype): + self.dtype = numpy.dtype(dtype) + if self.dtype.kind == "c": + raise ValueError("Complex data is not supported") + if self.dtype == numpy.dtype(numpy.float32): + self._c_zero_str = "0.0f" + elif self.dtype == numpy.dtype(numpy.float64): + self._c_zero_str = "0.0" + else: # assuming integer + self._c_zero_str = "0" + self.c_dtype = dtype_to_ctype(self.dtype) + self.idx_c_dtype = dtype_to_ctype(self.indice_dtype) + + + def _allocate_memory(self): + self.is_cpu = (self.device.type == "CPU") # move to OpenclProcessing ? 
+ self.buffers = [ + BufferDescription("array", (self.size,), self.dtype, mf.READ_ONLY), + BufferDescription("data", (self.max_nnz,), self.dtype, mf.READ_WRITE), + BufferDescription("indices", (self.max_nnz,), self.indice_dtype, mf.READ_WRITE), + BufferDescription("indptr", (self.shape[0]+1,), self.indice_dtype, mf.READ_WRITE), + ] + self.allocate_buffers(use_array=True) + for arr_name in ["array", "data", "indices", "indptr"]: + setattr(self, arr_name, self.cl_mem[arr_name]) + self.cl_mem[arr_name].fill(0) # allocate_buffers() uses empty() + self._old_array = self.array + self._old_data = self.data + self._old_indices = self.indices + self._old_indptr = self.indptr + + + def _setup_kernels(self): + self._setup_compaction_kernel() + self._setup_decompaction_kernel() + + + def _setup_compaction_kernel(self): + kernel_signature = str( + "__global %s *data, \ + __global %s *data_compacted, \ + __global %s *indices, \ + __global %s* indptr \ + """ % (self.c_dtype, self.c_dtype, self.idx_c_dtype, self.idx_c_dtype) + ) + if self.dtype.kind == "f": + map_nonzero_expr = "(fabs(data[i]) > %s) ? 1 : 0" % self._c_zero_str + elif self.dtype.kind in ["u", "i"]: + map_nonzero_expr = "(data[i] != %s) ? 
1 : 0" % self._c_zero_str + else: + raise ValueError("Unknown data type") + + self.scan_kernel = GenericScanKernel( + self.ctx, self.indice_dtype, + arguments=kernel_signature, + input_expr=map_nonzero_expr, + scan_expr="a+b", neutral="0", + output_statement=""" + // item is the running sum of input_expr(i), i.e the cumsum of "nonzero" + if (prev_item != item) { + data_compacted[item-1] = data[i]; + indices[item-1] = GET_INDEX(i); + } + // The last cumsum element of each line of "nonzero" goes to inptr[i] + if ((i+1) % IMAGE_WIDTH == 0) { + indptr[(i/IMAGE_WIDTH)+1] = item; + } + """, + options=["-DIMAGE_WIDTH=%d" % self.shape[1]], + preamble="#define GET_INDEX(i) (i % IMAGE_WIDTH)", + ) + + + def _setup_decompaction_kernel(self): + OpenclProcessing.compile_kernels( + self, + self.kernel_files, + compile_options=[ + "-DIMAGE_WIDTH=%d" % self.shape[1], + "-DDTYPE=%s" % self.c_dtype, + "-DIDX_DTYPE=%s" % self.idx_c_dtype, + ] + ) + device = self.ctx.devices[0] + wg_x = min( + device.max_work_group_size, + 32, + self.kernels.max_workgroup_size("densify_csr") + ) + self._decomp_wg = (wg_x, 1) + self._decomp_grid = (self._decomp_wg[0], self.shape[0]) + + + # -------------------------------------------------------------------------- + # -------------------------- Array utils ----------------------------------- + # -------------------------------------------------------------------------- + + # TODO handle pyopencl Buffer + def check_array(self, arr): + """ + Check that provided array is compatible with current context. + + :param arr: numpy.ndarray or pyopencl.array.Array + 2D array in dense format. + """ + assert arr.size == self.size + assert arr.dtype == self.dtype + + + # TODO handle pyopencl Buffer + def check_sparse_arrays(self, csr_data): + """ + Check that the provided sparse arrays are compatible with the current + context. + + :param arrays: namedtuple CSRData. 
+ It contains the arrays "data", "indices", "indptr" + """ + assert isinstance(csr_data, CSRData) + for arr in [csr_data.data, csr_data.indices, csr_data.indptr]: + assert arr.ndim == 1 + assert csr_data.data.size <= self.max_nnz + assert csr_data.indices.size <= self.max_nnz + assert csr_data.indptr.size == self.shape[0]+1 + assert csr_data.data.dtype == self.dtype + assert csr_data.indices.dtype == self.indice_dtype + assert csr_data.indptr.dtype == self.indice_dtype + + + def set_array(self, arr): + """ + Set the provided array as the current context 2D matrix. + + :param arr: numpy.ndarray or pyopencl.array.Array + 2D array in dense format. + """ + if arr is None: + return + self.check_array(arr) + # GenericScanKernel only supports 1D data + if isinstance(arr, parray.Array): + self._old_array = self.array + self.array = arr + elif isinstance(arr, numpy.ndarray): + self.array[:] = arr.ravel()[:] + else: + raise ValueError("Expected pyopencl array or numpy array") + + + def set_sparse_arrays(self, csr_data): + if csr_data is None: + return + self.check_sparse_arrays(csr_data) + for name, arr in {"data": csr_data.data, "indices": csr_data.indices, "indptr": csr_data.indptr}.items(): + # The current array is a device array. Don't copy, use it directly + if isinstance(arr, parray.Array): + setattr(self, "_old_" + name, getattr(self, name)) + setattr(self, name, arr) + # The current array is a numpy.ndarray: copy H2D + elif isinstance(arr, numpy.ndarray): + getattr(self, name)[:arr.size] = arr[:] + else: + raise ValueError("Unsupported array type: %s" % type(arr)) + + + def _recover_arrays_references(self): + """ + Recover the previous arrays references, and return the references of the + "current" arrays. 
+ """ + array = self.array + data = self.data + indices = self.indices + indptr = self.indptr + for name in ["array", "data", "indices", "indptr"]: + # self.X = self._old_X + setattr(self, name, getattr(self, "_old_" + name)) + return array, (data, indices, indptr) + + + def get_sparse_arrays(self, output): + """ + Get the 2D dense array of the current context. + + :param output: tuple or None + tuple in the form (data, indices, indptr). These arrays have to be + compatible with the current context (size and data type). + The content of these arrays will be overwritten with the result of + the previous computation. + """ + numels = self.max_nnz + if output is None: + data = self.data.get()[:numels] + ind = self.indices.get()[:numels] + indptr = self.indptr.get() + res = (data, ind, indptr) + else: + res = output + return res + + + def get_array(self, output): + if output is None: + res = self.array.get().reshape(self.shape) + else: + res = output + return res + + # -------------------------------------------------------------------------- + # -------------------------- Compaction ------------------------------------ + # -------------------------------------------------------------------------- + + def sparsify(self, arr, output=None): + """ + Convert an image (2D matrix) into a CSR representation. + + :param arr: numpy.ndarray or pyopencl.array.Array + Input array. + :param output: tuple of pyopencl.array.Array, optional + If provided, this must be a tuple of 3 arrays (data, indices, indptr). + The content of each array is overwritten by the computation result. 
+ """ + self.set_array(arr) + self.set_sparse_arrays(tuple_to_csrdata(output)) + evt = self.scan_kernel( + self.array, + self.data, + self.indices, + self.indptr, + ) + #~ evt.wait() + self.profile_add(evt, "sparsification kernel") + res = self.get_sparse_arrays(output) + self._recover_arrays_references() + return res + + # -------------------------------------------------------------------------- + # -------------------------- Decompaction ---------------------------------- + # -------------------------------------------------------------------------- + + def densify(self, data, indices, indptr, output=None): + self.set_sparse_arrays( + CSRData(data=data, indices=indices, indptr=indptr) + ) + self.set_array(output) + evt = self.kernels.densify_csr( + self.queue, + self._decomp_grid, + self._decomp_wg, + self.data.data, + self.indices.data, + self.indptr.data, + self.array.data, + numpy.int32(self.shape[0]), + ) + #~ evt.wait() + self.profile_add(evt, "desparsification kernel") + res = self.get_array(output) + self._recover_arrays_references() + return res + diff --git a/src/silx/opencl/statistics.py b/src/silx/opencl/statistics.py new file mode 100644 index 0000000..a96ee33 --- /dev/null +++ b/src/silx/opencl/statistics.py @@ -0,0 +1,242 @@ +# -*- coding: utf-8 -*- +# +# Project: SILX +# https://github.com/silx-kit/silx +# +# Copyright (C) 2012-2019 European Synchrotron Radiation Facility, Grenoble, France +# +# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# . 
+# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# . +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +"""A module for performing basic statistical analysis (min, max, mean, std) on +large data where numpy is not very efficient. +""" + +__author__ = "Jerome Kieffer" +__license__ = "MIT" +__date__ = "19/05/2021" +__copyright__ = "2012-2019, ESRF, Grenoble" +__contact__ = "jerome.kieffer@esrf.fr" + +import logging +import numpy +from collections import OrderedDict, namedtuple +from math import sqrt + +from .common import pyopencl +from .processing import EventDescription, OpenclProcessing, BufferDescription +from .utils import concatenate_cl_kernel + +if pyopencl: + mf = pyopencl.mem_flags + from pyopencl.reduction import ReductionKernel + try: + from pyopencl import cltypes + except ImportError: + v = pyopencl.array.vec() + float8 = v.float8 + else: + float8 = cltypes.float8 + +else: + raise ImportError("pyopencl is not installed") +logger = logging.getLogger(__name__) + +StatResults = namedtuple("StatResults", ["min", "max", "cnt", "sum", "mean", + "var", "std"]) +zero8 = "(float8)(FLT_MAX, -FLT_MAX, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f)" +# min max cnt cnt_e sum sum_e var var_e + + +class Statistics(OpenclProcessing): + """A class for doing statistical analysis using OpenCL + + :param List[int] size: Shape of input data to treat + :param numpy.dtype dtype: Input data type + :param numpy.ndarray template: Data template to extract size & dtype + :param ctx: 
Actual working context, left to None for automatic + initialization from device type or platformid/deviceid + :param str devicetype: Type of device, can be "CPU", "GPU", "ACC" or "ALL" + :param int platformid: Platform identifier as given by clinfo + :param int deviceid: Device identifier as given by clinfo + :param int block_size: + Preferred workgroup size, may vary depending on the outcome of the compilation + :param bool profile: + Switch on profiling to be able to profile at the kernel level, + store profiling elements (makes code slightly slower) + """ + buffers = [ + BufferDescription("raw", 1, numpy.float32, mf.READ_ONLY), + BufferDescription("converted", 1, numpy.float32, mf.READ_WRITE), + ] + kernel_files = ["preprocess.cl"] + mapping = {numpy.int8: "s8_to_float", + numpy.uint8: "u8_to_float", + numpy.int16: "s16_to_float", + numpy.uint16: "u16_to_float", + numpy.uint32: "u32_to_float", + numpy.int32: "s32_to_float"} + + def __init__(self, size=None, dtype=None, template=None, + ctx=None, devicetype="all", platformid=None, deviceid=None, + block_size=None, profile=False + ): + OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype, + platformid=platformid, deviceid=deviceid, + block_size=block_size, profile=profile) + self.size = size + self.dtype = dtype + if template is not None: + self.size = template.size + self.dtype = template.dtype + + self.buffers = [BufferDescription(i.name, i.size * self.size, i.dtype, i.flags) + for i in self.__class__.buffers] + + self.allocate_buffers(use_array=True) + self.compile_kernels() + self.set_kernel_arguments() + + def set_kernel_arguments(self): + """Parametrize all kernel arguments""" + for val in self.mapping.values(): + self.cl_kernel_args[val] = OrderedDict(((i, self.cl_mem[i]) for i in ("raw", "converted"))) + + def compile_kernels(self): + """Compile the kernel""" + OpenclProcessing.compile_kernels(self, + self.kernel_files, + "-D NIMAGE=%i" % self.size) + compiler_options = 
self.get_compiler_options(x87_volatile=True) + src = concatenate_cl_kernel(("doubleword.cl", "statistics.cl")) + self.reduction_comp = ReductionKernel(self.ctx, + dtype_out=float8, + neutral=zero8, + map_expr="map_statistics(data, i)", + reduce_expr="reduce_statistics(a,b)", + arguments="__global float *data", + preamble=src, + options=compiler_options) + self.reduction_simple = ReductionKernel(self.ctx, + dtype_out=float8, + neutral=zero8, + map_expr="map_statistics(data, i)", + reduce_expr="reduce_statistics_simple(a,b)", + arguments="__global float *data", + preamble=src, + options=compiler_options) + + if "cl_khr_fp64" in self.device.extensions: + self.reduction_double = ReductionKernel(self.ctx, + dtype_out=float8, + neutral=zero8, + map_expr="map_statistics(data, i)", + reduce_expr="reduce_statistics_double(a,b)", + arguments="__global float *data", + preamble=src, + options=compiler_options) + else: + logger.info("Device %s does not support double-precision arithmetics, fall-back on compensated one", self.device) + self.reduction_double = self.reduction_comp + + def send_buffer(self, data, dest): + """ + Send a numpy array to the device, including the cast on the device if + possible + + :param numpy.ndarray data: numpy array with data + :param dest: name of the buffer as registered in the class + """ + logger.info("send data to %s", dest) + dest_type = numpy.dtype([i.dtype for i in self.buffers if i.name == dest][0]) + events = [] + if (data.dtype == dest_type) or (data.dtype.itemsize > dest_type.itemsize): + copy_image = pyopencl.enqueue_copy(self.queue, + self.cl_mem[dest].data, + numpy.ascontiguousarray(data, dest_type)) + events.append(EventDescription("copy H->D %s" % dest, copy_image)) + else: + copy_image = pyopencl.enqueue_copy(self.queue, + self.cl_mem["raw"].data, + numpy.ascontiguousarray(data)) + kernel = getattr(self.program, self.mapping[data.dtype.type]) + cast_to_float = kernel(self.queue, + (self.size,), + None, + self.cl_mem["raw"].data, + 
self.cl_mem[dest].data) + events += [ + EventDescription("copy H->D raw", copy_image), + EventDescription(f"cast to float {dest}", cast_to_float) + ] + if self.profile: + self.events += events + return events + + def process(self, data, comp=True): + """Actually calculate the statics on the data + + :param numpy.ndarray data: numpy array with the image + :param comp: use Kahan compensated arithmetics for the calculation + :return: Statistics named tuple + :rtype: StatResults + """ + if data.ndim != 1: + data = data.ravel() + size = data.size + assert size <= self.size, "size is OK" + events = [] + if comp is True: + comp = "comp" + elif comp is False: + comp = "single" + else: + comp = comp.lower() + with self.sem: + self.send_buffer(data, "converted") + if comp in ("single", "fp32", "float32"): + reduction = self.reduction_simple + elif comp in ("double", "fp64", "float64"): + reduction = self.reduction_double + else: + reduction = self.reduction_comp + res_d, evt = reduction(self.cl_mem["converted"][:self.size], + queue=self.queue, + return_event=True) + events.append(EventDescription(f"statistical reduction {comp}", evt)) + if self.profile: + self.events += events + res_h = res_d.get() + min_ = 1.0 * res_h["s0"] + max_ = 1.0 * res_h["s1"] + count = 1.0 * res_h["s2"] + res_h["s3"] + sum_ = 1.0 * res_h["s4"] + res_h["s5"] + m2 = 1.0 * res_h["s6"] + res_h["s7"] + var = m2 / (count - 1.0) + res = StatResults(min_, + max_, + count, + sum_, + sum_ / count, + var, + sqrt(var)) + return res + + __call__ = process diff --git a/src/silx/opencl/test/__init__.py b/src/silx/opencl/test/__init__.py new file mode 100644 index 0000000..92cda4a --- /dev/null +++ b/src/silx/opencl/test/__init__.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- +# +# Project: silx +# https://github.com/silx-kit/silx +# +# Copyright (C) 2012-2016 European Synchrotron Radiation Facility, Grenoble, France +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software 
and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. diff --git a/src/silx/opencl/test/test_addition.py b/src/silx/opencl/test/test_addition.py new file mode 100644 index 0000000..3b668bf --- /dev/null +++ b/src/silx/opencl/test/test_addition.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Project: Sift implementation in Python + OpenCL +# https://github.com/silx-kit/silx +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +""" +Simple test of an addition +""" + +__authors__ = ["Henri Payno, Jérôme Kieffer"] +__contact__ = "jerome.kieffer@esrf.eu" +__license__ = "MIT" +__copyright__ = "2013 European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "30/11/2020" + +import logging +import numpy +import pytest + +import unittest +from ..common import ocl, _measure_workgroup_size, query_kernel_info +if ocl: + import pyopencl + import pyopencl.array +from ..utils import get_opencl_code +logger = logging.getLogger(__name__) + + +@unittest.skipUnless(ocl, "PyOpenCl is missing") +class TestAddition(unittest.TestCase): + + @classmethod + def setUpClass(cls): + super(TestAddition, cls).setUpClass() + if ocl: + cls.ctx = ocl.create_context() + if logger.getEffectiveLevel() <= logging.INFO: + cls.PROFILE = True + cls.queue = pyopencl.CommandQueue( + cls.ctx, + properties=pyopencl.command_queue_properties.PROFILING_ENABLE) + else: + cls.PROFILE = False + cls.queue = pyopencl.CommandQueue(cls.ctx) + cls.max_valid_wg = 0 + + @classmethod + def tearDownClass(cls): + super(TestAddition, cls).tearDownClass() + print("Maximum valid workgroup size %s on device %s" % (cls.max_valid_wg, cls.ctx.devices[0])) + cls.ctx = None + cls.queue = None + + def setUp(self): + if ocl is None: + return + self.shape = 4096 + self.data = numpy.random.random(self.shape).astype(numpy.float32) + self.d_array_img = pyopencl.array.to_device(self.queue, self.data) + self.d_array_5 = pyopencl.array.empty_like(self.d_array_img) + 
self.d_array_5.fill(-5) + self.program = pyopencl.Program(self.ctx, get_opencl_code("addition")).build() + + def tearDown(self): + self.img = self.data = None + self.d_array_img = self.d_array_5 = self.program = None + + def test_add(self): + """ + tests the addition kernel + """ + maxi = int(round(numpy.log2(self.shape))) + for i in range(maxi): + d_array_result = pyopencl.array.empty_like(self.d_array_img) + wg = 1 << i + try: + evt = self.program.addition(self.queue, (self.shape,), (wg,), + self.d_array_img.data, self.d_array_5.data, d_array_result.data, numpy.int32(self.shape)) + evt.wait() + except Exception as error: + max_valid_wg = self.program.addition.get_work_group_info(pyopencl.kernel_work_group_info.WORK_GROUP_SIZE, self.ctx.devices[0]) + msg = "Error %s on WG=%s: %s" % (error, wg, max_valid_wg) + self.assertLess(max_valid_wg, wg, msg) + break + else: + res = d_array_result.get() + good = numpy.allclose(res, self.data - 5) + if good and wg > self.max_valid_wg: + self.__class__.max_valid_wg = wg + self.assertTrue(good, "calculation is correct for WG=%s" % wg) + + def test_measurement(self): + """ + tests that all devices are working properly ... lengthy and error prone + """ + for platform in ocl.platforms: + for did, device in enumerate(platform.devices): + meas = _measure_workgroup_size((platform.id, device.id)) + self.assertEqual(meas, device.max_work_group_size, + "Workgroup size for %s/%s: %s == %s" % (platform, device, meas, device.max_work_group_size)) + + def test_query(self): + """ + tests that all devices are working properly ... lengthy and error prone + """ + for what in ("COMPILE_WORK_GROUP_SIZE", + "LOCAL_MEM_SIZE", + "PREFERRED_WORK_GROUP_SIZE_MULTIPLE", + "PRIVATE_MEM_SIZE", + "WORK_GROUP_SIZE"): + logger.info("%s: %s", what, query_kernel_info(program=self.program, kernel="addition", what=what)) + + # Not all ICD work properly .... 
+ #self.assertEqual(3, len(query_kernel_info(program=self.program, kernel="addition", what="COMPILE_WORK_GROUP_SIZE")), "3D kernel") + + min_wg = query_kernel_info(program=self.program, kernel="addition", what="PREFERRED_WORK_GROUP_SIZE_MULTIPLE") + max_wg = query_kernel_info(program=self.program, kernel="addition", what="WORK_GROUP_SIZE") + self.assertEqual(max_wg % min_wg, 0, msg="max_wg is a multiple of min_wg") diff --git a/src/silx/opencl/test/test_array_utils.py b/src/silx/opencl/test/test_array_utils.py new file mode 100644 index 0000000..325a6c3 --- /dev/null +++ b/src/silx/opencl/test/test_array_utils.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python +# coding: utf-8 +# /*########################################################################## +# +# Copyright (c) 2016 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+# +# ###########################################################################*/ +"""Test of the OpenCL array_utils""" + +from __future__ import division, print_function + +__authors__ = ["Pierre paleo"] +__license__ = "MIT" +__copyright__ = "2013-2017 European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "14/06/2017" + + +import time +import logging +import numpy as np +import unittest +try: + import mako +except ImportError: + mako = None +from ..common import ocl +if ocl: + import pyopencl as cl + import pyopencl.array as parray + from .. import linalg +from ..utils import get_opencl_code +from silx.test.utils import utilstest + +logger = logging.getLogger(__name__) +try: + from scipy.ndimage.filters import laplace + _has_scipy = True +except ImportError: + _has_scipy = False + + + +@unittest.skipUnless(ocl and mako, "PyOpenCl is missing") +class TestCpy2d(unittest.TestCase): + + def setUp(self): + if ocl is None: + return + self.ctx = ocl.create_context() + if logger.getEffectiveLevel() <= logging.INFO: + self.PROFILE = True + self.queue = cl.CommandQueue( + self.ctx, + properties=cl.command_queue_properties.PROFILING_ENABLE) + else: + self.PROFILE = False + self.queue = cl.CommandQueue(self.ctx) + self.allocate_arrays() + self.program = cl.Program(self.ctx, get_opencl_code("array_utils")).build() + + def allocate_arrays(self): + """ + Allocate various types of arrays for the tests + """ + self.prng_state = np.random.get_state() + # Generate arrays of random shape + self.shape1 = np.random.randint(20, high=512, size=(2,)) + self.shape2 = np.random.randint(20, high=512, size=(2,)) + self.array1 = np.random.rand(*self.shape1).astype(np.float32) + self.array2 = np.random.rand(*self.shape2).astype(np.float32) + self.d_array1 = parray.to_device(self.queue, self.array1) + self.d_array2 = parray.to_device(self.queue, self.array2) + # Generate random offsets + offset1_y = np.random.randint(2, high=min(self.shape1[0], self.shape2[0]) - 10) + 
offset1_x = np.random.randint(2, high=min(self.shape1[1], self.shape2[1]) - 10) + offset2_y = np.random.randint(2, high=min(self.shape1[0], self.shape2[0]) - 10) + offset2_x = np.random.randint(2, high=min(self.shape1[1], self.shape2[1]) - 10) + self.offset1 = (offset1_y, offset1_x) + self.offset2 = (offset2_y, offset2_x) + # Compute the size of the rectangle to transfer + size_y = np.random.randint(2, high=min(self.shape1[0], self.shape2[0]) - max(offset1_y, offset2_y) + 1) + size_x = np.random.randint(2, high=min(self.shape1[1], self.shape2[1]) - max(offset1_x, offset2_x) + 1) + self.transfer_shape = (size_y, size_x) + + def tearDown(self): + self.array1 = None + self.array2 = None + self.d_array1.data.release() + self.d_array2.data.release() + self.d_array1 = None + self.d_array2 = None + self.ctx = None + self.queue = None + + def compare(self, result, reference): + errmax = np.max(np.abs(result - reference)) + logger.info("Max error = %e" % (errmax)) + self.assertTrue(errmax == 0, str("Max error is too high"))#. 
PRNG state was %s" % str(self.prng_state))) + + @unittest.skipUnless(ocl and mako, "pyopencl is missing") + def test_cpy2d(self): + """ + Test rectangular transfer of self.d_array1 to self.d_array2 + """ + # Reference + o1 = self.offset1 + o2 = self.offset2 + T = self.transfer_shape + logger.info("""Testing D->D rectangular copy with (N1_y, N1_x) = %s, + (N2_y, N2_x) = %s: + array2[%d:%d, %d:%d] = array1[%d:%d, %d:%d]""" % + ( + str(self.shape1), str(self.shape2), + o2[0], o2[0] + T[0], + o2[1], o2[1] + T[1], + o1[0], o1[0] + T[0], + o1[1], o1[1] + T[1] + ) + ) + self.array2[o2[0]:o2[0] + T[0], o2[1]:o2[1] + T[1]] = self.array1[o1[0]:o1[0] + T[0], o1[1]:o1[1] + T[1]] + kernel_args = ( + self.d_array2.data, + self.d_array1.data, + np.int32(self.shape2[1]), + np.int32(self.shape1[1]), + np.int32(self.offset2[::-1]), + np.int32(self.offset1[::-1]), + np.int32(self.transfer_shape[::-1]) + ) + wg = None + ndrange = self.transfer_shape[::-1] + self.program.cpy2d(self.queue, ndrange, wg, *kernel_args) + res = self.d_array2.get() + self.compare(res, self.array2) diff --git a/src/silx/opencl/test/test_backprojection.py b/src/silx/opencl/test/test_backprojection.py new file mode 100644 index 0000000..96d56fa --- /dev/null +++ b/src/silx/opencl/test/test_backprojection.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python +# coding: utf-8 +# /*########################################################################## +# +# Copyright (c) 2016 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice 
shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# ###########################################################################*/ +"""Test of the filtered backprojection module""" + +from __future__ import division, print_function + +__authors__ = ["Pierre paleo"] +__license__ = "MIT" +__copyright__ = "2013-2017 European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "19/01/2018" + + +import time +import logging +import numpy as np +import unittest +from math import pi +try: + import mako +except ImportError: + mako = None +from ..common import ocl +if ocl: + from .. import backprojection + from ...image.tomography import compute_fourier_filter +from silx.test.utils import utilstest + +logger = logging.getLogger(__name__) + + +def generate_coords(img_shp, center=None): + """ + Return two 2D arrays containing the indexes of an image. + The zero is at the center of the image. + """ + l_r, l_c = float(img_shp[0]), float(img_shp[1]) + R, C = np.mgrid[:l_r, :l_c] + if center is None: + center0, center1 = l_r / 2., l_c / 2. + else: + center0, center1 = center + R = R + 0.5 - center0 + C = C + 0.5 - center1 + return R, C + + +def clip_circle(img, center=None, radius=None): + """ + Puts zeros outside the inscribed circle of the image support. + """ + R, C = generate_coords(img.shape, center) + M = R * R + C * C + res = np.zeros_like(img) + if radius is None: + radius = img.shape[0] / 2. 
- 1 + mask = M < radius * radius + res[mask] = img[mask] + return res + + +@unittest.skipUnless(ocl and mako, "PyOpenCl is missing") +class TestFBP(unittest.TestCase): + + def setUp(self): + if ocl is None: + return + self.getfiles() + self.fbp = backprojection.Backprojection(self.sino.shape, profile=True) + if self.fbp.compiletime_workgroup_size < 16 * 16: + self.skipTest("Current implementation of OpenCL backprojection is " + "not supported on this platform yet") + # Astra does not use the same backprojector implementation. + # Therefore, we cannot expect results to be the "same" (up to float32 + # numerical error) + self.tol = 5e-2 + if not(self.fbp._use_textures) or self.fbp.device.type == "CPU": + # Precision is less when using CPU + # (either CPU textures or "manual" linear interpolation) + self.tol *= 2 + + def tearDown(self): + self.sino = None + # self.fbp.log_profile() + self.fbp = None + + def getfiles(self): + # load sinogram of 512x512 MRI phantom + self.sino = np.load(utilstest.getfile("sino500.npz"))["data"] + # load reconstruction made with ASTRA FBP (with filter designed in spatial domain) + self.reference_rec = np.load(utilstest.getfile("rec_astra_500.npz"))["data"] + + def measure(self): + "Common measurement of timings" + t1 = time.time() + try: + result = self.fbp.filtered_backprojection(self.sino) + except RuntimeError as msg: + logger.error(msg) + return + t2 = time.time() + return t2 - t1, result + + def compare(self, res): + """ + Compare a result with the reference reconstruction. 
+ Only the valid reconstruction zone (inscribed circle) is taken into + account + """ + res_clipped = clip_circle(res) + ref_clipped = clip_circle(self.reference_rec) + delta = abs(res_clipped - ref_clipped) + bad = delta > 1 + logger.debug("Absolute difference: %s with %s outlier pixels out of %s" + "", delta.max(), bad.sum(), np.prod(bad.shape)) + return delta.max() + + @unittest.skipUnless(ocl and mako, "pyopencl is missing") + def test_fbp(self): + """ + tests FBP + """ + # Test single reconstruction + # -------------------------- + t, res = self.measure() + if t is None: + logger.info("test_fp: skipped") + else: + logger.info("test_backproj: time = %.3fs" % t) + err = self.compare(res) + msg = str("Max error = %e" % err) + logger.info(msg) + self.assertTrue(err < self.tol, "Max error is too high") + + # Test multiple reconstructions + # ----------------------------- + res0 = np.copy(res) + for i in range(10): + res = self.fbp.filtered_backprojection(self.sino) + errmax = np.max(np.abs(res - res0)) + self.assertTrue(errmax < 1.e-6, "Max error is too high") + + @unittest.skipUnless(ocl and mako, "pyopencl is missing") + def test_fbp_filters(self): + """ + Test the different available filters of silx FBP. + """ + avail_filters = [ + "ramlak", "shepp-logan", "cosine", "hamming", + "hann" + ] + # Create a Dirac delta function at a single angle view. + # As the filters are radially invariant: + # - backprojection yields an image where each line is a Dirac. + # - FBP yields an image where each line is the spatial filter + # One can simply filter "dirac" without backprojecting it, but this + # test will also ensure that backprojection behaves well.
+ dirac = np.zeros_like(self.sino) + na, dw = dirac.shape + dirac[0, dw//2] = na / pi * 2 + + for filter_name in avail_filters: + B = backprojection.Backprojection(dirac.shape, filter_name=filter_name) + r = B(dirac) + # Check that radial invariance is kept + std0 = np.max(np.abs(np.std(r, axis=0))) + self.assertTrue( + std0 < 5.e-6, + "Something wrong with FBP(filter=%s)" % filter_name + ) + # Check that the filter is retrieved + r_f = np.fft.fft(np.fft.fftshift(r[0])).real / 2. # filter factor + ref_filter_f = compute_fourier_filter(dw, filter_name) + errmax = np.max(np.abs(r_f - ref_filter_f)) + logger.info("FBP filter %s: max error=%e" % (filter_name, errmax)) + self.assertTrue( + errmax < 1.e-3, + "Something wrong with FBP(filter=%s)" % filter_name + ) + + @unittest.skipUnless(ocl and mako, "pyopencl is missing") + def test_fbp_oddsize(self): + # Generate a 513-sinogram. + # The padded width will be nextpow(513*2). + # silx [0.10, 0.10.1] will give 1029, which makes R2C transform fail. + sino = np.pad(self.sino, ((0, 0), (1, 0)), mode='edge') + B = backprojection.Backprojection(sino.shape, axis_position=self.fbp.axis_pos+1) + res = B(sino) + # Compare with self.reference_rec. Tolerance is high as backprojector + # is not fully shift-invariant. 
+ errmax = np.max(np.abs(clip_circle(res[1:, 1:] - self.reference_rec))) + self.assertLess( + errmax, 1.e-1, + "Something wrong with FBP on odd-sized sinogram" + ) diff --git a/src/silx/opencl/test/test_convolution.py b/src/silx/opencl/test/test_convolution.py new file mode 100644 index 0000000..6a2759d --- /dev/null +++ b/src/silx/opencl/test/test_convolution.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python +# coding: utf-8 +# /*########################################################################## +# +# Copyright (c) 2019 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# ###########################################################################*/ + +""" +Test of the Convolution class. 
+""" + +from __future__ import division, print_function + +__authors__ = ["Pierre Paleo"] +__contact__ = "pierre.paleo@esrf.fr" +__license__ = "MIT" +__copyright__ = "2019 European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "01/08/2019" + +import pytest +import logging +from itertools import product +import numpy as np +from silx.image.utils import gaussian_kernel + +try: + from scipy.ndimage import convolve, convolve1d + from scipy.misc import ascent + + scipy_convolve = convolve + scipy_convolve1d = convolve1d +except ImportError: + scipy_convolve = None +import unittest +from ..common import ocl, check_textures_availability + +if ocl: + import pyopencl as cl + import pyopencl.array as parray + from silx.opencl.convolution import Convolution +logger = logging.getLogger(__name__) + + +class ConvolutionData: + + def __init__(self, param): + self.param = param + self.mode = param["boundary_handling"] + logger.debug( + """ + Testing convolution with boundary_handling=%s, + use_textures=%s, input_device=%s, output_device=%s + """ + % ( + self.mode, + param["use_textures"], + param["input_on_device"], + param["output_on_device"], + ) + ) + + @classmethod + def setUpClass(cls): + cls.image = np.ascontiguousarray(ascent()[:, :511], dtype="f") + cls.data1d = cls.image[0] + cls.data2d = cls.image + cls.data3d = np.tile(cls.image[224:-224, 224:-224], (62, 1, 1)) + cls.kernel1d = gaussian_kernel(1.0) + cls.kernel2d = np.outer(cls.kernel1d, cls.kernel1d) + cls.kernel3d = np.multiply.outer(cls.kernel2d, cls.kernel1d) + cls.ctx = ocl.create_context() + cls.tol = { + "1D": 1e-4, + "2D": 1e-3, + "3D": 1e-3, + } + + @classmethod + def tearDownClass(cls): + cls.data1d = cls.data2d = cls.data3d = cls.image = None + cls.kernel1d = cls.kernel2d = cls.kernel3d = None + + @staticmethod + def compare(arr1, arr2): + return np.max(np.abs(arr1 - arr2)) + + @staticmethod + def print_err(conv): + errmsg = str( + """ + Something wrong with %s + mode=%s, texture=%s + """ + % 
(conv.use_case_desc, conv.mode, conv.use_textures) + ) + return errmsg + + def instantiate_convol(self, shape, kernel, axes=None): + if self.mode == "constant": + if not (self.param["use_textures"]) or ( + self.param["use_textures"] + and not (check_textures_availability(self.ctx)) + ): + pytest.skip("mode=constant not implemented without textures") + C = Convolution( + shape, + kernel, + mode=self.mode, + ctx=self.ctx, + axes=axes, + extra_options={"dont_use_textures": not (self.param["use_textures"])}, + ) + return C + + def get_data_and_kernel(self, test_name): + dims = { + "test_1D": (1, 1), + "test_separable_2D": (2, 1), + "test_separable_3D": (3, 1), + "test_nonseparable_2D": (2, 2), + "test_nonseparable_3D": (3, 3), + } + dim_data = {1: self.data1d, 2: self.data2d, 3: self.data3d} + dim_kernel = { + 1: self.kernel1d, + 2: self.kernel2d, + 3: self.kernel3d, + } + dd, kd = dims[test_name] + return dim_data[dd], dim_kernel[kd] + + def get_reference_function(self, test_name): + ref_func = { + "test_1D": lambda x, y: scipy_convolve1d(x, y, mode=self.mode), + "test_separable_2D": lambda x, y: scipy_convolve1d( + scipy_convolve1d(x, y, mode=self.mode, axis=1), + y, + mode=self.mode, + axis=0, + ), + "test_separable_3D": lambda x, y: scipy_convolve1d( + scipy_convolve1d( + scipy_convolve1d(x, y, mode=self.mode, axis=2), + y, + mode=self.mode, + axis=1, + ), + y, + mode=self.mode, + axis=0, + ), + "test_nonseparable_2D": lambda x, y: scipy_convolve(x, y, mode=self.mode), + "test_nonseparable_3D": lambda x, y: scipy_convolve(x, y, mode=self.mode), + } + return ref_func[test_name] + + def template_test(self, test_name): + data, kernel = self.get_data_and_kernel(test_name) + conv = self.instantiate_convol(data.shape, kernel) + if self.param["input_on_device"]: + data_ref = parray.to_device(conv.queue, data) + else: + data_ref = data + if self.param["output_on_device"]: + d_res = parray.empty_like(conv.data_out) + d_res.fill(0) + res = d_res + else: + res = None + res = 
conv(data_ref, output=res) + if self.param["output_on_device"]: + res = res.get() + ref_func = self.get_reference_function(test_name) + ref = ref_func(data, kernel) + metric = self.compare(res, ref) + logger.info("%s: max error = %.2e" % (test_name, metric)) + tol = self.tol[str("%dD" % kernel.ndim)] + assert metric < tol, self.print_err(conv) + + +def convolution_data_params(): + boundary_handlings = ["reflect", "nearest", "wrap", "constant"] + use_textures = [True, False] + input_on_devices = [True, False] + output_on_devices = [True, False] + param_vals = list( + product(boundary_handlings, use_textures, input_on_devices, output_on_devices) + ) + params = [] + for boundary_handling, use_texture, input_dev, output_dev in param_vals: + param={ + "boundary_handling": boundary_handling, + "input_on_device": input_dev, + "output_on_device": output_dev, + "use_textures": use_texture, + } + params.append(param) + + return params + + +@pytest.fixture(scope="module", params=convolution_data_params()) +def convolution_data(request): + """Provide a set of convolution data + + The module scope allows to test each function during a single setup of each + convolution data + """ + cdata = None + try: + cdata = ConvolutionData(request.param) + cdata.setUpClass() + yield cdata + finally: + cdata.tearDownClass() + + +@pytest.mark.skipif(ocl is None, reason="OpenCL is missing") +@pytest.mark.skipif(scipy_convolve is None, reason="scipy is missing") +def test_1D(convolution_data): + convolution_data.template_test("test_1D") + +@pytest.mark.skipif(ocl is None, reason="OpenCL is missing") +@pytest.mark.skipif(scipy_convolve is None, reason="scipy is missing") +def test_separable_2D(convolution_data): + convolution_data.template_test("test_separable_2D") + +@pytest.mark.skipif(ocl is None, reason="OpenCL is missing") +@pytest.mark.skipif(scipy_convolve is None, reason="scipy is missing") +def test_separable_3D(convolution_data): + convolution_data.template_test("test_separable_3D") + 
+@pytest.mark.skipif(ocl is None, reason="OpenCL is missing") +@pytest.mark.skipif(scipy_convolve is None, reason="scipy is missing") +def test_nonseparable_2D(convolution_data): + convolution_data.template_test("test_nonseparable_2D") + +@pytest.mark.skipif(ocl is None, reason="OpenCL is missing") +@pytest.mark.skipif(scipy_convolve is None, reason="scipy is missing") +def test_nonseparable_3D(convolution_data): + convolution_data.template_test("test_nonseparable_3D") + +@pytest.mark.skipif(ocl is None, reason="OpenCL is missing") +@pytest.mark.skipif(scipy_convolve is None, reason="scipy is missing") +def test_batched_2D(convolution_data): + """ + Test batched (nonseparable) 2D convolution on 3D data. + In this test: batch along "z" (axis 0) + """ + data = convolution_data.data3d + kernel = convolution_data.kernel2d + conv = convolution_data.instantiate_convol(data.shape, kernel, axes=(0,)) + res = conv(data) # 3D + ref = scipy_convolve(data[0], kernel, mode=convolution_data.mode) # 2D + + std = np.std(res, axis=0) + std_max = np.max(np.abs(std)) + assert std_max < convolution_data.tol["2D"], convolution_data.print_err(conv) + metric = convolution_data.compare(res[0], ref) + logger.info("test_nonseparable_3D: max error = %.2e" % metric) + assert metric < convolution_data.tol["2D"], convolution_data.print_err(conv) diff --git a/src/silx/opencl/test/test_doubleword.py b/src/silx/opencl/test/test_doubleword.py new file mode 100644 index 0000000..a33cf5a --- /dev/null +++ b/src/silx/opencl/test/test_doubleword.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python +# coding: utf-8 +# +# Project: The silx project +# https://github.com/silx-kit/silx +# +# Copyright (C) 2021-2021 European Synchrotron Radiation Facility, Grenoble, France +# +# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software 
# without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

"test suite for OpenCL code"

__author__ = "Jérôme Kieffer"
__contact__ = "Jerome.Kieffer@ESRF.eu"
__license__ = "MIT"
__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
__date__ = "31/05/2021"

import unittest
import numpy
import logging
import platform

logger = logging.getLogger(__name__)
try:
    import pyopencl
except ImportError as error:
    logger.warning("OpenCL module (pyopencl) is not present, skip tests. %s.", error)
    pyopencl = None

from .. import ocl
if ocl is not None:
    from ..utils import read_cl_file
    from .. import pyopencl
    import pyopencl.array
    from pyopencl.elementwise import ElementwiseKernel

# Machine epsilon of single and double precision, used as error thresholds below
EPS32 = numpy.finfo("float32").eps
EPS64 = numpy.finfo("float64").eps


@unittest.skipUnless(ocl, "PyOpenCl is missing")
class TestDoubleWord(unittest.TestCase):
    """
    Test the kernels for compensated math in OpenCL

    Each double-word value is represented as a (high, low) pair of float32
    whose exact sum approximates a float64.  The tests check that the high
    part alone matches plain float32 arithmetic while high+low is (nearly)
    exact compared with a float64 reference.
    """

    @classmethod
    def setUpClass(cls):
        if pyopencl is None or ocl is None:
            raise unittest.SkipTest("OpenCL module (pyopencl) is not present or no device available")

        cls.ctx = ocl.create_context(devicetype="GPU")
        cls.queue = pyopencl.CommandQueue(cls.ctx, properties=pyopencl.command_queue_properties.PROFILING_ENABLE)

        # this is running 32 bits OpenCL with POCL
        if (platform.machine() in ("i386", "i686", "x86_64") and (tuple.__itemsize__ == 4) and
            cls.ctx.devices[0].platform.name == 'Portable Computing Language'):
            cls.args = "-DX87_VOLATILE=volatile"
        else:
            cls.args = ""
        size = 1024
        # Random operands in [1, 2): cls.ah/cls.bh are the float32 "high"
        # parts, cls.al/cls.bl the residual "low" parts (a - ah, b - bh).
        cls.a = 1.0 + numpy.random.random(size)
        cls.b = 1.0 + numpy.random.random(size)
        cls.ah = cls.a.astype(numpy.float32)
        cls.bh = cls.b.astype(numpy.float32)
        cls.al = (cls.a - cls.ah).astype(numpy.float32)
        cls.bl = (cls.b - cls.bh).astype(numpy.float32)
        # OpenCL source with the double-word arithmetic primitives under test
        cls.doubleword = read_cl_file("doubleword.cl")

    @classmethod
    def tearDownClass(cls):
        cls.queue = None
        cls.ctx = None
        cls.a = cls.al = cls.ah = None
        cls.b = cls.bl = cls.bh = None
        cls.doubleword = None

    def test_fast_sum2(self):
        """fast_fp_plus_fp (branch-free 2Sum): high+low must be exact."""
        test_kernel = ElementwiseKernel(self.ctx,
                                        "float *a, float *b, float *res_h, float *res_l",
                                        "float2 tmp = fast_fp_plus_fp(a[i], b[i]); res_h[i] = tmp.s0; res_l[i] = tmp.s1",
                                        preamble=self.doubleword)
        a_g = pyopencl.array.to_device(self.queue, self.ah)
        b_g = pyopencl.array.to_device(self.queue, self.bl)
        res_l = pyopencl.array.empty_like(a_g)
        res_h = pyopencl.array.empty_like(a_g)
        test_kernel(a_g, b_g, res_h, res_l)
        self.assertEqual(abs(self.ah + self.bl - res_h.get()).max(), 0, "Major matches")
        self.assertGreater(abs(self.ah.astype(numpy.float64) + self.bl - res_h.get()).max(), 0, "Exact mismatches")
        self.assertEqual(abs(self.ah.astype(numpy.float64) + self.bl - (res_h.get().astype(numpy.float64) + res_l.get())).max(), 0, "Exact matches")

    def test_sum2(self):
        """fp_plus_fp (general 2Sum): high+low must be exact."""
        test_kernel = ElementwiseKernel(self.ctx,
                                        "float *a, float *b, float *res_h, float *res_l",
                                        "float2 tmp = fp_plus_fp(a[i],b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
                                        preamble=self.doubleword)
        a_g = pyopencl.array.to_device(self.queue, self.ah)
        b_g = pyopencl.array.to_device(self.queue, self.bh)
        res_l = pyopencl.array.empty_like(a_g)
        res_h = pyopencl.array.empty_like(a_g)
        test_kernel(a_g, b_g, res_h, res_l)
        self.assertEqual(abs(self.ah + self.bh - res_h.get()).max(), 0, "Major matches")
        self.assertGreater(abs(self.ah.astype(numpy.float64) + self.bh - res_h.get()).max(), 0, "Exact mismatches")
        self.assertEqual(abs(self.ah.astype(numpy.float64) + self.bh - (res_h.get().astype(numpy.float64) + res_l.get())).max(), 0, "Exact matches")

    def test_prod2(self):
        """fp_times_fp (2Prod): high+low must equal the exact product."""
        test_kernel = ElementwiseKernel(self.ctx,
                                        "float *a, float *b, float *res_h, float *res_l",
                                        "float2 tmp = fp_times_fp(a[i],b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
                                        preamble=self.doubleword)
        a_g = pyopencl.array.to_device(self.queue, self.ah)
        b_g = pyopencl.array.to_device(self.queue, self.bh)
        res_l = pyopencl.array.empty_like(a_g)
        res_h = pyopencl.array.empty_like(a_g)
        test_kernel(a_g, b_g, res_h, res_l)
        res_m = res_h.get()
        res = res_h.get().astype(numpy.float64) + res_l.get()
        self.assertEqual(abs(self.ah * self.bh - res_m).max(), 0, "Major matches")
        self.assertGreater(abs(self.ah.astype(numpy.float64) * self.bh - res_m).max(), 0, "Exact mismatches")
        self.assertEqual(abs(self.ah.astype(numpy.float64) * self.bh - res).max(), 0, "Exact matches")

    def test_dw_plus_fp(self):
        """double-word + float32: error bounded by 2*EPS32^2."""
        test_kernel = ElementwiseKernel(self.ctx,
                                        "float *ah, float *al, float *b, float *res_h, float *res_l",
                                        "float2 tmp = dw_plus_fp((float2)(ah[i], al[i]),b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
                                        preamble=self.doubleword)
        ah_g = pyopencl.array.to_device(self.queue, self.ah)
        al_g = pyopencl.array.to_device(self.queue, self.al)
        b_g = pyopencl.array.to_device(self.queue, self.bh)
        res_l = pyopencl.array.empty_like(b_g)
        res_h = pyopencl.array.empty_like(b_g)
        test_kernel(ah_g, al_g, b_g, res_h, res_l)
        res_m = res_h.get()
        res = res_h.get().astype(numpy.float64) + res_l.get()
        self.assertLess(abs(self.a + self.bh - res_m).max(), EPS32, "Major matches")
        self.assertGreater(abs(self.a + self.bh - res_m).max(), EPS64, "Exact mismatches")
        self.assertLess(abs(self.ah.astype(numpy.float64) + self.al + self.bh - res).max(), 2 * EPS32 ** 2, "Exact matches")

    def test_dw_plus_dw(self):
        """double-word + double-word: error bounded by 3*EPS32^2."""
        test_kernel = ElementwiseKernel(self.ctx,
                                        "float *ah, float *al, float *bh, float *bl, float *res_h, float *res_l",
                                        "float2 tmp = dw_plus_dw((float2)(ah[i], al[i]),(float2)(bh[i], bl[i])); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
                                        preamble=self.doubleword)
        ah_g = pyopencl.array.to_device(self.queue, self.ah)
        al_g = pyopencl.array.to_device(self.queue, self.al)
        bh_g = pyopencl.array.to_device(self.queue, self.bh)
        bl_g = pyopencl.array.to_device(self.queue, self.bl)
        res_l = pyopencl.array.empty_like(bh_g)
        res_h = pyopencl.array.empty_like(bh_g)
        test_kernel(ah_g, al_g, bh_g, bl_g, res_h, res_l)
        res_m = res_h.get()
        res = res_h.get().astype(numpy.float64) + res_l.get()
        self.assertLess(abs(self.a + self.b - res_m).max(), EPS32, "Major matches")
        self.assertGreater(abs(self.a + self.b - res_m).max(), EPS64, "Exact mismatches")
        self.assertLess(abs(self.a + self.b - res).max(), 3 * EPS32 ** 2, "Exact matches")

    def test_dw_times_fp(self):
        """double-word * float32: error bounded by 2*EPS32^2."""
        test_kernel = ElementwiseKernel(self.ctx,
                                        "float *ah, float *al, float *b, float *res_h, float *res_l",
                                        "float2 tmp = dw_times_fp((float2)(ah[i], al[i]),b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
                                        preamble=self.doubleword)
        ah_g = pyopencl.array.to_device(self.queue, self.ah)
        al_g = pyopencl.array.to_device(self.queue, self.al)
        b_g = pyopencl.array.to_device(self.queue, self.bh)
        res_l = pyopencl.array.empty_like(b_g)
        res_h = pyopencl.array.empty_like(b_g)
        test_kernel(ah_g, al_g, b_g, res_h, res_l)
        res_m = res_h.get()
        res = res_h.get().astype(numpy.float64) + res_l.get()
        self.assertLess(abs(self.a * self.bh - res_m).max(), EPS32, "Major matches")
        self.assertGreater(abs(self.a * self.bh - res_m).max(), EPS64, "Exact mismatches")
        self.assertLess(abs(self.a * self.bh - res).max(), 2 * EPS32 ** 2, "Exact matches")

    def test_dw_times_dw(self):
        """double-word * double-word: error bounded by 5*EPS32^2."""
        test_kernel = ElementwiseKernel(self.ctx,
                                        "float *ah, float *al, float *bh, float *bl, float *res_h, float *res_l",
                                        "float2 tmp = dw_times_dw((float2)(ah[i], al[i]),(float2)(bh[i], bl[i])); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
                                        preamble=self.doubleword)
        ah_g = pyopencl.array.to_device(self.queue, self.ah)
        al_g = pyopencl.array.to_device(self.queue, self.al)
        bh_g = pyopencl.array.to_device(self.queue, self.bh)
        bl_g = pyopencl.array.to_device(self.queue, self.bl)
        res_l = pyopencl.array.empty_like(bh_g)
        res_h = pyopencl.array.empty_like(bh_g)
        test_kernel(ah_g, al_g, bh_g, bl_g, res_h, res_l)
        res_m = res_h.get()
        res = res_h.get().astype(numpy.float64) + res_l.get()
        self.assertLess(abs(self.a * self.b - res_m).max(), EPS32, "Major matches")
        self.assertGreater(abs(self.a * self.b - res_m).max(), EPS64, "Exact mismatches")
        self.assertLess(abs(self.a * self.b - res).max(), 5 * EPS32 ** 2, "Exact matches")

    def test_dw_div_fp(self):
        """double-word / float32: error bounded by 3*EPS32^2."""
        test_kernel = ElementwiseKernel(self.ctx,
                                        "float *ah, float *al, float *b, float *res_h, float *res_l",
                                        "float2 tmp = dw_div_fp((float2)(ah[i], al[i]),b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
                                        preamble=self.doubleword)
        ah_g = pyopencl.array.to_device(self.queue, self.ah)
        al_g = pyopencl.array.to_device(self.queue, self.al)
        b_g = pyopencl.array.to_device(self.queue, self.bh)
        res_l = pyopencl.array.empty_like(b_g)
        res_h = pyopencl.array.empty_like(b_g)
        test_kernel(ah_g, al_g, b_g, res_h, res_l)
        res_m = res_h.get()
        res = res_h.get().astype(numpy.float64) + res_l.get()
        self.assertLess(abs(self.a / self.bh - res_m).max(), EPS32, "Major matches")
        self.assertGreater(abs(self.a / self.bh - res_m).max(), EPS64, "Exact mismatches")
        self.assertLess(abs(self.a / self.bh - res).max(), 3 * EPS32 ** 2, "Exact matches")

    def test_dw_div_dw(self):
        """double-word / double-word: error bounded by 6*EPS32^2."""
        test_kernel = ElementwiseKernel(self.ctx,
                                        "float *ah, float *al, float *bh, float *bl, float *res_h, float *res_l",
                                        "float2 tmp = dw_div_dw((float2)(ah[i], al[i]),(float2)(bh[i], bl[i])); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
                                        preamble=self.doubleword)
        ah_g = pyopencl.array.to_device(self.queue, self.ah)
        al_g = pyopencl.array.to_device(self.queue, self.al)
        bh_g = pyopencl.array.to_device(self.queue, self.bh)
        bl_g = pyopencl.array.to_device(self.queue, self.bl)
        res_l = pyopencl.array.empty_like(bh_g)
        res_h = pyopencl.array.empty_like(bh_g)
        test_kernel(ah_g, al_g, bh_g, bl_g, res_h, res_l)
        res_m = res_h.get()
        res = res_h.get().astype(numpy.float64) + res_l.get()
        self.assertLess(abs(self.a / self.b - res_m).max(), EPS32, "Major matches")
        self.assertGreater(abs(self.a / self.b - res_m).max(), EPS64, "Exact mismatches")
        self.assertLess(abs(self.a / self.b - res).max(), 6 * EPS32 ** 2, "Exact matches")
diff --git a/src/silx/opencl/test/test_image.py b/src/silx/opencl/test/test_image.py new file mode 100644 index 0000000..73c771b --- /dev/null +++ b/src/silx/opencl/test/test_image.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Project: image manipulation in OpenCL +# https://github.com/silx-kit/silx +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +#
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.

"""
Simple test of image manipulation
"""

from __future__ import division, print_function

__authors__ = ["Jérôme Kieffer"]
__contact__ = "jerome.kieffer@esrf.eu"
__license__ = "MIT"
__copyright__ = "2017 European Synchrotron Radiation Facility, Grenoble, France"
__date__ = "13/02/2018"

import logging
import numpy

import unittest
from ..common import ocl, _measure_workgroup_size
if ocl:
    import pyopencl
    import pyopencl.array
from ...test.utils import utilstest
from ..image import ImageProcessing
logger = logging.getLogger(__name__)
try:
    from PIL import Image
except ImportError:
    Image = None


@unittest.skipUnless(ocl and Image, "PyOpenCl/Image is missing")
class TestImage(unittest.TestCase):
    """Tests of the ImageProcessing OpenCL helpers on a reference image."""

    @classmethod
    def setUpClass(cls):
        super(TestImage, cls).setUpClass()
        if ocl:
            cls.ctx = ocl.create_context()
            # Reference RGB test image fetched through the silx test-resource cache
            cls.lena = utilstest.getfile("lena.png")
            cls.data = numpy.asarray(Image.open(cls.lena))
            cls.ip = ImageProcessing(ctx=cls.ctx, template=cls.data, profile=True)

    @classmethod
    def tearDownClass(cls):
        super(TestImage, cls).tearDownClass()
        cls.ctx = None
        cls.lena = None
        cls.data = None
        # Dump the OpenCL profiling log when running verbosely
        if logger.level <= logging.INFO:
            logger.warning("\n".join(cls.ip.log_profile()))
        cls.ip = None

    def setUp(self):
        if ocl is None:
            return
        # Reload a fresh copy of the image for every test
        self.data = numpy.asarray(Image.open(self.lena))

    def tearDown(self):
        self.img = self.data = None

    @unittest.skipUnless(ocl, "pyopencl is missing")
    def test_cast(self):
        """
        tests the cast kernel
        """
        res = self.ip.to_float(self.data)
        self.assertEqual(res.shape, self.data.shape, "shape")
        self.assertEqual(res.dtype, numpy.float32, "dtype")
        # Integer pixel values are exactly representable in float32
        self.assertEqual(abs(res - self.data).max(), 0, "content")

    @unittest.skipUnless(ocl, "pyopencl is missing")
    def test_normalize(self):
        """
        tests that all devices are working properly ...
        """
        # NOTE(review): pyopencl.array.empty usually takes a queue (or
        # allocator) as first argument, not a context -- confirm this works
        # with the pyopencl version in use.
        tmp = pyopencl.array.empty(self.ip.ctx, self.data.shape, "float32")
        res = self.ip.to_float(self.data, out=tmp)
        res2 = self.ip.normalize(tmp, -100, 100, copy=False)
        # Reference: min->-100, max->+100 linear rescale computed with numpy
        norm = (self.data.astype(numpy.float32) - self.data.min()) / (self.data.max() - self.data.min())
        ref2 = 200 * norm - 100
        self.assertLess(abs(res2 - ref2).max(), 3e-5, "content")

    @unittest.skipUnless(ocl, "pyopencl is missing")
    def test_histogram(self):
        """
        Test on a greyscaled image ... of Lena :)
        """
        # ITU-R BT.709 luma weights to convert RGB to greyscale
        lena_bw = (0.2126 * self.data[:, :, 0] +
                   0.7152 * self.data[:, :, 1] +
                   0.0722 * self.data[:, :, 2]).astype("int32")
        ref = numpy.histogram(lena_bw, 255)
        ip = ImageProcessing(ctx=self.ctx, template=lena_bw, profile=True)
        res = ip.histogram(lena_bw, 255)
        ip.log_profile()
        delta = (ref[0] - res[0])
        deltap = (ref[1] - res[1])
        self.assertEqual(delta.sum(), 0, "errors are self-compensated")
        self.assertLessEqual(abs(delta).max(), 1, "errors are small")
        self.assertLessEqual(abs(deltap).max(), 3e-5, "errors on position are small: %s" % (abs(deltap).max()))
diff --git a/src/silx/opencl/test/test_kahan.py b/src/silx/opencl/test/test_kahan.py new file mode 100644 index 0000000..9e4a1e3 --- /dev/null +++ b/src/silx/opencl/test/test_kahan.py @@ -0,0 +1,254 @@ +#!/usr/bin/env python +# coding: utf-8 +# +# Project: OpenCL numerical library +# https://github.com/silx-kit/silx +# +# Copyright (C) 2015-2021 European Synchrotron Radiation Facility, Grenoble, France +# +# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

"test suite for OpenCL code"

__author__ = "Jérôme Kieffer"
__contact__ = "Jerome.Kieffer@ESRF.eu"
__license__ = "MIT"
__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
__date__ = "17/05/2021"


import unittest
import numpy
import logging
import platform

logger = logging.getLogger(__name__)
try:
    import pyopencl
except ImportError as error:
    logger.warning("OpenCL module (pyopencl) is not present, skip tests. %s.", error)
    pyopencl = None

from .. import ocl
if ocl is not None:
    from ..utils import read_cl_file
    from .. import pyopencl
    import pyopencl.array


class TestKahan(unittest.TestCase):
    """
    Test the kernels for compensated math in OpenCL

    The data sets are powers of two (1 << k) so that the float64 sum/dot
    differs from the naive float32 one, which the compensated (Kahan)
    kernels are expected to recover exactly.
    """

    @classmethod
    def setUpClass(cls):
        if pyopencl is None or ocl is None:
            raise unittest.SkipTest("OpenCL module (pyopencl) is not present or no device available")

        cls.ctx = ocl.create_context(devicetype="GPU")
        cls.queue = pyopencl.CommandQueue(cls.ctx, properties=pyopencl.command_queue_properties.PROFILING_ENABLE)

        # this is running 32 bits OpenCL with POCL
        if (platform.machine() in ("i386", "i686", "x86_64") and (tuple.__itemsize__ == 4) and
            cls.ctx.devices[0].platform.name == 'Portable Computing Language'):
            cls.args = "-DX87_VOLATILE=volatile"
        else:
            cls.args = ""

    @classmethod
    def tearDownClass(cls):
        cls.queue = None
        cls.ctx = None

    @staticmethod
    def dummy_sum(ary, dtype=None):
        "perform the actual sum in a dummy way "
        if dtype is None:
            dtype = ary.dtype.type
        sum_ = dtype(0)
        for i in ary:
            sum_ += i
        return sum_

    def test_kahan(self):
        """Compensated scalar summation must match the float64 reference."""
        # simple test
        N = 26
        data = (1 << (N - 1 - numpy.arange(N))).astype(numpy.float32)

        ref64 = numpy.sum(data, dtype=numpy.float64)
        ref32 = self.dummy_sum(data)
        if (ref64 == ref32):
            logger.warning("Kahan: invalid tests as float32 provides the same result as float64")
        # Dummy kernel to evaluate
        src = """
        kernel void summation(global float* data,
                                       int size,
                              global float* result)
        {
            float2 acc = (float2)(0.0f, 0.0f);
            for (int i=0; i<size; i++)
            {
                acc = kahan_sum(acc, data[i]);
            }
            result[0] = acc.s0;
            result[1] = acc.s1;
        }
        """
        prg = pyopencl.Program(self.ctx, read_cl_file("kahan.cl") + src).build(self.args)
        ones_d = pyopencl.array.to_device(self.queue, data)
        res_d = pyopencl.array.empty(self.queue, 2, numpy.float32)
        res_d.fill(0)
        evt = prg.summation(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
        evt.wait()
        # (value, compensation) pair: their float64 sum is the exact result
        res = res_d.get().sum(dtype=numpy.float64)
        self.assertEqual(ref64, res, "test_kahan")

    def test_dot16(self):
        """Compensated dot products for vector widths 16/8/4/3/2."""
        # simple test
        N = 16
        data = (1 << (N - 1 - numpy.arange(N))).astype(numpy.float32)

        ref64 = numpy.dot(data.astype(numpy.float64), data.astype(numpy.float64))
        ref32 = numpy.dot(data, data)
        if (ref64 == ref32):
            logger.warning("dot16: invalid tests as float32 provides the same result as float64")
        # Dummy kernel to evaluate
        src = """
        kernel void test_dot16(global float* data,
                                        int size,
                               global float* result)
        {
            float2 acc = (float2)(0.0f, 0.0f);
            float16 data16 = (float16) (data[0],data[1],data[2],data[3],data[4],
                                        data[5],data[6],data[7],data[8],data[9],
                                        data[10],data[11],data[12],data[13],data[14],data[15]);
            acc = comp_dot16(data16, data16);
            result[0] = acc.s0;
            result[1] = acc.s1;
        }

        kernel void test_dot8(global float* data,
                                       int size,
                              global float* result)
        {
            float2 acc = (float2)(0.0f, 0.0f);
            float8 data0 = (float8) (data[0],data[2],data[4],data[6],data[8],data[10],data[12],data[14]);
            float8 data1 = (float8) (data[1],data[3],data[5],data[7],data[9],data[11],data[13],data[15]);
            acc = comp_dot8(data0, data1);
            result[0] = acc.s0;
            result[1] = acc.s1;
        }

        kernel void test_dot4(global float* data,
                                       int size,
                              global float* result)
        {
            float2 acc = (float2)(0.0f, 0.0f);
            float4 data0 = (float4) (data[0],data[4],data[8],data[12]);
            float4 data1 = (float4) (data[3],data[7],data[11],data[15]);
            acc = comp_dot4(data0, data1);
            result[0] = acc.s0;
            result[1] = acc.s1;
        }

        kernel void test_dot3(global float* data,
                                       int size,
                              global float* result)
        {
            float2 acc = (float2)(0.0f, 0.0f);
            float3 data0 = (float3) (data[0],data[4],data[12]);
            float3 data1 = (float3) (data[3],data[11],data[15]);
            acc = comp_dot3(data0, data1);
            result[0] = acc.s0;
            result[1] = acc.s1;
        }

        kernel void test_dot2(global float* data,
                                       int size,
                              global float* result)
        {
            float2 acc = (float2)(0.0f, 0.0f);
            float2 data0 = (float2) (data[0],data[14]);
            float2 data1 = (float2) (data[1],data[15]);
            acc = comp_dot2(data0, data1);
            result[0] = acc.s0;
            result[1] = acc.s1;
        }

        """

        prg = pyopencl.Program(self.ctx, read_cl_file("kahan.cl") + src).build(self.args)
        ones_d = pyopencl.array.to_device(self.queue, data)
        res_d = pyopencl.array.empty(self.queue, 2, numpy.float32)
        res_d.fill(0)
        evt = prg.test_dot16(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
        evt.wait()
        res = res_d.get().sum(dtype="float64")
        self.assertEqual(ref64, res, "test_dot16")

        res_d.fill(0)
        # Operand pairs below mirror the element selections hard-coded in
        # the corresponding kernels above.
        data0 = data[0::2]
        data1 = data[1::2]
        ref64 = numpy.dot(data0.astype(numpy.float64), data1.astype(numpy.float64))
        ref32 = numpy.dot(data0, data1)
        if (ref64 == ref32):
            logger.warning("dot8: invalid tests as float32 provides the same result as float64")
        evt = prg.test_dot8(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
        evt.wait()
        res = res_d.get().sum(dtype="float64")
        self.assertEqual(ref64, res, "test_dot8")

        res_d.fill(0)
        data0 = data[0::4]
        data1 = data[3::4]
        ref64 = numpy.dot(data0.astype(numpy.float64), data1.astype(numpy.float64))
        ref32 = numpy.dot(data0, data1)
        if (ref64 == ref32):
            logger.warning("dot4: invalid tests as float32 provides the same result as float64")
        evt = prg.test_dot4(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
        evt.wait()
        res = res_d.get().sum(dtype="float64")
        self.assertEqual(ref64, res, "test_dot4")

        res_d.fill(0)
        data0 = numpy.array([data[0], data[4], data[12]])
        data1 = numpy.array([data[3], data[11], data[15]])
        ref64 = numpy.dot(data0.astype(numpy.float64), data1.astype(numpy.float64))
        ref32 = numpy.dot(data0, data1)
        if (ref64 == ref32):
            logger.warning("dot3: invalid tests as float32 provides the same result as float64")
        evt = prg.test_dot3(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
        evt.wait()
        res = res_d.get().sum(dtype="float64")
        self.assertEqual(ref64, res, "test_dot3")

        res_d.fill(0)
        data0 = numpy.array([data[0], data[14]])
        data1 = numpy.array([data[1], data[15]])
        ref64 = numpy.dot(data0.astype(numpy.float64), data1.astype(numpy.float64))
        ref32 = numpy.dot(data0, data1)
        if (ref64 == ref32):
            logger.warning("dot2: invalid tests as float32 provides the same result as float64")
        evt = prg.test_dot2(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
        evt.wait()
        res = res_d.get().sum(dtype="float64")
        self.assertEqual(ref64, res, "test_dot2")
diff --git a/src/silx/opencl/test/test_linalg.py b/src/silx/opencl/test/test_linalg.py new file mode 100644 index 0000000..a997a36 --- /dev/null +++ b/src/silx/opencl/test/test_linalg.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python +# coding: utf-8 +# /*########################################################################## +# +# Copyright (c) 2016 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# ###########################################################################*/
"""Test of the linalg module"""

from __future__ import division, print_function

__authors__ = ["Pierre paleo"]
__license__ = "MIT"
__copyright__ = "2013-2017 European Synchrotron Radiation Facility, Grenoble, France"
__date__ = "01/08/2019"


import time
import logging
import numpy as np
import unittest
try:
    import mako
except ImportError:
    mako = None
from ..common import ocl
if ocl:
    import pyopencl as cl
    import pyopencl.array as parray
    from .. import linalg
from silx.test.utils import utilstest

logger = logging.getLogger(__name__)
try:
    from scipy.ndimage.filters import laplace
    _has_scipy = True
except ImportError:
    _has_scipy = False


# TODO move this function in math or image ?
def gradient(img):
    '''
    Compute the gradient of an image as a numpy array
    Code from https://github.com/emmanuelle/tomo-tv/
    '''
    # Output has one leading axis per spatial dimension: gradient[d] is the
    # forward difference of img along axis d (last element of each axis left 0).
    shape = [img.ndim, ] + list(img.shape)
    gradient = np.zeros(shape, dtype=img.dtype)
    # slice_all is mutated each iteration: index 0 selects the output
    # component, the remaining slices position the (N-1)-length diff result.
    slice_all = [0, slice(None, -1), ]
    for d in range(img.ndim):
        gradient[tuple(slice_all)] = np.diff(img, axis=d)
        slice_all[0] = d + 1
        slice_all.insert(1, slice(None))
    return gradient


# TODO move this function in math or image ?
def divergence(grad):
    '''
    Compute the divergence of a gradient
    Code from https://github.com/emmanuelle/tomo-tv/
    '''
    # Adjoint of the forward-difference gradient above (backward differences
    # with boundary handling), accumulated per axis via rollaxis views.
    res = np.zeros(grad.shape[1:])
    for d in range(grad.shape[0]):
        this_grad = np.rollaxis(grad[d], d)
        this_res = np.rollaxis(res, d)
        this_res[:-1] += this_grad[:-1]
        this_res[1:-1] -= this_grad[:-2]
        this_res[-1] -= this_grad[-2]
    return res


@unittest.skipUnless(ocl and mako, "PyOpenCl is missing")
class TestLinAlg(unittest.TestCase):
    """Checks the OpenCL gradient/divergence against the numpy reference,
    for sources/destinations given as numpy arrays, pyopencl Buffers and
    pyopencl Arrays."""

    def setUp(self):
        if ocl is None:
            return
        self.getfiles()
        self.la = linalg.LinAlg(self.image.shape)
        self.allocate_arrays()

    def allocate_arrays(self):
        """
        Allocate various types of arrays for the tests
        """
        # numpy images
        # complex64 packs the 2 gradient components as (real, imag) pairs
        self.grad = np.zeros(self.image.shape, dtype=np.complex64)
        self.grad2 = np.zeros((2,) + self.image.shape, dtype=np.float32)
        self.grad_ref = gradient(self.image)
        self.div_ref = divergence(self.grad_ref)
        self.image2 = np.zeros_like(self.image)
        # Device images
        self.gradient_parray = parray.empty(self.la.queue, self.image.shape, np.complex64)
        self.gradient_parray.fill(0)
        # we should be using cl.Buffer(self.la.ctx, cl.mem_flags.READ_WRITE, size=self.image.nbytes*2),
        # but platforms not suporting openCL 1.2 have a problem with enqueue_fill_buffer,
        # so we use the parray "fill" utility
        self.gradient_buffer = self.gradient_parray.data
        # Do the same for image
        self.image_parray = parray.to_device(self.la.queue, self.image)
        self.image_buffer = self.image_parray.data
        # Refs
        tmp = np.zeros(self.image.shape, dtype=np.complex64)
        tmp.real = np.copy(self.grad_ref[0])
        tmp.imag = np.copy(self.grad_ref[1])
        self.grad_ref_parray = parray.to_device(self.la.queue, tmp)
        self.grad_ref_buffer = self.grad_ref_parray.data

    def tearDown(self):
        self.image = None
        self.image2 = None
        self.grad = None
        self.grad2 = None
        self.grad_ref = None
        self.div_ref = None
        # Explicitly release device memory before dropping the references
        self.gradient_parray.data.release()
        self.gradient_parray = None
        self.gradient_buffer = None
        self.image_parray.data.release()
        self.image_parray = None
        self.image_buffer = None
        self.grad_ref_parray.data.release()
        self.grad_ref_parray = None
        self.grad_ref_buffer = None

    def getfiles(self):
        # load 512x512 MRI phantom - TODO include Lena or ascent once a .npz is available
        self.image = np.load(utilstest.getfile("Brain512.npz"))["data"]

    def compare(self, result, reference, abstol, name):
        """Assert max absolute deviation between result and reference is below abstol."""
        errmax = np.max(np.abs(result - reference))
        logger.info("%s: Max error = %e" % (name, errmax))
        self.assertTrue(errmax < abstol, str("%s: Max error is too high" % name))

    @unittest.skipUnless(ocl and mako, "pyopencl is missing")
    def test_gradient(self):
        arrays = {
            "numpy.ndarray": self.image,
            "buffer": self.image_buffer,
            "parray": self.image_parray
        }
        for desc, image in arrays.items():
            # Test with dst on host (numpy.ndarray)
            res = self.la.gradient(image, return_to_host=True)
            self.compare(res, self.grad_ref, 1e-6, str("gradient[src=%s, dst=numpy.ndarray]" % desc))
            # Test with dst on device (pyopencl.Buffer)
            self.la.gradient(image, dst=self.gradient_buffer)
            cl.enqueue_copy(self.la.queue, self.grad, self.gradient_buffer)
            self.grad2[0] = self.grad.real
            self.grad2[1] = self.grad.imag
            self.compare(self.grad2, self.grad_ref, 1e-6, str("gradient[src=%s, dst=buffer]" % desc))
            # Test with dst on device (pyopencl.Array)
            self.la.gradient(image, dst=self.gradient_parray)
            self.grad = self.gradient_parray.get()
            self.grad2[0] = self.grad.real
            self.grad2[1] = self.grad.imag
            self.compare(self.grad2, self.grad_ref, 1e-6, str("gradient[src=%s, dst=parray]" % desc))

    @unittest.skipUnless(ocl and mako, "pyopencl is missing")
    def test_divergence(self):
        arrays = {
            "numpy.ndarray": self.grad_ref,
            "buffer": self.grad_ref_buffer,
            "parray": self.grad_ref_parray
        }
        for desc, grad in arrays.items():
            # Test with dst on host (numpy.ndarray)
            res = self.la.divergence(grad, return_to_host=True)
            self.compare(res, self.div_ref, 1e-6, str("divergence[src=%s, dst=numpy.ndarray]" % desc))
            # Test with dst on device (pyopencl.Buffer)
            self.la.divergence(grad, dst=self.image_buffer)
            cl.enqueue_copy(self.la.queue, self.image2, self.image_buffer)
            self.compare(self.image2, self.div_ref, 1e-6, str("divergence[src=%s, dst=buffer]" % desc))
            # Test with dst on device (pyopencl.Array)
            self.la.divergence(grad, dst=self.image_parray)
            self.image2 = self.image_parray.get()
            self.compare(self.image2, self.div_ref, 1e-6, str("divergence[src=%s, dst=parray]" % desc))

    @unittest.skipUnless(ocl and mako and _has_scipy, "pyopencl and/or scipy is missing")
    def test_laplacian(self):
        laplacian_ref = laplace(self.image)
        # Laplacian = div(grad)
        self.la.gradient(self.image)
        laplacian_ocl = self.la.divergence(self.la.d_gradient, return_to_host=True)
        self.compare(laplacian_ocl, laplacian_ref, 1e-6, "laplacian")
diff --git a/src/silx/opencl/test/test_medfilt.py b/src/silx/opencl/test/test_medfilt.py new file mode 100644 index 0000000..339e0f2 --- /dev/null +++ b/src/silx/opencl/test/test_medfilt.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Project: Median filter of images + OpenCL +# https://github.com/silx-kit/silx +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy,
modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +""" +Simple test of the median filter +""" + +from __future__ import division, print_function + +__authors__ = ["Jérôme Kieffer"] +__contact__ = "jerome.kieffer@esrf.eu" +__license__ = "MIT" +__copyright__ = "2013-2017 European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "05/07/2018" + + +import sys +import time +import logging +import numpy +import unittest +from collections import namedtuple +try: + import mako +except ImportError: + mako = None +from ..common import ocl +if ocl: + import pyopencl + import pyopencl.array + from .. 
import medfilt

logger = logging.getLogger(__name__)

# One benchmark/accuracy measurement: kernel size, max abs error vs the
# reference filter, and the scipy / OpenCL execution times in seconds.
Result = namedtuple("Result", ["size", "error", "sp_time", "oc_time"])

try:
    from scipy.misc import ascent
except ImportError:  # was a bare except: only guard against a missing scipy
    def ascent():
        """Dummy image from random data"""
        return numpy.random.random((512, 512))

try:
    from scipy.ndimage import filters
    median_filter = filters.median_filter
    HAS_SCIPY = True
except ImportError:  # was a bare except: only guard against a missing scipy
    HAS_SCIPY = False
    from silx.math import medfilt2d as median_filter


@unittest.skipUnless(ocl and mako, "PyOpenCl is missing")
class TestMedianFilter(unittest.TestCase):
    """Validate the OpenCL 2D median filter against a CPU reference."""

    def setUp(self):
        if ocl is None:
            return
        # 512x512 float32 test image (scipy's "ascent" or random fallback)
        self.data = ascent().astype(numpy.float32)
        self.medianfilter = medfilt.MedianFilter2D(self.data.shape, devicetype="gpu")

    def tearDown(self):
        self.data = None
        self.medianfilter = None

    def measure(self, size):
        """Common measurement of accuracy and timings.

        :param size: odd window width/height of the median filter
        :return: a Result namedtuple, or None when the OpenCL kernel
            failed at runtime (the failure is logged).
        """
        t0 = time.time()
        if HAS_SCIPY:
            ref = median_filter(self.data, size, mode="nearest")
        else:
            ref = median_filter(self.data, size)
        t1 = time.time()
        try:
            got = self.medianfilter.medfilt2d(self.data, size)
        except RuntimeError as msg:
            logger.error(msg)
            return
        t2 = time.time()
        delta = abs(got - ref).max()
        return Result(size, delta, t1 - t0, t2 - t1)

    @unittest.skipUnless(ocl and mako, "pyopencl is missing")
    def test_medfilt(self):
        """
        tests the median filter kernel
        """
        size = 11
        r = self.measure(size=size)
        if r is None:
            # BUG fixed: the original format string had a %s placeholder
            # but no argument was supplied for it.
            logger.info("test_medfilt: size: %s: skipped", size)
        else:
            # lazy %-style logging args instead of eager "%" formatting
            logger.info("test_medfilt: size: %s error %s, t_ref: %.3fs, t_ocl: %.3fs", *r)
            self.assertEqual(r.error, 0, 'Results are correct')

    def benchmark(self, limit=36):
        "Run some benchmarking"
        try:
            import PyQt5  # ensure a Qt binding is available for matplotlib
            from ...gui.matplotlib import pylab
            from ...gui.utils import update_fig
        except ImportError:  # was a bare except
            pylab = None

            def update_fig(*ag, **kwarg):
                pass

        if pylab is None:
            # BUG fixed: without this guard the original crashed below with
            # AttributeError on pylab.figure() when matplotlib/Qt is missing.
            logger.error("matplotlib with a Qt binding is required for benchmarking")
            return
        fig = pylab.figure()
        fig.suptitle("Median filter of an image 512x512")
        sp = fig.add_subplot(1, 1, 1)
sp.set_title(self.medianfilter.ctx.devices[0].name) + sp.set_xlabel("Window width & height") + sp.set_ylabel("Execution time (s)") + sp.set_xlim(2, limit + 1) + sp.set_ylim(0, 4) + data_size = [] + data_scipy = [] + data_opencl = [] + plot_sp = sp.plot(data_size, data_scipy, "-or", label="scipy")[0] + plot_opencl = sp.plot(data_size, data_opencl, "-ob", label="opencl")[0] + sp.legend(loc=2) + fig.show() + update_fig(fig) + for s in range(3, limit, 2): + r = self.measure(s) + print(r) + if r.error == 0: + data_size.append(s) + data_scipy.append(r.sp_time) + data_opencl.append(r.oc_time) + plot_sp.set_data(data_size, data_scipy) + plot_opencl.set_data(data_size, data_opencl) + update_fig(fig) + fig.show() + input() + + +def benchmark(): + testSuite = unittest.TestSuite() + testSuite.addTest(TestMedianFilter("benchmark")) + return testSuite diff --git a/src/silx/opencl/test/test_projection.py b/src/silx/opencl/test/test_projection.py new file mode 100644 index 0000000..13db5f4 --- /dev/null +++ b/src/silx/opencl/test/test_projection.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python +# coding: utf-8 +# /*########################################################################## +# +# Copyright (c) 2016 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# ###########################################################################*/ +"""Test of the forward projection module""" + +from __future__ import division, print_function + +__authors__ = ["Pierre paleo"] +__license__ = "MIT" +__copyright__ = "2013-2017 European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "19/01/2018" + + +import time +import logging +import numpy as np +import unittest +try: + import mako +except ImportError: + mako = None +from ..common import ocl +if ocl: + from .. 
import projection +from silx.test.utils import utilstest + +logger = logging.getLogger(__name__) + + +@unittest.skipUnless(ocl and mako, "PyOpenCl is missing") +class TestProj(unittest.TestCase): + + def setUp(self): + if ocl is None: + return + # ~ if sys.platform.startswith('darwin'): + # ~ self.skipTest("Projection is not implemented on CPU for OS X yet") + self.getfiles() + n_angles = self.sino.shape[0] + self.proj = projection.Projection(self.phantom.shape, n_angles) + if self.proj.compiletime_workgroup_size < 16 * 16: + self.skipTest("Current implementation of OpenCL projection is not supported on this platform yet") + + def tearDown(self): + self.phantom = None + self.sino = None + self.proj = None + + def getfiles(self): + # load 512x512 MRI phantom + self.phantom = np.load(utilstest.getfile("Brain512.npz"))["data"] + # load sinogram computed with PyHST + self.sino = np.load(utilstest.getfile("sino500_pyhst.npz"))["data"] + + def measure(self): + "Common measurement of timings" + t1 = time.time() + try: + result = self.proj.projection(self.phantom) + except RuntimeError as msg: + logger.error(msg) + return + t2 = time.time() + return t2 - t1, result + + def compare(self, res): + """ + Compare a result with the reference reconstruction. + Only the valid reconstruction zone (inscribed circle) is taken into account + """ + # Compare with the original phantom. 
+ # TODO: compare a standard projection + ref = self.sino + return np.max(np.abs(res - ref)) + + @unittest.skipUnless(ocl and mako, "pyopencl is missing") + def test_proj(self): + """ + tests Projection + """ + # Test single reconstruction + # -------------------------- + t, res = self.measure() + if t is None: + logger.info("test_proj: skipped") + else: + logger.info("test_proj: time = %.3fs" % t) + err = self.compare(res) + msg = str("Max error = %e" % err) + logger.info(msg) + # Interpolation differs at some lines, giving relative error of 10/50000 + self.assertTrue(err < 20., "Max error is too high") + # Test multiple reconstructions + # ----------------------------- + res0 = np.copy(res) + for i in range(10): + res = self.proj.projection(self.phantom) + errmax = np.max(np.abs(res - res0)) + self.assertTrue(errmax < 1.e-6, "Max error is too high") diff --git a/src/silx/opencl/test/test_sparse.py b/src/silx/opencl/test/test_sparse.py new file mode 100644 index 0000000..1d26b36 --- /dev/null +++ b/src/silx/opencl/test/test_sparse.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python +# coding: utf-8 +# /*########################################################################## +# +# Copyright (c) 2018-2019 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# ###########################################################################*/ +"""Test of the sparse module""" + +import numpy as np +import unittest +import logging +from itertools import product +from ..common import ocl +if ocl: + import pyopencl.array as parray + from silx.opencl.sparse import CSR +try: + import scipy.sparse as sp +except ImportError: + sp = None +logger = logging.getLogger(__name__) + + + +def generate_sparse_random_data( + shape=(1000,), + data_min=0, data_max=100, + density=0.1, + use_only_integers=True, + dtype="f"): + """ + Generate random sparse data where. + + Parameters + ------------ + shape: tuple + Output data shape. + data_min: int or float + Minimum value of data + data_max: int or float + Maximum value of data + density: float + Density of non-zero elements in the output data. + Low value of density mean low number of non-zero elements. + use_only_integers: bool + If set to True, the output data items will be primarily integers, + possibly casted to float if dtype is a floating-point type. + This can be used for ease of debugging. 
+ dtype: str or numpy.dtype + Output data type + """ + mask = np.random.binomial(1, density, size=shape) + if use_only_integers: + d = np.random.randint(data_min, high=data_max, size=shape) + else: + d = data_min + (data_max - data_min) * np.random.rand(*shape) + return (d * mask).astype(dtype) + + + +@unittest.skipUnless(ocl and sp, "PyOpenCl/scipy is missing") +class TestCSR(unittest.TestCase): + """Test CSR format""" + + def setUp(self): + # Test possible configurations + input_on_device = [False, True] + output_on_device = [False, True] + dtypes = [np.float32, np.int32, np.uint16] + self._test_configs = list(product(input_on_device, output_on_device, dtypes)) + + + def compute_ref_sparsification(self, array): + ref_sparse = sp.csr_matrix(array) + return ref_sparse + + + def test_sparsification(self): + for input_on_device, output_on_device, dtype in self._test_configs: + self._test_sparsification(input_on_device, output_on_device, dtype) + + + def _test_sparsification(self, input_on_device, output_on_device, dtype): + current_config = "input on device: %s, output on device: %s, dtype: %s" % ( + str(input_on_device), str(output_on_device), str(dtype) + ) + logger.debug("CSR: %s" % current_config) + # Generate data and reference CSR + array = generate_sparse_random_data(shape=(512, 511), dtype=dtype) + ref_sparse = self.compute_ref_sparsification(array) + # Sparsify on device + csr = CSR(array.shape, dtype=dtype) + if input_on_device: + # The array has to be flattened + arr = parray.to_device(csr.queue, array.ravel()) + else: + arr = array + if output_on_device: + d_data = parray.empty_like(csr.data) + d_indices = parray.empty_like(csr.indices) + d_indptr = parray.empty_like(csr.indptr) + d_data.fill(0) + d_indices.fill(0) + d_indptr.fill(0) + output = (d_data, d_indices, d_indptr) + else: + output = None + data, indices, indptr = csr.sparsify(arr, output=output) + if output_on_device: + data = data.get() + indices = indices.get() + indptr = indptr.get() + # 
Compare + nnz = ref_sparse.nnz + self.assertTrue( + np.allclose(data[:nnz], ref_sparse.data), + "something wrong with sparsified data (%s)" + % current_config + ) + self.assertTrue( + np.allclose(indices[:nnz], ref_sparse.indices), + "something wrong with sparsified indices (%s)" + % current_config + ) + self.assertTrue( + np.allclose(indptr, ref_sparse.indptr), + "something wrong with sparsified indices pointers (indptr) (%s)" + % current_config + ) + + + def test_desparsification(self): + for input_on_device, output_on_device, dtype in self._test_configs: + self._test_desparsification(input_on_device, output_on_device, dtype) + + + def _test_desparsification(self, input_on_device, output_on_device, dtype): + current_config = "input on device: %s, output on device: %s, dtype: %s" % ( + str(input_on_device), str(output_on_device), str(dtype) + ) + logger.debug("CSR: %s" % current_config) + # Generate data and reference CSR + array = generate_sparse_random_data(shape=(512, 511), dtype=dtype) + ref_sparse = self.compute_ref_sparsification(array) + # De-sparsify on device + csr = CSR(array.shape, dtype=dtype, max_nnz=ref_sparse.nnz) + if input_on_device: + data = parray.to_device(csr.queue, ref_sparse.data) + indices = parray.to_device(csr.queue, ref_sparse.indices) + indptr = parray.to_device(csr.queue, ref_sparse.indptr) + else: + data = ref_sparse.data + indices = ref_sparse.indices + indptr = ref_sparse.indptr + if output_on_device: + d_arr = parray.empty_like(csr.array) + d_arr.fill(0) + output = d_arr + else: + output = None + arr = csr.densify(data, indices, indptr, output=output) + if output_on_device: + arr = arr.get() + # Compare + self.assertTrue( + np.allclose(arr.reshape(array.shape), array), + "something wrong with densified data (%s)" + % current_config + ) diff --git a/src/silx/opencl/test/test_stats.py b/src/silx/opencl/test/test_stats.py new file mode 100644 index 0000000..859271d --- /dev/null +++ b/src/silx/opencl/test/test_stats.py @@ -0,0 +1,106 
@@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Project: Sift implementation in Python + OpenCL +# https://github.com/silx-kit/silx +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+ +""" +Simple test of an addition +""" +__authors__ = ["Henri Payno, Jérôme Kieffer"] +__contact__ = "jerome.kieffer@esrf.eu" +__license__ = "MIT" +__copyright__ = "2013 European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "19/05/2021" + +import logging +import time +import numpy + +import unittest +from ..common import ocl +if ocl: + import pyopencl + import pyopencl.array + from ..statistics import StatResults, Statistics +from ..utils import get_opencl_code +logger = logging.getLogger(__name__) + + +@unittest.skipUnless(ocl, "PyOpenCl is missing") +class TestStatistics(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.size = 1 << 20 # 1 million elements + cls.data = numpy.random.randint(0, 65000, cls.size).astype("uint16") + fdata = cls.data.astype("float64") + t0 = time.perf_counter() + std = fdata.std() + cls.ref = StatResults(fdata.min(), fdata.max(), float(fdata.size), + fdata.sum(), fdata.mean(), std ** 2, + std) + t1 = time.perf_counter() + cls.ref_time = t1 - t0 + + @classmethod + def tearDownClass(cls): + cls.size = cls.ref = cls.data = cls.ref_time = None + + @classmethod + def validate(cls, res): + return ( + (res.min == cls.ref.min) and + (res.max == cls.ref.max) and + (res.cnt == cls.ref.cnt) and + abs(res.mean - cls.ref.mean) < 0.01 and + abs(res.std - cls.ref.std) < 0.1) + + def test_measurement(self): + """ + tests that all devices are working properly ... 
+ """ + logger.info("Reference results: %s", self.ref) + for pid, platform in enumerate(ocl.platforms): + for did, device in enumerate(platform.devices): + try: + s = Statistics(template=self.data, platformid=pid, deviceid=did) + except Exception as err: + failed_init = True + res = StatResults(0, 0, 0, 0, 0, 0, 0) + print(err) + else: + failed_init = False + for comp in ("single", "double", "comp"): + t0 = time.perf_counter() + res = s(self.data, comp=comp) + t1 = time.perf_counter() + logger.info("Runtime on %s/%s : %.3fms x%.1f", platform, device, 1000 * (t1 - t0), self.ref_time / (t1 - t0)) + + if failed_init or not self.validate(res): + logger.error("failed_init %s; Computation modes %s", failed_init, comp) + logger.error("Failed on platform %s device %s", platform, device) + logger.error("Reference results: %s", self.ref) + logger.error("Faulty results: %s", res) + self.assertTrue(False, f"Stat calculation failed on {platform},{device} in mode {comp}") diff --git a/src/silx/opencl/utils.py b/src/silx/opencl/utils.py new file mode 100644 index 0000000..575e018 --- /dev/null +++ b/src/silx/opencl/utils.py @@ -0,0 +1,214 @@ +# -*- coding: utf-8 -*- +# /*########################################################################## +# Copyright (C) 2017 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# ############################################################################*/
"""
Project: Sift implementation in Python + OpenCL
         https://github.com/silx-kit/silx

Small helpers shared by the OpenCL modules: workgroup-size arithmetic,
buffer-size computation and OpenCL kernel source loading.
"""
# NOTE: `from __future__ import division` was removed: this is a
# Python-3-only code base, where it is a no-op.

__authors__ = ["Jérôme Kieffer", "Pierre Paleo"]
__contact__ = "jerome.kieffer@esrf.eu"
__license__ = "MIT"
__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
__date__ = "06/09/2017"
__status__ = "Production"

import os
import numpy
from math import log, ceil

try:
    from .. import resources
except ImportError:
    # Allows this module to be imported standalone (outside the silx
    # package tree); get_cl_file() will raise a clear error instead.
    resources = None


def _round_up(size, block):
    """Round *size* up to the next multiple of *block*.

    Internal helper for :func:`calc_size`.
    """
    block = int(block)
    return (int(size) + block - 1) // block * block


def calc_size(shape, blocksize):
    """
    Calculate the optimal size for a kernel according to the workgroup size.

    Each dimension of *shape* is rounded up to the next multiple of the
    corresponding workgroup dimension.

    BUG fixed: the previous bit-twiddling formula ``(n + b - 1) & ~(b - 1)``
    is only correct when the blocksize is a power of two (e.g. shape 10 with
    block 6 yielded 10, which is not a multiple of 6). The ceil-division
    formula used here gives identical results for power-of-two blocks and
    correct results otherwise.

    :param shape: (sequence of) int, size of the data
    :param blocksize: int or sequence of int, workgroup size per dimension
    :return: tuple of int, smallest grid >= shape that is a multiple of blocksize
    """
    if "__len__" in dir(blocksize):
        return tuple(_round_up(i, j) for i, j in zip(shape, blocksize))
    else:
        return tuple(_round_up(i, blocksize) for i in shape)


def nextpower(n):
    """Calculate the next power of two >= n.

    Uses exact integer arithmetic (``int.bit_length``) instead of the
    previous floating-point ``log(n, 2)``, which could misround for
    large values of n.

    :param n: a positive integer, for example 100
    :return: another integer, 100 -> 128
    :raises ValueError: if n is not positive (the former ``log``-based
        implementation raised ValueError from the math domain as well)
    """
    n = int(n)
    if n < 1:
        raise ValueError("nextpower requires a positive integer")
    return 1 << (n - 1).bit_length()


def sizeof(shape, dtype="uint8"):
    """
    Calculate the number of bytes needed to allocate for a given structure

    :param shape: size or tuple of sizes
    :param dtype: data type (anything numpy.dtype accepts)
    :return: number of bytes (int)
    """
    itemsize = numpy.dtype(dtype).itemsize
    cnt = 1
    if "__len__" in dir(shape):
        for dim in shape:
            cnt *= dim
    else:
        cnt = int(shape)
    return cnt * itemsize


def get_cl_file(resource):
    """get the full path of a openCL resource file

    The resource name can be prefixed by the name of a resource directory. For
    example "silx:foo.png" identify the resource "foo.png" from the resource
    directory "silx".
    See also :func:`silx.resources.register_resource_directory`.

    :param str resource: Resource name. File name contained if the `opencl`
        directory of the resources.
    :return: the full path of the openCL source file
    :raises ImportError: if silx.resources could not be imported
    """
    if resources is None:
        raise ImportError("silx.resources is required to locate OpenCL files")
    if not resource.endswith(".cl"):
        resource += ".cl"
    return resources._resource_filename(resource,
                                        default_directory="opencl")


def read_cl_file(filename):
    """Read an OpenCL source file and apply a dummy preprocessor.

    :param filename: name of the OpenCL resource file (".cl" is appended
        if missing)
    :return: preprocessed source code, with ``#include`` lines stripped
    """
    with open(get_cl_file(filename), "r") as f:
        # Dummy preprocessor which removes the #include
        lines = [line for line in f if not line.startswith("#include ")]
    return "".join(lines)


# Historical alias kept for backward compatibility
get_opencl_code = read_cl_file


def concatenate_cl_kernel(filenames):
    """Concatenates all the kernel from the list of files

    :param filenames: filenames containing the kernels
    :type filenames: list of str which can be filename of kernel as a string.
    :return: a string with all kernels concatenated

    this method concatenates all the kernel from the list
    """
    return os.linesep.join(read_cl_file(fn) for fn in filenames)


class ConvolutionInfos(object):
    """Static description of the supported convolution use cases.

    ``allowed_axes`` maps a convolution mode to the axis combinations it
    accepts; ``use_cases`` maps (data_ndim, kernel_ndim) to the available
    modes with a human-readable name and the OpenCL kernels involved.
    """
    allowed_axes = {
        "1D": [None],
        "separable_2D_1D_2D": [None, (0, 1), (1, 0)],
        "batched_1D_2D": [(0,), (1,)],
        "separable_3D_1D_3D": [
            None,
            (0, 1, 2),
            (1, 2, 0),
            (2, 0, 1),
            (2, 1, 0),
            (1, 0, 2),
            (0, 2, 1)
        ],
        "batched_1D_3D": [(0,), (1,), (2,)],
        "batched_separable_2D_1D_3D": [(0,), (1,), (2,)],  # unsupported (?)
        "2D": [None],
        "batched_2D_3D": [(0,), (1,), (2,)],
        "separable_3D_2D_3D": [
            (1, 0),
            (0, 1),
            (2, 0),
            (0, 2),
            (1, 2),
            (2, 1),
        ],
        "3D": [None],
    }
    use_cases = {
        (1, 1): {
            "1D": {
                "name": "1D convolution on 1D data",
                "kernels": ["convol_1D_X"],
            },
        },
        (2, 2): {
            "2D": {
                "name": "2D convolution on 2D data",
                "kernels": ["convol_2D_XY"],
            },
        },
        (3, 3): {
            "3D": {
                "name": "3D convolution on 3D data",
                "kernels": ["convol_3D_XYZ"],
            },
        },
        (2, 1): {
            "separable_2D_1D_2D": {
                "name": "Separable (2D->1D) convolution on 2D data",
                "kernels": ["convol_1D_X", "convol_1D_Y"],
            },
            "batched_1D_2D": {
                "name": "Batched 1D convolution on 2D data",
                "kernels": ["convol_1D_X", "convol_1D_Y"],
            },
        },
        (3, 1): {
            "separable_3D_1D_3D": {
                "name": "Separable (3D->1D) convolution on 3D data",
                "kernels": ["convol_1D_X", "convol_1D_Y", "convol_1D_Z"],
            },
            "batched_1D_3D": {
                "name": "Batched 1D convolution on 3D data",
                "kernels": ["convol_1D_X", "convol_1D_Y", "convol_1D_Z"],
            },
            "batched_separable_2D_1D_3D": {
                "name": "Batched separable (2D->1D) convolution on 3D data",
                "kernels": ["convol_1D_X", "convol_1D_Y", "convol_1D_Z"],
            },
        },
        (3, 2): {
            "separable_3D_2D_3D": {
                "name": "Separable (3D->2D) convolution on 3D data",
                "kernels": ["convol_2D_XY", "convol_2D_XZ", "convol_2D_YZ"],
            },
            "batched_2D_3D": {
                "name": "Batched 2D convolution on 3D data",
                "kernels": ["convol_2D_XY", "convol_2D_XZ", "convol_2D_YZ"],
            },
        },
    }