14 files changed, 304 insertions, 228 deletions
diff --git a/silx/opencl/common.py b/silx/opencl/common.py
index 8d31c8a..110d941 100644
--- a/silx/opencl/common.py
+++ b/silx/opencl/common.py
@@ -34,7 +34,7 @@ __author__ = "Jerome Kieffer"
 __contact__ = "Jerome.Kieffer@ESRF.eu"
 __license__ = "MIT"
 __copyright__ = "2012-2017 European Synchrotron Radiation Facility, Grenoble, France"
-__date__ = "07/06/2019"
+__date__ = "28/11/2019"
 __status__ = "stable"
 __all__ = ["ocl", "pyopencl", "mf", "release_cl_buffers", "allocate_cl_buffers",
            "measure_workgroup_size", "kernel_workgroup_size"]
@@ -60,8 +60,14 @@ else:
         logger.warning("Unable to import pyOpenCl. Please install it from: http://pypi.python.org/pypi/pyopencl")
         pyopencl = None
     else:
-        import pyopencl.array as array
-        mf = pyopencl.mem_flags
+        try:
+            pyopencl.get_platforms()
+        except pyopencl.LogicError:
+            logger.warning("The module pyOpenCL has been imported but can't be used here")
+            pyopencl = None
+        else:
+            import pyopencl.array as array
+            mf = pyopencl.mem_flags
 
 if pyopencl is None:
     # Define default mem flags
@@ -91,8 +97,8 @@ NVIDIA_FLOP_PER_CORE = {(1, 0): 24,  # Guessed !
                         (6, 0): 128,  # GP100
                         (6, 1): 128,  # GP104
                         (6, 2): 128,  # ?
-                        (7, 0): 256,  # Volta ?
-                        (7, 1): 256,  # Volta ?
+                        (7, 0): 128,  # Volta # measured on Telsa V100
+                        (7, 1): 128,  # Volta ?
                         }
 
 AMD_FLOP_PER_CORE = 160  # Measured on a M7820 10 core, 700MHz 1120GFlops
@@ -254,7 +260,8 @@ def _measure_workgroup_size(device_or_context, fast=False):
     max_valid_wg = 1
     data = numpy.random.random(shape).astype(numpy.float32)
     d_data = pyopencl.array.to_device(queue, data)
-    d_data_1 = pyopencl.array.zeros_like(d_data) + 1
+    d_data_1 = pyopencl.array.empty_like(d_data)
+    d_data_1.fill(numpy.float32(1.0))
 
     program = pyopencl.Program(ctx, get_opencl_code("addition")).build()
     if fast:
@@ -318,10 +325,21 @@ class OpenCL(object):
                     # pocl does not describe itself as a CPU !
                     devtype = "CPU"
                 if len(devtype) > 3:
-                    devtype = devtype[:3]
-                if _is_nvidia_gpu(pypl.vendor, devtype) and "compute_capability_major_nv" in dir(device):
-                    comput_cap = device.compute_capability_major_nv, device.compute_capability_minor_nv
-                    flop_core = NVIDIA_FLOP_PER_CORE.get(comput_cap, min(NVIDIA_FLOP_PER_CORE.values()))
+                    if "GPU" in devtype:
+                        devtype = "GPU"
+                    elif "ACC" in devtype:
+                        devtype = "ACC"
+                    elif "CPU" in devtype:
+                        devtype = "CPU"
+                    else:
+                        devtype = devtype[:3]
+                if _is_nvidia_gpu(device.vendor, devtype) and ("compute_capability_major_nv" in dir(device)):
+                    try:
+                        comput_cap = device.compute_capability_major_nv, device.compute_capability_minor_nv
+                    except pyopencl.LogicError:
+                        flop_core = FLOP_PER_CORE["GPU"]
+                    else:
+                        flop_core = NVIDIA_FLOP_PER_CORE.get(comput_cap, FLOP_PER_CORE["GPU"])
                 elif (pypl.vendor == "Advanced Micro Devices, Inc.") and (devtype == "GPU"):
                     flop_core = AMD_FLOP_PER_CORE
                 elif devtype == "CPU":
diff --git a/silx/opencl/convolution.py b/silx/opencl/convolution.py
index 18232f4..b8dd8f6 100644
--- a/silx/opencl/convolution.py
+++ b/silx/opencl/convolution.py
@@ -29,99 +29,14 @@ from __future__ import absolute_import, print_function, with_statement, division
 
 __authors__ = ["P. Paleo"]
 __license__ = "MIT"
-__date__ = "11/02/2019"
+__date__ = "01/08/2019"
 
 import numpy as np
-from math import ceil
 from copy import copy  # python2
 from .common import pyopencl as cl
 import pyopencl.array as parray
 from .processing import OpenclProcessing, EventDescription
-
-
-class ConvolutionInfos(object):
-    allowed_axes = {
-        "1D": [None],
-        "separable_2D_1D_2D": [None, (0, 1), (1, 0)],
-        "batched_1D_2D": [(0,), (1,)],
-        "separable_3D_1D_3D": [
-            None,
-            (0, 1, 2),
-            (1, 2, 0),
-            (2, 0, 1),
-            (2, 1, 0),
-            (1, 0, 2),
-            (0, 2, 1)
-        ],
-        "batched_1D_3D": [(0,), (1,), (2,)],
-        "batched_separable_2D_1D_3D": [(0,), (1,), (2,)], # unsupported (?)
-        "2D": [None],
-        "batched_2D_3D": [(0,), (1,), (2,)],
-        "separable_3D_2D_3D": [
-            (1, 0),
-            (0, 1),
-            (2, 0),
-            (0, 2),
-            (1, 2),
-            (2, 1),
-        ],
-        "3D": [None],
-    }
-    use_cases = {
-        (1, 1): {
-            "1D": {
-                "name": "1D convolution on 1D data",
-                "kernels": ["convol_1D_X"],
-            },
-        },
-        (2, 2): {
-            "2D": {
-                "name": "2D convolution on 2D data",
-                "kernels": ["convol_2D_XY"],
-            },
-        },
-        (3, 3): {
-            "3D": {
-                "name": "3D convolution on 3D data",
-                "kernels": ["convol_3D_XYZ"],
-            },
-        },
-        (2, 1): {
-            "separable_2D_1D_2D": {
-                "name": "Separable (2D->1D) convolution on 2D data",
-                "kernels": ["convol_1D_X", "convol_1D_Y"],
-            },
-            "batched_1D_2D": {
-                "name": "Batched 1D convolution on 2D data",
-                "kernels": ["convol_1D_X", "convol_1D_Y"],
-            },
-        },
-        (3, 1): {
-            "separable_3D_1D_3D": {
-                "name": "Separable (3D->1D) convolution on 3D data",
-                "kernels": ["convol_1D_X", "convol_1D_Y", "convol_1D_Z"],
-            },
-            "batched_1D_3D": {
-                "name": "Batched 1D convolution on 3D data",
-                "kernels": ["convol_1D_X", "convol_1D_Y", "convol_1D_Z"],
-            },
-            "batched_separable_2D_1D_3D": {
-                "name": "Batched separable (2D->1D) convolution on 3D data",
-                "kernels": ["convol_1D_X", "convol_1D_Y", "convol_1D_Z"],
-            },
-        },
-        (3, 2): {
-            "separable_3D_2D_3D": {
-                "name": "Separable (3D->2D) convolution on 3D data",
-                "kernels": ["convol_2D_XY", "convol_2D_XZ", "convol_2D_YZ"],
-            },
-            "batched_2D_3D": {
-                "name": "Batched 2D convolution on 3D data",
-                "kernels": ["convol_2D_XY", "convol_2D_XZ", "convol_2D_YZ"],
-            },
-        },
-    }
-
+from .utils import ConvolutionInfos
 
 class Convolution(OpenclProcessing):
     """
@@ -262,7 +177,8 @@ class Convolution(OpenclProcessing):
         # Allocate arrays
         for option_name, array_name in option_array_names.items():
             if self.extra_options[option_name]:
-                value = parray.zeros(self.queue, self.shape, "f")
+                value = parray.empty(self.queue, self.shape, np.float32)
+                value.fill(np.float32(0.0))
             else:
                 value = None
             setattr(self, array_name, value)
@@ -320,11 +236,9 @@ class Convolution(OpenclProcessing):
                 str("-DIMAGE_DIMS=%d" % self.data_ndim),
                 str("-DFILTER_DIMS=%d" % self.kernel_ndim),
             ])
-            data_in_ref = self.data_in_tex
             d_kernel_ref = self.d_kernel_tex
         else:
             kernel_files = ["convolution.cl"]
-            data_in_ref = self.data_in.data
             d_kernel_ref = self.d_kernel.data
         self.compile_kernels(
             kernel_files=kernel_files,
@@ -335,8 +249,8 @@ class Convolution(OpenclProcessing):
         kernel_args = [
             self.queue,
             self.ndrange, self.wg,
-            data_in_ref,
-            self.data_out.data,
+            None,
+            None,
             d_kernel_ref,
             np.int32(self.kernel.shape[0]),
             self.Nx, self.Ny, self.Nz
@@ -527,29 +441,3 @@ class Convolution(OpenclProcessing):
     __call__ = convolve
 
 
-def gaussian_kernel(sigma, cutoff=4, force_odd_size=False):
-    """
-    Generates a Gaussian convolution kernel.
-
-    :param sigma: Standard Deviation of the Gaussian curve.
-    :param cutoff: Parameter tuning the truncation of the Gaussian.
-        The higher cutoff, the biggest the array will be (and the closest to
-        a "true" Gaussian function).
-    :param force_odd_size: when set to True, the resulting array will always
-        have an odd size, regardless of the values of "sigma" and "cutoff".
-    :return: a numpy.ndarray containing the truncated Gaussian function.
-        The array size is 2*c*s+1 where c=cutoff, s=sigma.
-
-    Nota: due to the quick decay of the Gaussian function, small values of the
-        "cutoff" parameter are usually fine. The energy difference between a
-        Gaussian truncated to [-c, c] and a "true" one is
-            erfc(c/(sqrt(2)*s))
-        so choosing cutoff=4*sigma keeps the truncation error below 1e-4.
-    """
-    size = int(ceil(2 * cutoff * sigma + 1))
-    if force_odd_size and size % 2 == 0:
-        size += 1
-    x = np.arange(size) - (size - 1.0) / 2.0
-    g = np.exp(-(x / sigma) ** 2 / 2.0)
-    g /= g.sum()
-    return g
diff --git a/silx/opencl/linalg.py b/silx/opencl/linalg.py
index fba3e9c..a64122a 100644
--- a/silx/opencl/linalg.py
+++ b/silx/opencl/linalg.py
@@ -29,7 +29,7 @@ from __future__ import absolute_import, print_function, with_statement, division
 
 __authors__ = ["P. Paleo"]
 __license__ = "MIT"
-__date__ = "10/08/2017"
+__date__ = "01/08/2019"
 
 import numpy as np
 
@@ -63,8 +63,10 @@ class LinAlg(OpenclProcessing):
                                   platformid=platformid, deviceid=deviceid,
                                   profile=profile)
 
-        self.d_gradient = parray.zeros(self.queue, shape, np.complex64)
-        self.d_image = parray.zeros(self.queue, shape, np.float32)
+        self.d_gradient = parray.empty(self.queue, shape, np.complex64)
+        self.d_gradient.fill(np.complex64(0.0))
+        self.d_image = parray.empty(self.queue, shape, np.float32)
+        self.d_image.fill(np.float32(0.0))
         self.add_to_cl_mem({
             "d_gradient": self.d_gradient,
             "d_image": self.d_image
diff --git a/silx/opencl/processing.py b/silx/opencl/processing.py
index 707aa72..6b475b9 100644
--- a/silx/opencl/processing.py
+++ b/silx/opencl/processing.py
@@ -41,7 +41,7 @@ __author__ = "Jerome Kieffer"
 __contact__ = "Jerome.Kieffer@ESRF.eu"
 __license__ = "MIT"
 __copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
-__date__ = "11/01/2019"
+__date__ = "05/08/2019"
 __status__ = "stable"
 
 
@@ -152,9 +152,12 @@ class OpenclProcessing(object):
     def __del__(self):
         """Destructor: release all buffers and programs
         """
-        self.reset_log()
-        self.free_kernels()
-        self.free_buffers()
+        try:
+            self.reset_log()
+            self.free_kernels()
+            self.free_buffers()
+        except Exception:
+            pass
         self.queue = None
         self.device = None
         self.ctx = None
diff --git a/silx/opencl/projection.py b/silx/opencl/projection.py
index 0505d80..da8752f 100644
--- a/silx/opencl/projection.py
+++ b/silx/opencl/projection.py
@@ -29,7 +29,7 @@ from __future__ import absolute_import, print_function, with_statement, division
 
 __authors__ = ["A. Mirone, P. Paleo"]
 __license__ = "MIT"
-__date__ = "28/02/2018"
+__date__ = "01/08/2019"
 
 import logging
 import numpy as np
@@ -141,10 +141,11 @@ class Projection(OpenclProcessing):
             BufferDescription("d_strideJoseph", self._dimrecy * 2, np.int32, mf.READ_ONLY),
             BufferDescription("d_strideLine", self._dimrecy * 2, np.int32, mf.READ_ONLY),
         ]
+        d_axis_corrections = parray.empty(self.queue, self.nprojs, np.float32)
+        d_axis_corrections.fill(np.float32(0.0))
         self.add_to_cl_mem(
             {
-                "d_axis_corrections": parray.zeros(self.queue,
-                                                   self.nprojs, np.float32)
+                "d_axis_corrections": d_axis_corrections
             }
         )
         self._tmp_extended_img = np.zeros((self.shape[0] + 2, self.shape[1] + 2),
@@ -193,7 +194,9 @@ class Projection(OpenclProcessing):
         pyopencl.enqueue_copy(self.queue, self.cl_mem["d_angles"], angles2)
 
     def allocate_slice(self):
-            self.add_to_cl_mem({"d_slice": parray.zeros(self.queue, (self.shape[1] + 2, self.shape[1] + 2), np.float32)})
+        ary = parray.zeros(self.queue, (self.shape[1] + 2, self.shape[1] + 2), np.float32)
+        ary.fill(0)
+        self.add_to_cl_mem({"d_slice": ary})
 
     def allocate_textures(self):
         self.d_image_tex = pyopencl.Image(
diff --git a/silx/opencl/reconstruction.py b/silx/opencl/reconstruction.py
index cb68680..2c84aee 100644
--- a/silx/opencl/reconstruction.py
+++ b/silx/opencl/reconstruction.py
@@ -29,7 +29,7 @@ from __future__ import absolute_import, print_function, with_statement, division
 
 __authors__ = ["P. Paleo"]
 __license__ = "MIT"
-__date__ = "19/09/2017"
+__date__ = "01/08/2019"
 
 import logging
 import numpy as np
@@ -98,19 +98,23 @@ class ReconstructionAlgorithm(OpenclProcessing):
         self.sino_shape = sino_shape
         self.is_cpu = self.backprojector.is_cpu
         # Arrays
-        self.d_data = parray.zeros(self.queue, sino_shape, dtype=np.float32)
-        self.d_sino = parray.zeros_like(self.d_data)
-        self.d_x = parray.zeros(self.queue,
+        self.d_data = parray.empty(self.queue, sino_shape, dtype=np.float32)
+        self.d_data.fill(0.0)
+        self.d_sino = parray.empty_like(self.d_data)
+        self.d_sino.fill(0.0)
+        self.d_x = parray.empty(self.queue,
                                 self.backprojector.slice_shape,
                                 dtype=np.float32)
-        self.d_x_old = parray.zeros_like(self.d_x)
+        self.d_x.fill(0.0)
+        self.d_x_old = parray.empty_like(self.d_x)
+        self.d_x_old.fill(0.0)
 
         self.add_to_cl_mem({
-            "d_data": self.d_data,
-            "d_sino": self.d_sino,
-            "d_x": self.d_x,
-            "d_x_old": self.d_x_old,
-        })
+                            "d_data": self.d_data,
+                            "d_sino": self.d_sino,
+                            "d_x": self.d_x,
+                            "d_x_old": self.d_x_old,
+                            })
 
     def proj(self, d_slice, d_sino):
         """
@@ -276,10 +280,13 @@ class TV(ReconstructionAlgorithm):
         )
         # Additional arrays
         self.linalg.gradient(self.d_x)
-        self.d_p = parray.zeros_like(self.linalg.cl_mem["d_gradient"])
-        self.d_q = parray.zeros_like(self.d_data)
+        self.d_p = parray.empty_like(self.linalg.cl_mem["d_gradient"])
+        self.d_q = parray.empty_like(self.d_data)
         self.d_g = self.linalg.d_image
-        self.d_tmp = parray.zeros_like(self.d_x)
+        self.d_tmp = parray.empty_like(self.d_x)
+        self.d_p.fill(0)
+        self.d_q.fill(0)
+        self.d_tmp.fill(0)
         self.add_to_cl_mem({
             "d_p": self.d_p,
             "d_q": self.d_q,
diff --git a/silx/opencl/sinofilter.py b/silx/opencl/sinofilter.py
index ad17acb..d608744 100644
--- a/silx/opencl/sinofilter.py
+++ b/silx/opencl/sinofilter.py
@@ -2,7 +2,7 @@
 # coding: utf-8
 # /*##########################################################################
 #
-# Copyright (c) 2016 European Synchrotron Radiation Facility
+# Copyright (c) 2016-2019 European Synchrotron Radiation Facility
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -38,19 +38,20 @@ from math import pi
 import pyopencl.array as parray
 from .common import pyopencl as cl
 from .processing import OpenclProcessing
-from ..math.fft import FFT
-from ..math.fft.clfft import __have_clfft__
+from ..math.fft.clfft import CLFFT, __have_clfft__
+from ..math.fft.npfft import NPFFT
 from ..image.tomography import generate_powers, get_next_power, compute_fourier_filter
 from ..utils.deprecation import deprecated
 
 
 
 class SinoFilter(OpenclProcessing):
-    """
-    A class for performing sinogram filtering on GPU using OpenCL.
+    """A class for performing sinogram filtering on GPU using OpenCL.
+
     This is a convolution in the Fourier space, along one dimension:
-      - In 2D: (n_a, d_x): n_a filterings (1D FFT of size d_x)
-      - In 3D: (n_z, n_a, d_x): n_z*n_a filterings (1D FFT of size d_x)
+
+    - In 2D: (n_a, d_x): n_a filterings (1D FFT of size d_x)
+    - In 3D: (n_z, n_a, d_x): n_z*n_a filterings (1D FFT of size d_x)
     """
     kernel_files = ["array_utils.cl"]
     powers = generate_powers()
@@ -121,11 +122,10 @@ class SinoFilter(OpenclProcessing):
     def _init_fft(self):
         if __have_clfft__ and not(self.extra_options["use_numpy_fft"]):
             self.fft_backend = "opencl"
-            self.fft = FFT(
+            self.fft = CLFFT(
                 self.sino_padded_shape,
                 dtype=np.float32,
                 axes=(-1,),
-                backend="opencl",
                 ctx=self.ctx,
             )
         else:
@@ -133,10 +133,9 @@ class SinoFilter(OpenclProcessing):
             print("The gpyfft module was not found. The Fourier transforms "
                   "will be done on CPU. For more performances, it is advised "
                   "to install gpyfft.""")
-            self.fft = FFT(
+            self.fft = NPFFT(
                 template=np.zeros(self.sino_padded_shape, "f"),
                 axes=(-1,),
-                backend="numpy",
             )
 
     def _allocate_memory(self):
diff --git a/silx/opencl/sparse.py b/silx/opencl/sparse.py
index 8bfaea8..514589a 100644
--- a/silx/opencl/sparse.py
+++ b/silx/opencl/sparse.py
@@ -35,6 +35,7 @@ import numpy
 import pyopencl.array as parray
 from collections import namedtuple
 from pyopencl.scan import GenericScanKernel
+from pyopencl.tools import dtype_to_ctype
 from .common import pyopencl as cl
 from .processing import OpenclProcessing, EventDescription, BufferDescription
 mf = cl.mem_flags
@@ -52,19 +53,20 @@ def tuple_to_csrdata(arrs):
 
 
 
-# only float32 arrays are supported for now
 class CSR(OpenclProcessing):
     kernel_files = ["sparse.cl"]
 
-    def __init__(self, shape, max_nnz=None, ctx=None, devicetype="all",
-                 platformid=None, deviceid=None, block_size=None, memory=None,
-                 profile=False):
+    def __init__(self, shape, dtype="f", max_nnz=None, idx_dtype=numpy.int32,
+                 ctx=None, devicetype="all", platformid=None, deviceid=None,
+                 block_size=None, memory=None, profile=False):
         """
         Compute Compressed Sparse Row format of an image (2D matrix).
         It is designed to be compatible with scipy.sparse.csr_matrix.
 
         :param shape: tuple
             Matrix shape.
+        :param dtype: str or numpy.dtype, optional
+            Numeric data type. By default, sparse matrix data will be float32.
         :param max_nnz: int, optional
             Maximum number of non-zero elements. By default, the arrays "data"
             and "indices" are allocated with prod(shape) elements, but
@@ -80,8 +82,9 @@ class CSR(OpenclProcessing):
 
         OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
                                   platformid=platformid, deviceid=deviceid,
+                                  block_size=block_size, memory=memory,
                                   profile=profile)
-        self._set_parameters(shape, max_nnz)
+        self._set_parameters(shape, dtype, max_nnz, idx_dtype)
         self._allocate_memory()
         self._setup_kernels()
 
@@ -89,22 +92,47 @@ class CSR(OpenclProcessing):
     # -------------------------- Initialization --------------------------------
     # --------------------------------------------------------------------------
 
-    def _set_parameters(self, shape, max_nnz):
+    def _set_parameters(self, shape, dtype, max_nnz, idx_dtype):
         self.shape = shape
         self.size = numpy.prod(shape)
-        self.indice_dtype = numpy.int32 #
+        self._set_idx_dtype(idx_dtype)
         assert len(shape) == 2 #
         if max_nnz is None:
             self.max_nnz = numpy.prod(shape) # worst case
         else:
             self.max_nnz = int(max_nnz)
+        self._set_dtype(dtype)
+
+
+    def _set_idx_dtype(self, idx_dtype):
+        idx_dtype = numpy.dtype(idx_dtype)
+        if idx_dtype.kind not in ["i", "u"]:
+            raise ValueError("Not an integer type: %s" % idx_dtype)
+        # scan value type must have size divisible by 4 bytes
+        if idx_dtype.itemsize % 4 != 0:
+            raise ValueError("Due to an internal pyopencl limitation, idx_dtype type must have size divisible by 4 bytes")
+        self.indice_dtype = idx_dtype #
+
+
+    def _set_dtype(self, dtype):
+        self.dtype = numpy.dtype(dtype)
+        if self.dtype.kind == "c":
+            raise ValueError("Complex data is not supported")
+        if self.dtype == numpy.dtype(numpy.float32):
+            self._c_zero_str = "0.0f"
+        elif self.dtype == numpy.dtype(numpy.float64):
+            self._c_zero_str = "0.0"
+        else: # assuming integer
+            self._c_zero_str = "0"
+        self.c_dtype = dtype_to_ctype(self.dtype)
+        self.idx_c_dtype = dtype_to_ctype(self.indice_dtype)
 
 
     def _allocate_memory(self):
         self.is_cpu = (self.device.type == "CPU") # move to OpenclProcessing ?
         self.buffers = [
-            BufferDescription("array", (self.size,), numpy.float32, mf.READ_ONLY),
-            BufferDescription("data", (self.max_nnz,), numpy.float32, mf.READ_WRITE),
+            BufferDescription("array", (self.size,), self.dtype, mf.READ_ONLY),
+            BufferDescription("data", (self.max_nnz,), self.dtype, mf.READ_WRITE),
             BufferDescription("indices", (self.max_nnz,), self.indice_dtype, mf.READ_WRITE),
             BufferDescription("indptr", (self.shape[0]+1,), self.indice_dtype, mf.READ_WRITE),
         ]
@@ -124,10 +152,24 @@ class CSR(OpenclProcessing):
 
 
     def _setup_compaction_kernel(self):
+        kernel_signature = str(
+            "__global %s *data, \
+            __global %s *data_compacted, \
+            __global %s *indices, \
+            __global %s* indptr \
+            """ % (self.c_dtype, self.c_dtype, self.idx_c_dtype, self.idx_c_dtype)
+        )
+        if self.dtype.kind == "f":
+            map_nonzero_expr = "(fabs(data[i]) > %s) ? 1 : 0" % self._c_zero_str
+        elif self.dtype.kind in ["u", "i"]:
+            map_nonzero_expr = "(data[i] != %s) ? 1 : 0" % self._c_zero_str
+        else:
+            raise ValueError("Unknown data type")
+
         self.scan_kernel = GenericScanKernel(
             self.ctx, self.indice_dtype,
-            arguments="__global float* data, __global float *data_compacted, __global int *indices, __global int* indptr",
-            input_expr="(fabs(data[i]) > 0.0f) ? 1 : 0",
+            arguments=kernel_signature,
+            input_expr=map_nonzero_expr,
             scan_expr="a+b", neutral="0",
             output_statement="""
                 // item is the running sum of input_expr(i), i.e the cumsum of "nonzero"
@@ -140,7 +182,7 @@ class CSR(OpenclProcessing):
                     indptr[(i/IMAGE_WIDTH)+1] = item;
                 }
                 """,
-            options="-DIMAGE_WIDTH=%d" % self.shape[1],
+            options=["-DIMAGE_WIDTH=%d" % self.shape[1]],
             preamble="#define GET_INDEX(i) (i % IMAGE_WIDTH)",
         )
 
@@ -149,7 +191,11 @@ class CSR(OpenclProcessing):
         OpenclProcessing.compile_kernels(
             self,
             self.kernel_files,
-            compile_options=["-DIMAGE_WIDTH=%d" % self.shape[1]]
+            compile_options=[
+                "-DIMAGE_WIDTH=%d" % self.shape[1],
+                "-DDTYPE=%s" % self.c_dtype,
+                "-DIDX_DTYPE=%s" % self.idx_c_dtype,
+            ]
         )
         device = self.ctx.devices[0]
         wg_x = min(
@@ -174,7 +220,7 @@ class CSR(OpenclProcessing):
             2D array in dense format.
         """
         assert arr.size == self.size
-        assert arr.dtype == numpy.float32
+        assert arr.dtype == self.dtype
 
 
     # TODO handle pyopencl Buffer
@@ -189,10 +235,10 @@ class CSR(OpenclProcessing):
         assert isinstance(csr_data, CSRData)
         for arr in [csr_data.data, csr_data.indices, csr_data.indptr]:
             assert arr.ndim == 1
-        assert csr_data.data.size == self.max_nnz
-        assert csr_data.indices.size == self.max_nnz
+        assert csr_data.data.size <= self.max_nnz
+        assert csr_data.indices.size <= self.max_nnz
         assert csr_data.indptr.size == self.shape[0]+1
-        assert csr_data.data.dtype == numpy.float32
+        assert csr_data.data.dtype == self.dtype
         assert csr_data.indices.dtype == self.indice_dtype
         assert csr_data.indptr.dtype == self.indice_dtype
 
@@ -228,7 +274,7 @@ class CSR(OpenclProcessing):
                 setattr(self, name, arr)
             # The current array is a numpy.ndarray: copy H2D
             elif isinstance(arr, numpy.ndarray):
-                getattr(self, name)[:] = arr[:]
+                getattr(self, name)[:arr.size] = arr[:]
             else:
                 raise ValueError("Unsupported array type: %s" % type(arr))
 
diff --git a/silx/opencl/test/test_addition.py b/silx/opencl/test/test_addition.py
index 3283a4d..49cc0b4 100644
--- a/silx/opencl/test/test_addition.py
+++ b/silx/opencl/test/test_addition.py
@@ -35,7 +35,7 @@ __authors__ = ["Henri Payno, Jérôme Kieffer"]
 __contact__ = "jerome.kieffer@esrf.eu"
 __license__ = "MIT"
 __copyright__ = "2013 European Synchrotron Radiation Facility, Grenoble, France"
-__date__ = "05/07/2018"
+__date__ = "01/08/2019"
 
 import logging
 import numpy
@@ -80,7 +80,8 @@ class TestAddition(unittest.TestCase):
         self.shape = 4096
         self.data = numpy.random.random(self.shape).astype(numpy.float32)
         self.d_array_img = pyopencl.array.to_device(self.queue, self.data)
-        self.d_array_5 = pyopencl.array.zeros_like(self.d_array_img) - 5
+        self.d_array_5 = pyopencl.array.empty_like(self.d_array_img)
+        self.d_array_5.fill(-5)
         self.program = pyopencl.Program(self.ctx, get_opencl_code("addition")).build()
 
     def tearDown(self):
diff --git a/silx/opencl/test/test_convolution.py b/silx/opencl/test/test_convolution.py
index 8acd385..c213808 100644
--- a/silx/opencl/test/test_convolution.py
+++ b/silx/opencl/test/test_convolution.py
@@ -34,12 +34,13 @@ __authors__ = ["Pierre Paleo"]
 __contact__ = "pierre.paleo@esrf.fr"
 __license__ = "MIT"
 __copyright__ = "2019 European Synchrotron Radiation Facility, Grenoble, France"
-__date__ = "15/02/2019"
+__date__ = "01/08/2019"
 
 import logging
 from itertools import product
 import numpy as np
 from silx.utils.testutils import parameterize
+from silx.image.utils import gaussian_kernel
 try:
     from scipy.ndimage import convolve, convolve1d
     from scipy.misc import ascent
@@ -52,7 +53,7 @@ from ..common import ocl
 if ocl:
     import pyopencl as cl
     import pyopencl.array as parray
-    from ..convolution import Convolution, gaussian_kernel
+    from silx.opencl.convolution import Convolution
 logger = logging.getLogger(__name__)
 
 
@@ -178,7 +179,8 @@ class TestConvolution(unittest.TestCase):
         else:
             data_ref = data
         if self.param["output_on_device"]:
-            d_res = parray.zeros_like(conv.data_out)
+            d_res = parray.empty_like(conv.data_out)
+            d_res.fill(0)
             res = d_res
         else:
             res = None
diff --git a/silx/opencl/test/test_kahan.py b/silx/opencl/test/test_kahan.py
index bb9ea3f..167640c 100644
--- a/silx/opencl/test/test_kahan.py
+++ b/silx/opencl/test/test_kahan.py
@@ -34,7 +34,7 @@ __author__ = "Jérôme Kieffer"
 __contact__ = "Jerome.Kieffer@ESRF.eu"
 __license__ = "MIT"
 __copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
-__date__ = "30/01/2019"
+__date__ = "01/08/2019"
 
 
 import unittest
@@ -120,7 +120,8 @@ class TestKahan(unittest.TestCase):
         """
         prg = pyopencl.Program(self.ctx, read_cl_file("kahan.cl") + src).build(self.args)
         ones_d = pyopencl.array.to_device(self.queue, data)
-        res_d = pyopencl.array.zeros(self.queue, 2, numpy.float32)
+        res_d = pyopencl.array.empty(self.queue, 2, numpy.float32)
+        res_d.fill(0)
         evt = prg.summation(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
         evt.wait()
         res = res_d.get().sum(dtype=numpy.float64)
@@ -202,7 +203,8 @@ class TestKahan(unittest.TestCase):
 
         prg = pyopencl.Program(self.ctx, read_cl_file("kahan.cl") + src).build(self.args)
         ones_d = pyopencl.array.to_device(self.queue, data)
-        res_d = pyopencl.array.zeros(self.queue, 2, numpy.float32)
+        res_d = pyopencl.array.empty(self.queue, 2, numpy.float32)
+        res_d.fill(0)
         evt = prg.test_dot16(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
         evt.wait()
         res = res_d.get().sum(dtype="float64")
diff --git a/silx/opencl/test/test_linalg.py b/silx/opencl/test/test_linalg.py
index 7d03983..b72fa20 100644
--- a/silx/opencl/test/test_linalg.py
+++ b/silx/opencl/test/test_linalg.py
@@ -30,7 +30,7 @@ from __future__ import division, print_function
 __authors__ = ["Pierre paleo"]
 __license__ = "MIT"
 __copyright__ = "2013-2017 European Synchrotron Radiation Facility, Grenoble, France"
-__date__ = "14/06/2017"
+__date__ = "01/08/2019"
 
 
 import time
@@ -109,7 +109,8 @@ class TestLinAlg(unittest.TestCase):
         self.div_ref = divergence(self.grad_ref)
         self.image2 = np.zeros_like(self.image)
         # Device images
-        self.gradient_parray = parray.zeros(self.la.queue, self.image.shape, np.complex64)
+        self.gradient_parray = parray.empty(self.la.queue, self.image.shape, np.complex64)
+        self.gradient_parray.fill(0)
         # we should be using cl.Buffer(self.la.ctx, cl.mem_flags.READ_WRITE, size=self.image.nbytes*2),
         # but platforms not suporting openCL 1.2 have a problem with enqueue_fill_buffer,
         # so we use the parray "fill" utility
diff --git a/silx/opencl/test/test_sparse.py b/silx/opencl/test/test_sparse.py
index 56f1ba4..76a6a0a 100644
--- a/silx/opencl/test/test_sparse.py
+++ b/silx/opencl/test/test_sparse.py
@@ -82,39 +82,45 @@ class TestCSR(unittest.TestCase):
     """Test CSR format"""
 
     def setUp(self):
-        self.array = generate_sparse_random_data(shape=(512, 511))
-        # Compute reference sparsification
-        a_s = sp.csr_matrix(self.array)
-        self.ref_data = a_s.data
-        self.ref_indices = a_s.indices
-        self.ref_indptr = a_s.indptr
-        self.ref_nnz = a_s.nnz
         # Test possible configurations
         input_on_device = [False, True]
         output_on_device = [False, True]
-        self._test_configs = list(product(input_on_device, output_on_device))
+        dtypes = [np.float32, np.int32, np.uint16]
+        self._test_configs = list(product(input_on_device, output_on_device, dtypes))
+
+
+    def compute_ref_sparsification(self, array):
+        ref_sparse = sp.csr_matrix(array)
+        return ref_sparse
 
 
     def test_sparsification(self):
-        for input_on_device, output_on_device in self._test_configs:
-            self._test_sparsification(input_on_device, output_on_device)
+        for input_on_device, output_on_device, dtype in self._test_configs:
+            self._test_sparsification(input_on_device, output_on_device, dtype)
 
 
-    def _test_sparsification(self, input_on_device, output_on_device):
-        current_config = "input on device: %s, output on device: %s" % (
-            str(input_on_device), str(output_on_device)
+    def _test_sparsification(self, input_on_device, output_on_device, dtype):
+        current_config = "input on device: %s, output on device: %s, dtype: %s" % (
+            str(input_on_device), str(output_on_device), str(dtype)
         )
+        logger.debug("CSR: %s" % current_config)
+        # Generate data and reference CSR
+        array = generate_sparse_random_data(shape=(512, 511), dtype=dtype)
+        ref_sparse = self.compute_ref_sparsification(array)
         # Sparsify on device
-        csr = CSR(self.array.shape)
+        csr = CSR(array.shape, dtype=dtype)
         if input_on_device:
             # The array has to be flattened
-            arr = parray.to_device(csr.queue, self.array.ravel())
+            arr = parray.to_device(csr.queue, array.ravel())
         else:
-            arr = self.array
+            arr = array
         if output_on_device:
-            d_data = parray.zeros_like(csr.data)
-            d_indices = parray.zeros_like(csr.indices)
-            d_indptr = parray.zeros_like(csr.indptr)
+            d_data = parray.empty_like(csr.data)
+            d_indices = parray.empty_like(csr.indices)
+            d_indptr = parray.empty_like(csr.indptr)
+            d_data.fill(0)
+            d_indices.fill(0)
+            d_indptr.fill(0)
             output = (d_data, d_indices, d_indptr)
         else:
             output = None
@@ -124,45 +130,50 @@ class TestCSR(unittest.TestCase):
             indices = indices.get()
             indptr = indptr.get()
         # Compare
-        nnz = self.ref_nnz
+        nnz = ref_sparse.nnz
         self.assertTrue(
-            np.allclose(data[:nnz], self.ref_data),
+            np.allclose(data[:nnz], ref_sparse.data),
             "something wrong with sparsified data (%s)"
             % current_config
         )
         self.assertTrue(
-            np.allclose(indices[:nnz], self.ref_indices),
+            np.allclose(indices[:nnz], ref_sparse.indices),
             "something wrong with sparsified indices (%s)"
             % current_config
         )
         self.assertTrue(
-            np.allclose(indptr, self.ref_indptr),
+            np.allclose(indptr, ref_sparse.indptr),
             "something wrong with sparsified indices pointers (indptr) (%s)"
             % current_config
         )
 
 
     def test_desparsification(self):
-        for input_on_device, output_on_device in self._test_configs:
-            self._test_desparsification(input_on_device, output_on_device)
+        for input_on_device, output_on_device, dtype in self._test_configs:
+            self._test_desparsification(input_on_device, output_on_device, dtype)
 
 
-    def _test_desparsification(self, input_on_device, output_on_device):
-        current_config = "input on device: %s, output on device: %s" % (
-            str(input_on_device), str(output_on_device)
+    def _test_desparsification(self, input_on_device, output_on_device, dtype):
+        current_config = "input on device: %s, output on device: %s, dtype: %s" % (
+            str(input_on_device), str(output_on_device), str(dtype)
         )
+        logger.debug("CSR: %s" % current_config)
+        # Generate data and reference CSR
+        array = generate_sparse_random_data(shape=(512, 511), dtype=dtype)
+        ref_sparse = self.compute_ref_sparsification(array)
         # De-sparsify on device
-        csr = CSR(self.array.shape, max_nnz=self.ref_nnz)
+        csr = CSR(array.shape, dtype=dtype, max_nnz=ref_sparse.nnz)
         if input_on_device:
-            data = parray.to_device(csr.queue, self.ref_data)
-            indices = parray.to_device(csr.queue, self.ref_indices)
-            indptr = parray.to_device(csr.queue, self.ref_indptr)
+            data = parray.to_device(csr.queue, ref_sparse.data)
+            indices = parray.to_device(csr.queue, ref_sparse.indices)
+            indptr = parray.to_device(csr.queue, ref_sparse.indptr)
         else:
-            data = self.ref_data
-            indices = self.ref_indices
-            indptr = self.ref_indptr
+            data = ref_sparse.data
+            indices = ref_sparse.indices
+            indptr = ref_sparse.indptr
         if output_on_device:
-            d_arr = parray.zeros_like(csr.array)
+            d_arr = parray.empty_like(csr.array)
+            d_arr.fill(0)
             output = d_arr
         else:
             output = None
@@ -171,7 +182,7 @@ class TestCSR(unittest.TestCase):
             arr = arr.get()
         # Compare
         self.assertTrue(
-            np.allclose(arr.reshape(self.array.shape), self.array),
+            np.allclose(arr.reshape(array.shape), array),
             "something wrong with densified data (%s)"
             % current_config
         )
diff --git a/silx/opencl/utils.py b/silx/opencl/utils.py
index f558c1b..575e018 100644
--- a/silx/opencl/utils.py
+++ b/silx/opencl/utils.py
@@ -119,3 +119,96 @@ def concatenate_cl_kernel(filenames):
     this method concatenates all the kernel from the list
     """
     return os.linesep.join(read_cl_file(fn) for fn in filenames)
+
+
+
+
+class ConvolutionInfos(object):
+    allowed_axes = {
+        "1D": [None],
+        "separable_2D_1D_2D": [None, (0, 1), (1, 0)],
+        "batched_1D_2D": [(0,), (1,)],
+        "separable_3D_1D_3D": [
+            None,
+            (0, 1, 2),
+            (1, 2, 0),
+            (2, 0, 1),
+            (2, 1, 0),
+            (1, 0, 2),
+            (0, 2, 1)
+        ],
+        "batched_1D_3D": [(0,), (1,), (2,)],
+        "batched_separable_2D_1D_3D": [(0,), (1,), (2,)], # unsupported (?)
+        "2D": [None],
+        "batched_2D_3D": [(0,), (1,), (2,)],
+        "separable_3D_2D_3D": [
+            (1, 0),
+            (0, 1),
+            (2, 0),
+            (0, 2),
+            (1, 2),
+            (2, 1),
+        ],
+        "3D": [None],
+    }
+    use_cases = {
+        (1, 1): {
+            "1D": {
+                "name": "1D convolution on 1D data",
+                "kernels": ["convol_1D_X"],
+            },
+        },
+        (2, 2): {
+            "2D": {
+                "name": "2D convolution on 2D data",
+                "kernels": ["convol_2D_XY"],
+            },
+        },
+        (3, 3): {
+            "3D": {
+                "name": "3D convolution on 3D data",
+                "kernels": ["convol_3D_XYZ"],
+            },
+        },
+        (2, 1): {
+            "separable_2D_1D_2D": {
+                "name": "Separable (2D->1D) convolution on 2D data",
+                "kernels": ["convol_1D_X", "convol_1D_Y"],
+            },
+            "batched_1D_2D": {
+                "name": "Batched 1D convolution on 2D data",
+                "kernels": ["convol_1D_X", "convol_1D_Y"],
+            },
+        },
+        (3, 1): {
+            "separable_3D_1D_3D": {
+                "name": "Separable (3D->1D) convolution on 3D data",
+                "kernels": ["convol_1D_X", "convol_1D_Y", "convol_1D_Z"],
+            },
+            "batched_1D_3D": {
+                "name": "Batched 1D convolution on 3D data",
+                "kernels": ["convol_1D_X", "convol_1D_Y", "convol_1D_Z"],
+            },
+            "batched_separable_2D_1D_3D": {
+                "name": "Batched separable (2D->1D) convolution on 3D data",
+                "kernels": ["convol_1D_X", "convol_1D_Y", "convol_1D_Z"],
+            },
+        },
+        (3, 2): {
+            "separable_3D_2D_3D": {
+                "name": "Separable (3D->2D) convolution on 3D data",
+                "kernels": ["convol_2D_XY", "convol_2D_XZ", "convol_2D_YZ"],
+            },
+            "batched_2D_3D": {
+                "name": "Batched 2D convolution on 3D data",
+                "kernels": ["convol_2D_XY", "convol_2D_XZ", "convol_2D_YZ"],
+            },
+        },
+    }
+
+
+
+
+
+
+