10 files changed, 1507 insertions, 184 deletions
diff --git a/silx/opencl/backprojection.py b/silx/opencl/backprojection.py
index c257872..5a4087b 100644
--- a/silx/opencl/backprojection.py
+++ b/silx/opencl/backprojection.py
@@ -154,6 +154,9 @@ class Backprojection(OpenclProcessing):
         self.extra_options = {
             "cutoff": 1.,
             "use_numpy_fft": False,
+            # It is  axis_pos - (num_bins-1)/2  in PyHST
+            "gpu_offset_x": 0., #self.axis_pos - (self.num_bins - 1) / 2.,
+            "gpu_offset_y": 0., #self.axis_pos - (self.num_bins - 1) / 2.
         }
         if extra_options is not None:
             self.extra_options.update(extra_options)
@@ -221,10 +224,10 @@ class Backprojection(OpenclProcessing):
             self.cl_mem["_d_slice"].data,
             # d_sino (__read_only image2d_t or float*)
             d_sino_ref,
-            # gpu_offset_x (float32) # TODO custom ?
-            -np.float32((self.num_bins - 1) / 2. - self.axis_pos),
-            # gpu_offset_y (float32) # TODO custom ?
-            -np.float32((self.num_bins - 1) / 2. - self.axis_pos),
+            # gpu_offset_x (float32) 
+            np.float32(self.extra_options["gpu_offset_x"]),
+            # gpu_offset_y (float32)
+            np.float32(self.extra_options["gpu_offset_y"]),
             # d_cos (__global float32*)
             self.cl_mem["d_cos"].data,
             # d_sin (__global float32*)
diff --git a/silx/opencl/common.py b/silx/opencl/common.py
index 9a04035..8d31c8a 100644
--- a/silx/opencl/common.py
+++ b/silx/opencl/common.py
@@ -34,7 +34,7 @@ __author__ = "Jerome Kieffer"
 __contact__ = "Jerome.Kieffer@ESRF.eu"
 __license__ = "MIT"
 __copyright__ = "2012-2017 European Synchrotron Radiation Facility, Grenoble, France"
-__date__ = "20/09/2018"
+__date__ = "07/06/2019"
 __status__ = "stable"
 __all__ = ["ocl", "pyopencl", "mf", "release_cl_buffers", "allocate_cl_buffers",
            "measure_workgroup_size", "kernel_workgroup_size"]
@@ -158,6 +158,11 @@ class Device(object):
                "Available\t:\t%s" % self.available]
         return os.linesep.join(lst)
 
+    def set_unavailable(self):
+        """Use this method to flag a faulty device
+        """
+        self.available = False
+
 
 class Platform(object):
     """
@@ -220,8 +225,16 @@ def _measure_workgroup_size(device_or_context, fast=False):
     :return: maximum size for the workgroup
     """
     if isinstance(device_or_context, pyopencl.Device):
-        ctx = pyopencl.Context(devices=[device_or_context])
-        device = device_or_context
+        try:
+            ctx = pyopencl.Context(devices=[device_or_context])
+        except pyopencl._cl.LogicError as error:
+            platform = device_or_context.platform
+            platformid = pyopencl.get_platforms().index(platform)
+            deviceid = platform.get_devices().index(device_or_context)
+            ocl.platforms[platformid].devices[deviceid].set_unavailable()
+            raise RuntimeError("Unable to create context on %s/%s: %s" % (platform, device_or_context, error))
+        else:
+            device = device_or_context
     elif isinstance(device_or_context, pyopencl.Context):
         ctx = device_or_context
         device = device_or_context.devices[0]
@@ -379,6 +392,8 @@ class OpenCL(object):
         best_found = None
         for platformid, platform in enumerate(self.platforms):
             for deviceid, device in enumerate(platform.devices):
+                if not device.available:
+                    continue
                 if (dtype in ["ALL", "DEF"]) or (device.type == dtype):
                     if (memory is None) or (memory <= device.memory):
                         found = True
@@ -400,7 +415,7 @@ class OpenCL(object):
         return None
 
     def create_context(self, devicetype="ALL", useFp64=False, platformid=None,
-                       deviceid=None, cached=True, memory=None):
+                       deviceid=None, cached=True, memory=None, extensions=None):
         """
         Choose a device and initiate a context.
 
@@ -410,13 +425,20 @@ class OpenCL(object):
         E.g.: If Nvidia driver is installed, GPU will succeed but CPU will fail.
               The AMD SDK kit is required for CPU via OpenCL.
         :param devicetype: string in ["cpu","gpu", "all", "acc"]
-        :param useFp64: boolean specifying if double precision will be used
+        :param useFp64: boolean specifying if double precision will be used: deprecated use extensions=["cl_khr_fp64"]
         :param platformid: integer
         :param deviceid: integer
         :param cached: True if we want to cache the context
         :param memory: minimum amount of memory of the device
+        :param extensions: list of extensions to be present
         :return: OpenCL context on the selected device
         """
+        if extensions is None:
+            extensions = []
+        if useFp64:
+            logger.warning("Deprecation: please select your device using the extension name!, i.e. extensions=['cl_khr_fp64']")
+            extensions.append('cl_khr_fp64')
+
         if (platformid is not None) and (deviceid is not None):
             platformid = int(platformid)
             deviceid = int(deviceid)
@@ -425,20 +447,24 @@ class OpenCL(object):
             pyopencl_ctx += [0] * (2 - len(pyopencl_ctx))  # pad with 0
             platformid, deviceid = pyopencl_ctx
         else:
-            if useFp64:
-                ids = ocl.select_device(type=devicetype, extensions=["cl_khr_int64_base_atomics"])
-            else:
-                ids = ocl.select_device(dtype=devicetype)
+            ids = ocl.select_device(type=devicetype, extensions=extensions)
             if ids:
                 platformid, deviceid = ids
+        ctx = None
         if (platformid is not None) and (deviceid is not None):
             if (platformid, deviceid) in self.context_cache:
                 ctx = self.context_cache[(platformid, deviceid)]
             else:
-                ctx = pyopencl.Context(devices=[pyopencl.get_platforms()[platformid].get_devices()[deviceid]])
-                if cached:
-                    self.context_cache[(platformid, deviceid)] = ctx
-        else:
+                try:
+                    ctx = pyopencl.Context(devices=[pyopencl.get_platforms()[platformid].get_devices()[deviceid]])
+                except pyopencl._cl.LogicError as error:
+                    self.platforms[platformid].devices[deviceid].set_unavailable()
+                    logger.warning("Unable to create context on %s/%s: %s", platformid, deviceid, error)
+                    ctx = None
+                else:
+                    if cached:
+                        self.context_cache[(platformid, deviceid)] = ctx
+        if ctx is None:
             logger.warning("Last chance to get an OpenCL device ... probably not the one requested")
             ctx = pyopencl.create_some_context(interactive=False)
         return ctx
diff --git a/silx/opencl/convolution.py b/silx/opencl/convolution.py
new file mode 100644
index 0000000..18232f4
--- /dev/null
+++ b/silx/opencl/convolution.py
@@ -0,0 +1,555 @@
+#!/usr/bin/env python
+# coding: utf-8
+# /*##########################################################################
+#
+# Copyright (c) 2019 European Synchrotron Radiation Facility
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+# ###########################################################################*/
+"""Module for convolution on CPU/GPU."""
+
+from __future__ import absolute_import, print_function, with_statement, division
+
+__authors__ = ["P. Paleo"]
+__license__ = "MIT"
+__date__ = "11/02/2019"
+
+import numpy as np
+from math import ceil
+from copy import copy  # python2
+from .common import pyopencl as cl
+import pyopencl.array as parray
+from .processing import OpenclProcessing, EventDescription
+
+
+class ConvolutionInfos(object):
+    allowed_axes = {
+        "1D": [None],
+        "separable_2D_1D_2D": [None, (0, 1), (1, 0)],
+        "batched_1D_2D": [(0,), (1,)],
+        "separable_3D_1D_3D": [
+            None,
+            (0, 1, 2),
+            (1, 2, 0),
+            (2, 0, 1),
+            (2, 1, 0),
+            (1, 0, 2),
+            (0, 2, 1)
+        ],
+        "batched_1D_3D": [(0,), (1,), (2,)],
+        "batched_separable_2D_1D_3D": [(0,), (1,), (2,)], # unsupported (?)
+        "2D": [None],
+        "batched_2D_3D": [(0,), (1,), (2,)],
+        "separable_3D_2D_3D": [
+            (1, 0),
+            (0, 1),
+            (2, 0),
+            (0, 2),
+            (1, 2),
+            (2, 1),
+        ],
+        "3D": [None],
+    }
+    use_cases = {
+        (1, 1): {
+            "1D": {
+                "name": "1D convolution on 1D data",
+                "kernels": ["convol_1D_X"],
+            },
+        },
+        (2, 2): {
+            "2D": {
+                "name": "2D convolution on 2D data",
+                "kernels": ["convol_2D_XY"],
+            },
+        },
+        (3, 3): {
+            "3D": {
+                "name": "3D convolution on 3D data",
+                "kernels": ["convol_3D_XYZ"],
+            },
+        },
+        (2, 1): {
+            "separable_2D_1D_2D": {
+                "name": "Separable (2D->1D) convolution on 2D data",
+                "kernels": ["convol_1D_X", "convol_1D_Y"],
+            },
+            "batched_1D_2D": {
+                "name": "Batched 1D convolution on 2D data",
+                "kernels": ["convol_1D_X", "convol_1D_Y"],
+            },
+        },
+        (3, 1): {
+            "separable_3D_1D_3D": {
+                "name": "Separable (3D->1D) convolution on 3D data",
+                "kernels": ["convol_1D_X", "convol_1D_Y", "convol_1D_Z"],
+            },
+            "batched_1D_3D": {
+                "name": "Batched 1D convolution on 3D data",
+                "kernels": ["convol_1D_X", "convol_1D_Y", "convol_1D_Z"],
+            },
+            "batched_separable_2D_1D_3D": {
+                "name": "Batched separable (2D->1D) convolution on 3D data",
+                "kernels": ["convol_1D_X", "convol_1D_Y", "convol_1D_Z"],
+            },
+        },
+        (3, 2): {
+            "separable_3D_2D_3D": {
+                "name": "Separable (3D->2D) convolution on 3D data",
+                "kernels": ["convol_2D_XY", "convol_2D_XZ", "convol_2D_YZ"],
+            },
+            "batched_2D_3D": {
+                "name": "Batched 2D convolution on 3D data",
+                "kernels": ["convol_2D_XY", "convol_2D_XZ", "convol_2D_YZ"],
+            },
+        },
+    }
+
+
+class Convolution(OpenclProcessing):
+    """
+    A class for performing convolution on CPU/GPU with OpenCL.
+    """
+
+    def __init__(self, shape, kernel, axes=None, mode=None, ctx=None,
+                 devicetype="all", platformid=None, deviceid=None,
+                 profile=False, extra_options=None):
+        """Constructor of OpenCL Convolution.
+
+        :param shape: shape of the array.
+        :param kernel: convolution kernel (1D, 2D or 3D).
+        :param axes: axes along which the convolution is performed,
+            for batched convolutions.
+        :param mode: Boundary handling mode. Available modes are:
+            "reflect": cba|abcd|dcb
+            "nearest": aaa|abcd|ddd
+            "wrap": bcd|abcd|abc
+            "constant": 000|abcd|000
+            Default is "reflect".
+        :param ctx: actual working context, left to None for automatic
+                    initialization from device type or platformid/deviceid
+        :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL"
+        :param platformid: integer with the platform_identifier, as given by
+                           clinfo
+        :param deviceid: Integer with the device identifier, as given by clinfo
+        :param profile: switch on profiling to be able to profile at the kernel
+                        level, store profiling elements (makes code slightly
+                        slower)
+        :param extra_options: Advanced options (dict). Current options are:
+            "allocate_input_array": True,
+            "allocate_output_array": True,
+            "allocate_tmp_array": True,
+            "dont_use_textures": False,
+        """
+        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
+                                  platformid=platformid, deviceid=deviceid,
+                                  profile=profile)
+
+        self._configure_extra_options(extra_options)
+        self._determine_use_case(shape, kernel, axes)
+        self._allocate_memory(mode)
+        self._init_kernels()
+
+    def _configure_extra_options(self, extra_options):
+        self.extra_options = {
+            "allocate_input_array": True,
+            "allocate_output_array": True,
+            "allocate_tmp_array": True,
+            "dont_use_textures": False,
+        }
+        extra_opts = extra_options or {}
+        self.extra_options.update(extra_opts)
+        self.is_cpu = (self.device.type == "CPU")
+        self.use_textures = not(self.extra_options["dont_use_textures"])
+        self.use_textures *= not(self.is_cpu)
+
+    def _get_dimensions(self, shape, kernel):
+        self.shape = shape
+        self.data_ndim = self._check_dimensions(shape=shape, name="Data")
+        self.kernel_ndim = self._check_dimensions(arr=kernel, name="Kernel")
+        Nx = shape[-1]
+        if self.data_ndim >= 2:
+            Ny = shape[-2]
+        else:
+            Ny = 1
+        if self.data_ndim >= 3:
+            Nz = shape[-3]
+        else:
+            Nz = 1
+        self.Nx = np.int32(Nx)
+        self.Ny = np.int32(Ny)
+        self.Nz = np.int32(Nz)
+
+    def _determine_use_case(self, shape, kernel, axes):
+        """
+        Determine the convolution use case from the input/kernel shape, and axes.
+        """
+        self._get_dimensions(shape, kernel)
+        if self.kernel_ndim > self.data_ndim:
+            raise ValueError("Kernel dimensions cannot exceed data dimensions")
+        data_ndim = self.data_ndim
+        kernel_ndim = self.kernel_ndim
+        self.kernel = kernel.astype("f")
+
+        convol_infos = ConvolutionInfos()
+        k = (data_ndim, kernel_ndim)
+        if k not in convol_infos.use_cases:
+            raise ValueError(
+                "Cannot find a use case for data ndim = %d and kernel ndim = %d"
+                % (data_ndim, kernel_ndim)
+            )
+        possible_use_cases = convol_infos.use_cases[k]
+
+        self.use_case_name = None
+        for uc_name, uc_params in possible_use_cases.items():
+            if axes in convol_infos.allowed_axes[uc_name]:
+                self.use_case_name = uc_name
+                self.use_case_desc = uc_params["name"]
+                #~ self.use_case_kernels = uc_params["kernels"].copy()
+                self.use_case_kernels = copy(uc_params["kernels"]) # TODO use the above line once we get rid of python2
+        if self.use_case_name is None:
+            raise ValueError(
+                "Cannot find a use case for data ndim = %d, kernel ndim = %d and axes=%s"
+                % (data_ndim, kernel_ndim, str(axes))
+            )
+        # TODO implement this use case
+        if self.use_case_name == "batched_separable_2D_1D_3D":
+            raise NotImplementedError(
+                "The use case %s is not implemented"
+                % self.use_case_name
+            )
+        #
+        self.axes = axes
+        # Replace "axes=None" with an actual value (except for ND-ND)
+        allowed_axes = convol_infos.allowed_axes[self.use_case_name]
+        if len(allowed_axes) > 1:
+            # The default choice might impact perfs
+            self.axes = allowed_axes[0] or allowed_axes[1]
+        self.separable = self.use_case_name.startswith("separable")
+        self.batched = self.use_case_name.startswith("batched")
+        # Update kernel names when using textures
+        if self.use_textures:
+            for i, kern_name in enumerate(self.use_case_kernels):
+                self.use_case_kernels[i] = kern_name + "_tex"
+
+    def _allocate_memory(self, mode):
+        self.mode = mode or "reflect"
+        option_array_names = {
+            "allocate_input_array": "data_in",
+            "allocate_output_array": "data_out",
+            "allocate_tmp_array": "data_tmp",
+        }
+        # Nonseparable transforms do not need tmp array
+        if not(self.separable):
+            self.extra_options["allocate_tmp_array"] = False
+        # Allocate arrays
+        for option_name, array_name in option_array_names.items():
+            if self.extra_options[option_name]:
+                value = parray.zeros(self.queue, self.shape, "f")
+            else:
+                value = None
+            setattr(self, array_name, value)
+
+        if isinstance(self.kernel, np.ndarray):
+            self.d_kernel = parray.to_device(self.queue, self.kernel)
+        else:
+            if not(isinstance(self.kernel, parray.Array)):
+                raise ValueError("kernel must be either numpy array or pyopencl array")
+            self.d_kernel = self.kernel
+        self._old_input_ref = None
+        self._old_output_ref = None
+        if self.use_textures:
+            self._allocate_textures()
+        self._c_modes_mapping = {
+            "periodic": 2,
+            "wrap": 2,
+            "nearest": 1,
+            "replicate": 1,
+            "reflect": 0,
+            "constant": 3,
+        }
+        mp = self._c_modes_mapping
+        if self.mode.lower() not in mp:
+            raise ValueError(
+                """
+                Mode %s is not available for textures. Available modes are:
+                %s
+                """
+                % (self.mode, str(mp.keys()))
+            )
+        # TODO
+        if not(self.use_textures) and self.mode.lower() == "constant":
+            raise NotImplementedError(
+                "mode='constant' is not implemented without textures yet"
+            )
+        #
+        self._c_conv_mode = mp[self.mode]
+
+    def _allocate_textures(self):
+        self.data_in_tex = self.allocate_texture(self.shape)
+        self.d_kernel_tex = self.allocate_texture(self.kernel.shape)
+        self.transfer_to_texture(self.d_kernel, self.d_kernel_tex)
+
+    def _init_kernels(self):
+        if self.kernel_ndim > 1:
+            if np.abs(np.diff(self.kernel.shape)).max() > 0:
+                raise NotImplementedError(
+                    "Non-separable convolution with non-square kernels is not implemented yet"
+                )
+        compile_options = [str("-DUSED_CONV_MODE=%d" % self._c_conv_mode)]
+        if self.use_textures:
+            kernel_files = ["convolution_textures.cl"]
+            compile_options.extend([
+                str("-DIMAGE_DIMS=%d" % self.data_ndim),
+                str("-DFILTER_DIMS=%d" % self.kernel_ndim),
+            ])
+            data_in_ref = self.data_in_tex
+            d_kernel_ref = self.d_kernel_tex
+        else:
+            kernel_files = ["convolution.cl"]
+            data_in_ref = self.data_in.data
+            d_kernel_ref = self.d_kernel.data
+        self.compile_kernels(
+            kernel_files=kernel_files,
+            compile_options=compile_options
+        )
+        self.ndrange = self.shape[::-1]
+        self.wg = None
+        kernel_args = [
+            self.queue,
+            self.ndrange, self.wg,
+            data_in_ref,
+            self.data_out.data,
+            d_kernel_ref,
+            np.int32(self.kernel.shape[0]),
+            self.Nx, self.Ny, self.Nz
+        ]
+        if self.kernel_ndim == 2:
+            kernel_args.insert(6, np.int32(self.kernel.shape[1]))
+        if self.kernel_ndim == 3:
+            kernel_args.insert(6, np.int32(self.kernel.shape[2]))
+            kernel_args.insert(7, np.int32(self.kernel.shape[1]))
+        self.kernel_args = tuple(kernel_args)
+        # If self.data_tmp is allocated, separable transforms can be performed
+        # by a series of batched transforms, without any copy, by swapping refs.
+        self.swap_pattern = None
+        if self.separable:
+            if self.data_tmp is not None:
+                self.swap_pattern = {
+                    2: [
+                        ("data_in", "data_tmp"),
+                        ("data_tmp", "data_out")
+                    ],
+                    3: [
+                        ("data_in", "data_out"),
+                        ("data_out", "data_tmp"),
+                        ("data_tmp", "data_out"),
+                    ],
+                }
+            else:
+                # TODO
+                raise NotImplementedError("For now, data_tmp has to be allocated")
+
+    def _get_swapped_arrays(self, i):
+        """
+        Get the input and output arrays to use when using a "swap pattern".
+        Swapping refs enables to avoid copies between temp. array and output.
+        For example, a separable 2D->1D convolution on 2D data reads:
+          data_tmp = convol(data_input, kernel, axis=1) # step i=0
+          data_out = convol(data_tmp, kernel, axis=0) # step i=1
+
+        :param i: current step number of the separable convolution
+        """
+        if self.use_textures:
+            # copy is needed when using texture, as data_out is a Buffer
+            if i > 0:
+                self.transfer_to_texture(self.data_out, self.data_in_tex)
+            return self.data_in_tex, self.data_out
+        n_batchs = len(self.axes)
+        in_ref, out_ref = self.swap_pattern[n_batchs][i]
+        d_in = getattr(self, in_ref)
+        d_out = getattr(self, out_ref)
+        return d_in, d_out
+
+    def _configure_kernel_args(self, opencl_kernel_args, input_ref, output_ref):
+        # TODO more elegant
+        if isinstance(input_ref, parray.Array):
+            input_ref = input_ref.data
+        if isinstance(output_ref, parray.Array):
+            output_ref = output_ref.data
+        if input_ref is not None or output_ref is not None:
+            opencl_kernel_args = list(opencl_kernel_args)
+            if input_ref is not None:
+                opencl_kernel_args[3] = input_ref
+            if output_ref is not None:
+                opencl_kernel_args[4] = output_ref
+            opencl_kernel_args = tuple(opencl_kernel_args)
+        return opencl_kernel_args
+
+    @staticmethod
+    def _check_dimensions(arr=None, shape=None, name="", dim_min=1, dim_max=3):
+        if shape is not None:
+            ndim = len(shape)
+        elif arr is not None:
+            ndim = arr.ndim
+        else:
+            raise ValueError("Please provide either arr= or shape=")
+        if ndim < dim_min or ndim > dim_max:
+            raise ValueError("%s dimensions should be between %d and %d"
+                % (name, dim_min, dim_max)
+            )
+        return ndim
+
+    def _check_array(self, arr):
+        # TODO allow cl.Buffer
+        if not(isinstance(arr, parray.Array) or isinstance(arr, np.ndarray)):
+            raise TypeError("Expected either pyopencl.array.Array or numpy.ndarray")
+        # TODO composition with ImageProcessing/cast
+        if arr.dtype != np.float32:
+            raise TypeError("Data must be float32")
+        if arr.shape != self.shape:
+            raise ValueError("Expected data shape = %s" % str(self.shape))
+
+    def _set_arrays(self, array, output=None):
+        # When using textures: copy
+        if self.use_textures:
+            self.transfer_to_texture(array, self.data_in_tex)
+            data_in_ref = self.data_in_tex
+        else:
+            # Otherwise: copy H->D or update references.
+            if isinstance(array, np.ndarray):
+                self.data_in[:] = array[:]
+            else:
+                self._old_input_ref = self.data_in
+                self.data_in = array
+            data_in_ref = self.data_in
+        if output is not None:
+            if not(isinstance(output, np.ndarray)):
+                self._old_output_ref = self.data_out
+                self.data_out = output
+        # Update OpenCL kernel arguments with new array references
+        self.kernel_args = self._configure_kernel_args(
+            self.kernel_args,
+            data_in_ref,
+            self.data_out
+        )
+
+    def _separable_convolution(self):
+        assert len(self.axes) == len(self.use_case_kernels)
+        # Separable: one kernel call per data dimension
+        for i, axis in enumerate(self.axes):
+            in_ref, out_ref = self._get_swapped_arrays(i)
+            self._batched_convolution(axis, input_ref=in_ref, output_ref=out_ref)
+
+    def _batched_convolution(self, axis, input_ref=None, output_ref=None):
+        # Batched: one kernel call in total
+        opencl_kernel = self.kernels.get_kernel(self.use_case_kernels[axis])
+        opencl_kernel_args = self._configure_kernel_args(
+            self.kernel_args,
+            input_ref,
+            output_ref
+        )
+        ev = opencl_kernel(*opencl_kernel_args)
+        if self.profile:
+            self.events.append(EventDescription("batched convolution", ev))
+
+    def _nd_convolution(self):
+        assert len(self.use_case_kernels) == 1
+        opencl_kernel = self.kernels.get_kernel(self.use_case_kernels[0])
+        ev = opencl_kernel(*self.kernel_args)
+        if self.profile:
+            self.events.append(EventDescription("ND convolution", ev))
+
+    def _recover_arrays_references(self):
+        if self._old_input_ref is not None:
+            self.data_in = self._old_input_ref
+            self._old_input_ref = None
+        if self._old_output_ref is not None:
+            self.data_out = self._old_output_ref
+            self._old_output_ref = None
+        self.kernel_args = self._configure_kernel_args(
+            self.kernel_args,
+            self.data_in,
+            self.data_out
+        )
+
+    def _get_output(self, output):
+        if output is None:
+            res = self.data_out.get()
+        else:
+            res = output
+            if isinstance(output, np.ndarray):
+                output[:] = self.data_out[:]
+        self._recover_arrays_references()
+        return res
+
+    def convolve(self, array, output=None):
+        """
+        Convolve an array with the class kernel.
+
+        :param array: Input array. Can be numpy.ndarray or pyopencl.array.Array.
+        :param output: Output array. Can be numpy.ndarray or pyopencl.array.Array.
+        """
+        self._check_array(array)
+        self._set_arrays(array, output=output)
+        if self.axes is not None:
+            if self.separable:
+                self._separable_convolution()
+            elif self.batched:
+                assert len(self.axes) == 1
+                self._batched_convolution(self.axes[0])
+            # else: ND-ND convol
+        else:
+            # ND-ND convol
+            self._nd_convolution()
+
+        res = self._get_output(output)
+        return res
+
+
+    __call__ = convolve
+
+
+def gaussian_kernel(sigma, cutoff=4, force_odd_size=False):
+    """
+    Generates a Gaussian convolution kernel.
+
+    :param sigma: Standard Deviation of the Gaussian curve.
+    :param cutoff: Parameter tuning the truncation of the Gaussian.
+        The higher cutoff, the biggest the array will be (and the closest to
+        a "true" Gaussian function).
+    :param force_odd_size: when set to True, the resulting array will always
+        have an odd size, regardless of the values of "sigma" and "cutoff".
+    :return: a numpy.ndarray containing the truncated Gaussian function.
+        The array size is 2*c*s+1 where c=cutoff, s=sigma.
+
+    Nota: due to the quick decay of the Gaussian function, small values of the
+        "cutoff" parameter are usually fine. The energy difference between a
+        Gaussian truncated to [-c, c] and a "true" one is
+            erfc(c/(sqrt(2)*s))
+        so choosing cutoff=4*sigma keeps the truncation error below 1e-4.
+    """
+    size = int(ceil(2 * cutoff * sigma + 1))
+    if force_odd_size and size % 2 == 0:
+        size += 1
+    x = np.arange(size) - (size - 1.0) / 2.0
+    g = np.exp(-(x / sigma) ** 2 / 2.0)
+    g /= g.sum()
+    return g
diff --git a/silx/opencl/processing.py b/silx/opencl/processing.py
index 045a9d3..707aa72 100644
--- a/silx/opencl/processing.py
+++ b/silx/opencl/processing.py
@@ -168,7 +168,7 @@ class OpenclProcessing(object):
                         paramatrized buffers.
         :param use_array: allocate memory as pyopencl.array.Array
                             instead of pyopencl.Buffer
-        
+
         Note that an OpenCL context also requires some memory, as well
         as Event and other OpenCL functionalities which cannot and are
         not taken into account here.  The memory required by a context
@@ -290,6 +290,59 @@ class OpenclProcessing(object):
                 else:
                     self.queue = pyopencl.CommandQueue(self.ctx)
 
+    def profile_add(self, event, desc):
+        """
+        Add an OpenCL event to the events lists, if profiling is enabled.
+
+        :param event: silx.opencl.processing.EventDescription.
+        :param desc: event description
+        """
+        if self.profile:
+            self.events.append(EventDescription(desc, event))
+
+    def allocate_texture(self, shape, hostbuf=None, support_1D=False):
+        """
+        Allocate an OpenCL image ("texture").
+
+        :param shape: Shape of the image. Note that pyopencl and OpenCL < 1.2
+            do not support 1D images, so 1D images are handled as 2D with one row
+        :param support_1D: force the image to be 1D if the shape has only one dim
+        """
+        if len(shape) == 1 and not(support_1D):
+            shape = (1,) + shape
+        return pyopencl.Image(
+            self.ctx,
+            pyopencl.mem_flags.READ_ONLY | pyopencl.mem_flags.USE_HOST_PTR,
+            pyopencl.ImageFormat(
+                pyopencl.channel_order.INTENSITY,
+                pyopencl.channel_type.FLOAT
+            ),
+            hostbuf=numpy.zeros(shape[::-1], dtype=numpy.float32)
+        )
+
+    def transfer_to_texture(self, arr, tex_ref):
+        """
+        Transfer an array to a texture.
+
+        :param arr: Input array. Can be a numpy array or a pyopencl array.
+        :param tex_ref: texture reference (pyopencl._cl.Image).
+        """
+        copy_args = [self.queue, tex_ref, arr]
+        shp = arr.shape
+        ndim = arr.ndim
+        if ndim == 1:
+            # pyopencl and OpenCL < 1.2 do not support image1d_t
+            # force 2D with one row in this case
+            #~ ndim = 2
+            shp = (1,) + shp
+        copy_kwargs = {"origin":(0,) * ndim, "region": shp[::-1]}
+        if not(isinstance(arr, numpy.ndarray)): # assuming pyopencl.array.Array
+            # D->D copy
+            copy_args[2] = arr.data
+            copy_kwargs["offset"] = 0
+        ev = pyopencl.enqueue_copy(*copy_args, **copy_kwargs)
+        self.profile_add(ev, "Transfer to texture")
+
     def log_profile(self):
         """If we are in profiling mode, prints out all timing for every single OpenCL call
         """
@@ -328,7 +381,7 @@ class OpenclProcessing(object):
 
     def get_compiler_options(self, x87_volatile=False):
         """Provide the default OpenCL compiler options
-        
+
         :param x87_volatile: needed for Kahan summation
         :return: string with compiler option
         """
diff --git a/silx/opencl/sinofilter.py b/silx/opencl/sinofilter.py
index 3e4d69c..ad17acb 100644
--- a/silx/opencl/sinofilter.py
+++ b/silx/opencl/sinofilter.py
@@ -29,135 +29,21 @@ from __future__ import absolute_import, print_function, with_statement, division
 
 __authors__ = ["P. Paleo"]
 __license__ = "MIT"
-__date__ = "01/02/2019"
+__date__ = "07/06/2019"
 
 import numpy as np
 from math import pi
-from itertools import product
-from bisect import bisect
 
-from .common import pyopencl as cl
+
 import pyopencl.array as parray
+from .common import pyopencl as cl
 from .processing import OpenclProcessing
 from ..math.fft import FFT
 from ..math.fft.clfft import __have_clfft__
+from ..image.tomography import generate_powers, get_next_power, compute_fourier_filter
 from ..utils.deprecation import deprecated
 
 
-def nextpow2(N):
-    p = 1
-    while p < N:
-        p *= 2
-    return p
-
-def compute_ramlak_filter(dwidth_padded, dtype=np.float32):
-    """
-    Compute the Ramachandran-Lakshminarayanan (Ram-Lak) filter, used in
-    filtered backprojection.
-
-    :param dwidth_padded: width of the 2D sinogram after padding
-    :param dtype: data type
-    """
-    #~ dwidth_padded = dwidth * 2
-    L = dwidth_padded
-    h = np.zeros(L, dtype=dtype)
-    L2 = L//2+1
-    h[0] = 1/4.
-    j = np.linspace(1, L2, L2//2, False).astype(dtype) # np < 1.9.0
-    # h[2::2] = 0
-    h[1:L2:2] = -1./(pi**2 * j**2)
-    # h[-1:L2-1:-2] = -1./(pi**2 * j**2)
-    h[L2:] = np.copy(h[1:L2-1][::-1])
-    return h
-
-
-def tukey(N, alpha=0.5):
-    """
-    Compute the Tukey apodization window.
-
-    :param int N: Number of points.
-    :param float alpha:
-    """
-    apod = np.zeros(N)
-    x = np.arange(N)/(N-1)
-    r = alpha
-    M1 = (0 <= x) * (x < r/2)
-    M2 = (r/2 <= x) * (x <= 1 - r/2)
-    M3 = (1 - r/2 < x) * (x <= 1)
-    apod[M1] = (1 + np.cos(2*pi/r * (x[M1] - r/2)))/2.
-    apod[M2] = 1.
-    apod[M3] = (1 + np.cos(2*pi/r * (x[M3] - 1 + r/2)))/2.
-    return apod
-
-
-def lanczos(N):
-    """
-    Compute the Lanczos window (truncated sinc) of width N.
-
-    :param int N: window width
-    """
-    x = np.arange(N)/(N-1)
-    return np.sin(pi*(2*x-1))/(pi*(2*x-1))
-
-
-def compute_fourier_filter(dwidth_padded, filter_name, cutoff=1.):
-    """
-    Compute the filter used for FBP.
-
-    :param dwidth_padded: padded detector width. As the filtering is done by the
-        Fourier convolution theorem, dwidth_padded should be at least 2*dwidth.
-    :param filter_name: Name of the filter. Available filters are:
-        Ram-Lak, Shepp-Logan, Cosine, Hamming, Hann, Tukey, Lanczos.
-    :param cutoff: Cut-off frequency, if relevant.
-    """
-    Nf = dwidth_padded
-    #~ filt_f = np.abs(np.fft.fftfreq(Nf))
-    rl = compute_ramlak_filter(Nf, dtype=np.float64)
-    filt_f = np.fft.fft(rl)
-
-    filter_name = filter_name.lower()
-    if filter_name in ["ram-lak", "ramlak"]:
-        return filt_f
-
-    w = 2 * pi * np.fft.fftfreq(dwidth_padded)
-    d = cutoff
-    apodization = {
-        # ~OK
-        "shepp-logan": np.sin(w[1:Nf]/(2*d))/(w[1:Nf]/(2*d)),
-        # ~OK
-        "cosine": np.cos(w[1:Nf]/(2*d)),
-        # OK
-        "hamming": 0.54*np.ones_like(filt_f)[1:Nf] + .46 * np.cos(w[1:Nf]/d),
-        # OK
-        "hann": (np.ones_like(filt_f)[1:Nf] + np.cos(w[1:Nf]/d))/2.,
-        # These one is not compatible with Astra - TODO investigate why
-        "tukey": np.fft.fftshift(tukey(dwidth_padded, alpha=d/2.))[1:Nf],
-        "lanczos": np.fft.fftshift(lanczos(dwidth_padded))[1:Nf],
-    }
-    if filter_name not in apodization:
-        raise ValueError("Unknown filter %s. Available filters are %s" %
-                         (filter_name, str(apodization.keys())))
-    filt_f[1:Nf] *= apodization[filter_name]
-    return filt_f
-
-
-def generate_powers():
-    """
-    Generate a list of powers of [2, 3, 5, 7],
-    up to (2**15)*(3**9)*(5**6)*(7**5).
-    """
-    primes = [2, 3, 5, 7]
-    maxpow = {2: 15, 3: 9, 5: 6, 7: 5}
-    valuations = []
-    for prime in primes:
-        valuations.append(range(maxpow[prime]+1))
-    powers = product(*valuations)
-    res = []
-    for pw in powers:
-        res.append(np.prod(list(map(lambda x : x[0]**x[1], zip(primes, pw)))))
-    return np.unique(res)
-
-
 
 class SinoFilter(OpenclProcessing):
     """
@@ -213,10 +99,10 @@ class SinoFilter(OpenclProcessing):
         self.sino_shape = sino_shape
         self.n_angles = n_angles
         self.dwidth = dwidth
-        self.dwidth_padded = self.get_next_power(2*self.dwidth)
+        self.dwidth_padded = get_next_power(2 * self.dwidth, powers=self.powers)
         self.sino_padded_shape = (n_angles, self.dwidth_padded)
         sino_f_shape = list(self.sino_padded_shape)
-        sino_f_shape[-1] = sino_f_shape[-1]//2+1
+        sino_f_shape[-1] = sino_f_shape[-1] // 2 + 1
         self.sino_f_shape = tuple(sino_f_shape)
 
     def _init_extra_options(self, extra_options):
@@ -268,16 +154,6 @@ class SinoFilter(OpenclProcessing):
         self.tmp_sino_device = parray.zeros(self.queue, self.sino_shape, "f")
         self.tmp_sino_host = np.zeros(self.sino_shape, "f")
 
-    def get_next_power(self, n):
-        """
-        Given a number, get the closest (upper) number p such that
-        p is a power of 2, 3, 5 and 7.
-        """
-        idx = bisect(self.powers, n)
-        if self.powers[idx-1] == n:
-            return n
-        return self.powers[idx]
-
     def _compute_filter(self, filter_name):
         """
 
@@ -286,9 +162,9 @@ class SinoFilter(OpenclProcessing):
         self.filter_name = filter_name or "ram-lak"
         filter_f = compute_fourier_filter(
             self.dwidth_padded,
-            filter_name,
+            self.filter_name,
             cutoff=self.extra_options["cutoff"],
-        )[:self.dwidth_padded//2+1]  # R2C
+        )[:self.dwidth_padded // 2 + 1]  # R2C
         self.set_filter(filter_f, normalize=True)
 
     def set_filter(self, h_filt, normalize=True):
@@ -313,22 +189,18 @@ class SinoFilter(OpenclProcessing):
             print("Warning: expected a complex Fourier filter")
         self.filter_f = h_filt
         if normalize:
-            self.filter_f *= pi/self.n_angles
+            self.filter_f *= pi / self.n_angles
         self.filter_f = self.filter_f.astype(np.complex64)
         self.d_filter_f[:] = self.filter_f[:]
 
     def _init_kernels(self):
         OpenclProcessing.compile_kernels(self, self.kernel_files)
         h, w = self.d_sino_f.shape
-        self.mult_kern_args = (
-            self.queue,
-            np.int32(self.d_sino_f.shape[::-1]),
-            None,
-            self.d_sino_f.data,
-            self.d_filter_f.data,
-            np.int32(w),
-            np.int32(h)
-        )
+        self.mult_kern_args = (self.queue, (int(w), (int(h))), None,
+                               self.d_sino_f.data,
+                               self.d_filter_f.data,
+                               np.int32(w),
+                               np.int32(h))
 
     def check_array(self, arr):
         if arr.dtype != np.float32:
@@ -350,18 +222,16 @@ class SinoFilter(OpenclProcessing):
         :param dst_offset:
         :param src_offset:
         """
-        self.kernels.cpy2d(
-            self.queue,
-            np.int32(transfer_shape[::-1]),
-            None,
-            dst.data,
-            src.data,
-            np.int32(dst.shape[1]),
-            np.int32(src.shape[1]),
-            np.int32(dst_offset),
-            np.int32(src_offset),
-            np.int32(transfer_shape[::-1])
-        )
+        shape = tuple(int(i) for i in transfer_shape[::-1])
+        ev = self.kernels.cpy2d(self.queue, shape, None,
+                                dst.data,
+                                src.data,
+                                np.int32(dst.shape[1]),
+                                np.int32(src.shape[1]),
+                                np.int32(dst_offset),
+                                np.int32(src_offset),
+                                np.int32(transfer_shape[::-1]))
+        ev.wait()
 
     def copy2d_host(self, dst, src, transfer_shape, dst_offset=(0, 0),
                     src_offset=(0, 0)):
@@ -376,11 +246,10 @@ class SinoFilter(OpenclProcessing):
         s = transfer_shape
         do = dst_offset
         so = src_offset
-        dst[do[0]:do[0]+s[0], do[1]:do[1]+s[1]] = src[so[0]:so[0]+s[0], so[1]:so[1]+s[1]]
+        dst[do[0]:do[0] + s[0], do[1]:do[1] + s[1]] = src[so[0]:so[0] + s[0], so[1]:so[1] + s[1]]
 
     def _prepare_input_sino(self, sino):
         """
-
         :param sino: sinogram
         """
         self.check_array(sino)
@@ -401,7 +270,7 @@ class SinoFilter(OpenclProcessing):
             # Rectangular copy D->D
             self.copy2d(self.d_sino_padded, d_sino_ref, self.sino_shape)
             if self.is_cpu:
-                self.d_sino_padded.finish()
+                self.d_sino_padded.finish()  # should not be required here
         else:
             # Numpy backend: FFT/mult/IFFT are done on host.
             if not(isinstance(sino, np.ndarray)):
@@ -415,7 +284,6 @@ class SinoFilter(OpenclProcessing):
 
     def _get_output_sino(self, output):
         """
-
         :param Union[numpy.dtype,None] output: sinogram output.
         :return: sinogram
         """
@@ -433,14 +301,14 @@ class SinoFilter(OpenclProcessing):
                             src=self.d_sino_padded,
                             transfer_shape=self.sino_shape)
                 if self.is_cpu:
-                    self.tmp_sino_device.finish()
-                res[:] = self.tmp_sino_device[:]
+                    self.tmp_sino_device.finish()  # should not be required here
+                res[:] = self.tmp_sino_device.get()[:]
             else:
                 if self.is_cpu:
                     self.d_sino_padded.finish()
                 self.copy2d(res, self.d_sino_padded, self.sino_shape)
                 if self.is_cpu:
-                    res.finish()
+                    res.finish()  # should not be required here
         else:
             if not(isinstance(res, np.ndarray)):
                 # Numpy backend + pyopencl output: rect copy H->H + copy H->D
@@ -467,11 +335,12 @@ class SinoFilter(OpenclProcessing):
     def _multiply_fourier(self):
         if self.fft_backend == "opencl":
             # Everything is on device. Call the multiplication kernel.
-            self.kernels.inplace_complex_mul_2Dby1D(
+            ev = self.kernels.inplace_complex_mul_2Dby1D(
                 *self.mult_kern_args
             )
+            ev.wait()
             if self.is_cpu:
-                self.d_sino_f.finish()
+                self.d_sino_f.finish()  # should not be required here
         else:
             # Everything is on host.
             self.d_sino_f *= self.filter_f
@@ -508,7 +377,7 @@ class SinoFilter(OpenclProcessing):
         # return
         res = self._get_output_sino(output)
         return res
-        #~ return output
+        # ~ return output
 
     __call__ = filter_sino
 
@@ -519,6 +388,14 @@ class SinoFilter(OpenclProcessing):
 # - Compatibility  -
 # -------------------
 
+
+def nextpow2(N):
+    p = 1
+    while p < N:
+        p *= 2
+    return p
+
+
 @deprecated(replacement="Backprojection.sino_filter", since_version="0.10")
 def fourier_filter(sino, filter_=None, fft_size=None):
     """Simple np based implementation of fourier space filter.
diff --git a/silx/opencl/sparse.py b/silx/opencl/sparse.py
new file mode 100644
index 0000000..8bfaea8
--- /dev/null
+++ b/silx/opencl/sparse.py
@@ -0,0 +1,331 @@
+#!/usr/bin/env python
+# coding: utf-8
+# /*##########################################################################
+#
+# Copyright (c) 2019 European Synchrotron Radiation Facility
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+# ###########################################################################*/
+"""Module for data sparsification on CPU/GPU."""
+
+from __future__ import absolute_import, print_function, with_statement, division
+
+__authors__ = ["P. Paleo"]
+__license__ = "MIT"
+__date__ = "07/06/2019"
+
+import numpy
+import pyopencl.array as parray
+from collections import namedtuple
+from pyopencl.scan import GenericScanKernel
+from .common import pyopencl as cl
+from .processing import OpenclProcessing, EventDescription, BufferDescription
+mf = cl.mem_flags
+
+
+CSRData = namedtuple("CSRData", ["data", "indices", "indptr"])
+
+def tuple_to_csrdata(arrs):
+    """
+    Converts a 3-tuple to a CSRData namedtuple.
+    """
+    if arrs is None:
+        return None
+    return CSRData(data=arrs[0], indices=arrs[1], indptr=arrs[2])
+
+
+
+# only float32 arrays are supported for now
+class CSR(OpenclProcessing):
+    kernel_files = ["sparse.cl"]
+
+    def __init__(self, shape, max_nnz=None, ctx=None, devicetype="all",
+                 platformid=None, deviceid=None, block_size=None, memory=None,
+                 profile=False):
+        """
+        Compute Compressed Sparse Row format of an image (2D matrix).
+        It is designed to be compatible with scipy.sparse.csr_matrix.
+
+        :param shape: tuple
+            Matrix shape.
+        :param max_nnz: int, optional
+            Maximum number of non-zero elements. By default, the arrays "data"
+            and "indices" are allocated with prod(shape) elements, but
+            in practice a much lesser space is needed.
+            The number of non-zero items cannot be known in advance, but one can
+            estimate an upper-bound with this parameter to save memory.
+
+        Opencl processing parameters
+        -----------------------------
+        Please refer to the documentation of silx.opencl.processing.OpenclProcessing
+        for information on the other parameters.
+        """
+
+        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
+                                  platformid=platformid, deviceid=deviceid,
+                                  profile=profile)
+        self._set_parameters(shape, max_nnz)
+        self._allocate_memory()
+        self._setup_kernels()
+
+    # --------------------------------------------------------------------------
+    # -------------------------- Initialization --------------------------------
+    # --------------------------------------------------------------------------
+
+    def _set_parameters(self, shape, max_nnz):
+        self.shape = shape
+        self.size = numpy.prod(shape)
+        self.indice_dtype = numpy.int32 #
+        assert len(shape) == 2 #
+        if max_nnz is None:
+            self.max_nnz = numpy.prod(shape) # worst case
+        else:
+            self.max_nnz = int(max_nnz)
+
+
+    def _allocate_memory(self):
+        self.is_cpu = (self.device.type == "CPU") # move to OpenclProcessing ?
+        self.buffers = [
+            BufferDescription("array", (self.size,), numpy.float32, mf.READ_ONLY),
+            BufferDescription("data", (self.max_nnz,), numpy.float32, mf.READ_WRITE),
+            BufferDescription("indices", (self.max_nnz,), self.indice_dtype, mf.READ_WRITE),
+            BufferDescription("indptr", (self.shape[0]+1,), self.indice_dtype, mf.READ_WRITE),
+        ]
+        self.allocate_buffers(use_array=True)
+        for arr_name in ["array", "data", "indices", "indptr"]:
+            setattr(self, arr_name, self.cl_mem[arr_name])
+            self.cl_mem[arr_name].fill(0) # allocate_buffers() uses empty()
+        self._old_array = self.array
+        self._old_data = self.data
+        self._old_indices = self.indices
+        self._old_indptr = self.indptr
+
+
+    def _setup_kernels(self):
+        self._setup_compaction_kernel()
+        self._setup_decompaction_kernel()
+
+
+    def _setup_compaction_kernel(self):
+        self.scan_kernel = GenericScanKernel(
+            self.ctx, self.indice_dtype,
+            arguments="__global float* data, __global float *data_compacted, __global int *indices, __global int* indptr",
+            input_expr="(fabs(data[i]) > 0.0f) ? 1 : 0",
+            scan_expr="a+b", neutral="0",
+            output_statement="""
+                // item is the running sum of input_expr(i), i.e the cumsum of "nonzero"
+                if (prev_item != item) {
+                    data_compacted[item-1] = data[i];
+                    indices[item-1] = GET_INDEX(i);
+                }
+                // The last cumsum element of each line of "nonzero" goes to inptr[i]
+                if ((i+1) % IMAGE_WIDTH == 0) {
+                    indptr[(i/IMAGE_WIDTH)+1] = item;
+                }
+                """,
+            options="-DIMAGE_WIDTH=%d" % self.shape[1],
+            preamble="#define GET_INDEX(i) (i % IMAGE_WIDTH)",
+        )
+
+
+    def _setup_decompaction_kernel(self):
+        OpenclProcessing.compile_kernels(
+            self,
+            self.kernel_files,
+            compile_options=["-DIMAGE_WIDTH=%d" % self.shape[1]]
+        )
+        device = self.ctx.devices[0]
+        wg_x = min(
+            device.max_work_group_size,
+            32,
+            self.kernels.max_workgroup_size("densify_csr")
+        )
+        self._decomp_wg = (wg_x, 1)
+        self._decomp_grid = (self._decomp_wg[0], self.shape[0])
+
+
+    # --------------------------------------------------------------------------
+    # -------------------------- Array utils -----------------------------------
+    # --------------------------------------------------------------------------
+
+    # TODO handle pyopencl Buffer
+    def check_array(self, arr):
+        """
+        Check that provided array is compatible with current context.
+
+        :param arr: numpy.ndarray or pyopencl.array.Array
+            2D array in dense format.
+        """
+        assert arr.size == self.size
+        assert arr.dtype == numpy.float32
+
+
+    # TODO handle pyopencl Buffer
+    def check_sparse_arrays(self, csr_data):
+        """
+        Check that the provided sparse arrays are compatible with the current
+        context.
+
+        :param arrays: namedtuple CSRData.
+            It contains the arrays "data", "indices", "indptr"
+        """
+        assert isinstance(csr_data, CSRData)
+        for arr in [csr_data.data, csr_data.indices, csr_data.indptr]:
+            assert arr.ndim == 1
+        assert csr_data.data.size == self.max_nnz
+        assert csr_data.indices.size == self.max_nnz
+        assert csr_data.indptr.size == self.shape[0]+1
+        assert csr_data.data.dtype == numpy.float32
+        assert csr_data.indices.dtype == self.indice_dtype
+        assert csr_data.indptr.dtype == self.indice_dtype
+
+
+    def set_array(self, arr):
+        """
+        Set the provided array as the current context 2D matrix.
+
+        :param arr: numpy.ndarray or pyopencl.array.Array
+            2D array in dense format.
+        """
+        if arr is None:
+            return
+        self.check_array(arr)
+        # GenericScanKernel only supports 1D data
+        if isinstance(arr, parray.Array):
+            self._old_array = self.array
+            self.array = arr
+        elif isinstance(arr, numpy.ndarray):
+            self.array[:] = arr.ravel()[:]
+        else:
+            raise ValueError("Expected pyopencl array or numpy array")
+
+
+    def set_sparse_arrays(self, csr_data):
+        if csr_data is None:
+            return
+        self.check_sparse_arrays(csr_data)
+        for name, arr in {"data": csr_data.data, "indices": csr_data.indices, "indptr": csr_data.indptr}.items():
+            # The current array is a device array. Don't copy, use it directly
+            if isinstance(arr, parray.Array):
+                setattr(self, "_old_" + name, getattr(self, name))
+                setattr(self, name, arr)
+            # The current array is a numpy.ndarray: copy H2D
+            elif isinstance(arr, numpy.ndarray):
+                getattr(self, name)[:] = arr[:]
+            else:
+                raise ValueError("Unsupported array type: %s" % type(arr))
+
+
+    def _recover_arrays_references(self):
+        """
+        Recover the previous arrays references, and return the references of the
+        "current" arrays.
+        """
+        array = self.array
+        data = self.data
+        indices = self.indices
+        indptr = self.indptr
+        for name in ["array", "data", "indices", "indptr"]:
+            # self.X = self._old_X
+            setattr(self, name, getattr(self, "_old_" + name))
+        return array, (data, indices, indptr)
+
+
+    def get_sparse_arrays(self, output):
+        """
+        Get the 2D dense array of the current context.
+
+        :param output: tuple or None
+            tuple in the form (data, indices, indptr). These arrays have to be
+            compatible with the current context (size and data type).
+            The content of these arrays will be overwritten with the result of
+            the previous computation.
+        """
+        numels = self.max_nnz
+        if output is None:
+            data = self.data.get()[:numels]
+            ind = self.indices.get()[:numels]
+            indptr = self.indptr.get()
+            res = (data, ind, indptr)
+        else:
+            res = output
+        return res
+
+
+    def get_array(self, output):
+        if output is None:
+            res = self.array.get().reshape(self.shape)
+        else:
+            res = output
+        return res
+
+    # --------------------------------------------------------------------------
+    # -------------------------- Compaction ------------------------------------
+    # --------------------------------------------------------------------------
+
+    def sparsify(self, arr, output=None):
+        """
+        Convert an image (2D matrix) into a CSR representation.
+
+        :param arr: numpy.ndarray or pyopencl.array.Array
+            Input array.
+        :param output: tuple of pyopencl.array.Array, optional
+            If provided, this must be a tuple of 3 arrays (data, indices, indptr).
+            The content of each array is overwritten by the computation result.
+        """
+        self.set_array(arr)
+        self.set_sparse_arrays(tuple_to_csrdata(output))
+        evt = self.scan_kernel(
+            self.array,
+            self.data,
+            self.indices,
+            self.indptr,
+        )
+        #~ evt.wait()
+        self.profile_add(evt, "sparsification kernel")
+        res = self.get_sparse_arrays(output)
+        self._recover_arrays_references()
+        return res
+
+    # --------------------------------------------------------------------------
+    # -------------------------- Decompaction ----------------------------------
+    # --------------------------------------------------------------------------
+
+    def densify(self, data, indices, indptr, output=None):
+        self.set_sparse_arrays(
+            CSRData(data=data, indices=indices, indptr=indptr)
+        )
+        self.set_array(output)
+        evt = self.kernels.densify_csr(
+            self.queue,
+            self._decomp_grid,
+            self._decomp_wg,
+            self.data.data,
+            self.indices.data,
+            self.indptr.data,
+            self.array.data,
+            numpy.int32(self.shape[0]),
+        )
+        #~ evt.wait()
+        self.profile_add(evt, "desparsification kernel")
+        res = self.get_array(output)
+        self._recover_arrays_references()
+        return res
+
diff --git a/silx/opencl/test/__init__.py b/silx/opencl/test/__init__.py
index f3121d5..2e90e66 100644
--- a/silx/opencl/test/__init__.py
+++ b/silx/opencl/test/__init__.py
@@ -38,6 +38,8 @@ from ..codec import test as test_codec
 from . import test_image
 from . import test_kahan
 from . import test_stats
+from . import test_convolution
+from . import test_sparse
 
 
 def suite():
@@ -52,6 +54,8 @@ def suite():
     test_suite.addTests(test_image.suite())
     test_suite.addTests(test_kahan.suite())
     test_suite.addTests(test_stats.suite())
+    test_suite.addTests(test_convolution.suite())
+    test_suite.addTests(test_sparse.suite())
     # Allow to remove sift from the project
     test_base_dir = os.path.dirname(__file__)
     sift_dir = os.path.join(test_base_dir, "..", "sift")
diff --git a/silx/opencl/test/test_backprojection.py b/silx/opencl/test/test_backprojection.py
index 7ab416d..b2f2070 100644
--- a/silx/opencl/test/test_backprojection.py
+++ b/silx/opencl/test/test_backprojection.py
@@ -45,7 +45,7 @@ except ImportError:
 from ..common import ocl
 if ocl:
     from .. import backprojection
-    from ..sinofilter import compute_fourier_filter
+    from ...image.tomography import compute_fourier_filter
 from silx.test.utils import utilstest
 
 logger = logging.getLogger(__name__)
@@ -199,11 +199,30 @@ class TestFBP(unittest.TestCase):
                 "Something wrong with FBP(filter=%s)" % filter_name
             )
 
+    @unittest.skipUnless(ocl and mako, "pyopencl is missing")
+    def test_fbp_oddsize(self):
+        # Generate a 513-sinogram.
+        # The padded width will be nextpow(513*2).
+        # silx [0.10, 0.10.1] will give 1029, which makes R2C transform fail.
+        sino = np.pad(self.sino, ((0, 0), (1, 0)), mode='edge')
+        B = backprojection.Backprojection(sino.shape, axis_position=self.fbp.axis_pos+1)
+        res = B(sino)
+        # Compare with self.reference_rec. Tolerance is high as backprojector
+        # is not fully shift-invariant.
+        errmax = np.max(np.abs(clip_circle(res[1:, 1:] - self.reference_rec)))
+        self.assertLess(
+            errmax, 1.e-1,
+            "Something wrong with FBP on odd-sized sinogram"
+        )
+
+
+
 
 def suite():
     testSuite = unittest.TestSuite()
     testSuite.addTest(TestFBP("test_fbp"))
     testSuite.addTest(TestFBP("test_fbp_filters"))
+    testSuite.addTest(TestFBP("test_fbp_oddsize"))
     return testSuite
 
 
diff --git a/silx/opencl/test/test_convolution.py b/silx/opencl/test/test_convolution.py
new file mode 100644
index 0000000..8acd385
--- /dev/null
+++ b/silx/opencl/test/test_convolution.py
@@ -0,0 +1,263 @@
+#!/usr/bin/env python
+# coding: utf-8
+# /*##########################################################################
+#
+# Copyright (c) 2019 European Synchrotron Radiation Facility
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+# ###########################################################################*/
+
+"""
+Test of the Convolution class.
+"""
+
+from __future__ import division, print_function
+
+__authors__ = ["Pierre Paleo"]
+__contact__ = "pierre.paleo@esrf.fr"
+__license__ = "MIT"
+__copyright__ = "2019 European Synchrotron Radiation Facility, Grenoble, France"
+__date__ = "15/02/2019"
+
+import logging
+from itertools import product
+import numpy as np
+from silx.utils.testutils import parameterize
+try:
+    from scipy.ndimage import convolve, convolve1d
+    from scipy.misc import ascent
+    scipy_convolve = convolve
+    scipy_convolve1d = convolve1d
+except ImportError:
+    scipy_convolve = None
+import unittest
+from ..common import ocl
+if ocl:
+    import pyopencl as cl
+    import pyopencl.array as parray
+    from ..convolution import Convolution, gaussian_kernel
+logger = logging.getLogger(__name__)
+
+
+@unittest.skipUnless(ocl and scipy_convolve, "PyOpenCl/scipy is missing")
+class TestConvolution(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        super(TestConvolution, cls).setUpClass()
+        cls.image = np.ascontiguousarray(ascent()[:, :511], dtype="f")
+        cls.data1d = cls.image[0]
+        cls.data2d = cls.image
+        cls.data3d = np.tile(cls.image[224:-224, 224:-224], (62, 1, 1))
+        cls.kernel1d = gaussian_kernel(1.)
+        cls.kernel2d = np.outer(cls.kernel1d, cls.kernel1d)
+        cls.kernel3d = np.multiply.outer(cls.kernel2d, cls.kernel1d)
+        cls.ctx = ocl.create_context()
+        cls.tol = {
+            "1D": 1e-4,
+            "2D": 1e-3,
+            "3D": 1e-3,
+        }
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.data1d = cls.data2d = cls.data3d = cls.image = None
+        cls.kernel1d = cls.kernel2d = cls.kernel3d = None
+
+    @staticmethod
+    def compare(arr1, arr2):
+        return np.max(np.abs(arr1 - arr2))
+
+    @staticmethod
+    def print_err(conv):
+        errmsg = str(
+            """
+            Something wrong with %s
+            mode=%s, texture=%s
+            """
+            % (conv.use_case_desc, conv.mode, conv.use_textures)
+        )
+        return errmsg
+
+    def __init__(self, methodName='runTest', param=None):
+        unittest.TestCase.__init__(self, methodName)
+        self.param = param
+        self.mode = param["boundary_handling"]
+        logger.debug(
+            """
+            Testing convolution with boundary_handling=%s,
+            use_textures=%s, input_device=%s, output_device=%s
+            """
+            % (
+                self.mode, param["use_textures"],
+                param["input_on_device"], param["output_on_device"]
+            )
+        )
+
+    def instantiate_convol(self, shape, kernel, axes=None):
+        if (self.mode == "constant") and (
+            not(self.param["use_textures"])
+                or (self.ctx.devices[0].type == cl._cl.device_type.CPU)
+            ):
+                self.skipTest("mode=constant not implemented without textures")
+        C = Convolution(
+            shape, kernel,
+            mode=self.mode,
+            ctx=self.ctx,
+            axes=axes,
+            extra_options={"dont_use_textures": not(self.param["use_textures"])}
+        )
+        return C
+
+    def get_data_and_kernel(self, test_name):
+        dims = {
+            "test_1D": (1, 1),
+            "test_separable_2D": (2, 1),
+            "test_separable_3D": (3, 1),
+            "test_nonseparable_2D": (2, 2),
+            "test_nonseparable_3D":  (3, 3),
+        }
+        dim_data = {
+            1: self.data1d,
+            2: self.data2d,
+            3: self.data3d
+        }
+        dim_kernel = {
+            1: self.kernel1d,
+            2: self.kernel2d,
+            3: self.kernel3d,
+        }
+        dd, kd = dims[test_name]
+        return dim_data[dd], dim_kernel[kd]
+
+    def get_reference_function(self, test_name):
+        ref_func = {
+            "test_1D":
+                lambda x, y : scipy_convolve1d(x, y, mode=self.mode),
+            "test_separable_2D":
+                lambda x, y : scipy_convolve1d(
+                    scipy_convolve1d(x, y, mode=self.mode, axis=1),
+                    y, mode=self.mode, axis=0
+                ),
+            "test_separable_3D":
+                lambda x, y: scipy_convolve1d(
+                    scipy_convolve1d(
+                        scipy_convolve1d(x, y, mode=self.mode, axis=2),
+                        y, mode=self.mode, axis=1),
+                    y, mode=self.mode, axis=0
+                ),
+            "test_nonseparable_2D":
+                lambda x, y: scipy_convolve(x, y, mode=self.mode),
+            "test_nonseparable_3D":
+                lambda x, y : scipy_convolve(x, y, mode=self.mode),
+        }
+        return ref_func[test_name]
+
+    def template_test(self, test_name):
+        data, kernel = self.get_data_and_kernel(test_name)
+        conv = self.instantiate_convol(data.shape, kernel)
+        if self.param["input_on_device"]:
+            data_ref = parray.to_device(conv.queue, data)
+        else:
+            data_ref = data
+        if self.param["output_on_device"]:
+            d_res = parray.zeros_like(conv.data_out)
+            res = d_res
+        else:
+            res = None
+        res = conv(data_ref, output=res)
+        if self.param["output_on_device"]:
+            res = res.get()
+        ref_func = self.get_reference_function(test_name)
+        ref = ref_func(data, kernel)
+        metric = self.compare(res, ref)
+        logger.info("%s: max error = %.2e" % (test_name, metric))
+        tol = self.tol[str("%dD" % kernel.ndim)]
+        self.assertLess(metric, tol, self.print_err(conv))
+
+    def test_1D(self):
+        self.template_test("test_1D")
+
+    def test_separable_2D(self):
+        self.template_test("test_separable_2D")
+
+    def test_separable_3D(self):
+        self.template_test("test_separable_3D")
+
+    def test_nonseparable_2D(self):
+        self.template_test("test_nonseparable_2D")
+
+    def test_nonseparable_3D(self):
+        self.template_test("test_nonseparable_3D")
+
+    def test_batched_2D(self):
+        """
+        Test batched (nonseparable) 2D convolution on 3D data.
+        In this test: batch along "z" (axis 0)
+        """
+        data = self.data3d
+        kernel = self.kernel2d
+        conv = self.instantiate_convol(data.shape, kernel, axes=(0,))
+        res = conv(data) # 3D
+        ref = scipy_convolve(data[0], kernel, mode=self.mode) # 2D
+
+        std = np.std(res, axis=0)
+        std_max = np.max(np.abs(std))
+        self.assertLess(std_max, self.tol["2D"], self.print_err(conv))
+        metric = self.compare(res[0], ref)
+        logger.info("test_nonseparable_3D: max error = %.2e" % metric)
+        self.assertLess(metric, self.tol["2D"], self.print_err(conv))
+
+
+def test_convolution():
+    boundary_handling_ = ["reflect", "nearest", "wrap", "constant"]
+    use_textures_ = [True, False]
+    input_on_device_ = [True, False]
+    output_on_device_ = [True, False]
+    testSuite = unittest.TestSuite()
+
+    param_vals = list(product(
+        boundary_handling_,
+        use_textures_,
+        input_on_device_,
+        output_on_device_
+    ))
+    for boundary_handling, use_textures, input_dev, output_dev in param_vals:
+        testcase = parameterize(
+            TestConvolution,
+            param={
+                "boundary_handling": boundary_handling,
+                "input_on_device": input_dev,
+                "output_on_device": output_dev,
+                "use_textures": use_textures,
+            }
+        )
+        testSuite.addTest(testcase)
+    return testSuite
+
+
+
+def suite():
+    testSuite = test_convolution()
+    return testSuite
+
+
+if __name__ == '__main__':
+    unittest.main(defaultTest="suite")
diff --git a/silx/opencl/test/test_sparse.py b/silx/opencl/test/test_sparse.py
new file mode 100644
index 0000000..56f1ba4
--- /dev/null
+++ b/silx/opencl/test/test_sparse.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python
+# coding: utf-8
+# /*##########################################################################
+#
+# Copyright (c) 2018-2019 European Synchrotron Radiation Facility
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+# ###########################################################################*/
+"""Test of the sparse module"""
+
+import numpy as np
+import unittest
+import logging
+from itertools import product
+from ..common import ocl
+if ocl:
+    import pyopencl.array as parray
+    from silx.opencl.sparse import CSR
+try:
+    import scipy.sparse as sp
+except ImportError:
+    sp = None
+logger = logging.getLogger(__name__)
+
+
+
+def generate_sparse_random_data(
+    shape=(1000,),
+    data_min=0, data_max=100,
+    density=0.1,
+    use_only_integers=True,
+    dtype="f"):
+    """
+    Generate random sparse data where.
+
+    Parameters
+    ------------
+    shape: tuple
+        Output data shape.
+    data_min: int or float
+        Minimum value of data
+    data_max: int or float
+        Maximum value of data
+    density: float
+        Density of non-zero elements in the output data.
+        Low value of density mean low number of non-zero elements.
+    use_only_integers: bool
+        If set to True, the output data items will be primarily integers,
+        possibly casted to float if dtype is a floating-point type.
+        This can be used for ease of debugging.
+    dtype: str or numpy.dtype
+        Output data type
+    """
+    mask = np.random.binomial(1, density, size=shape)
+    if use_only_integers:
+        d = np.random.randint(data_min, high=data_max, size=shape)
+    else:
+        d = data_min + (data_max - data_min) * np.random.rand(*shape)
+    return (d * mask).astype(dtype)
+
+
+
+@unittest.skipUnless(ocl and sp, "PyOpenCl/scipy is missing")
+class TestCSR(unittest.TestCase):
+    """Test CSR format"""
+
+    def setUp(self):
+        self.array = generate_sparse_random_data(shape=(512, 511))
+        # Compute reference sparsification
+        a_s = sp.csr_matrix(self.array)
+        self.ref_data = a_s.data
+        self.ref_indices = a_s.indices
+        self.ref_indptr = a_s.indptr
+        self.ref_nnz = a_s.nnz
+        # Test possible configurations
+        input_on_device = [False, True]
+        output_on_device = [False, True]
+        self._test_configs = list(product(input_on_device, output_on_device))
+
+
+    def test_sparsification(self):
+        for input_on_device, output_on_device in self._test_configs:
+            self._test_sparsification(input_on_device, output_on_device)
+
+
+    def _test_sparsification(self, input_on_device, output_on_device):
+        current_config = "input on device: %s, output on device: %s" % (
+            str(input_on_device), str(output_on_device)
+        )
+        # Sparsify on device
+        csr = CSR(self.array.shape)
+        if input_on_device:
+            # The array has to be flattened
+            arr = parray.to_device(csr.queue, self.array.ravel())
+        else:
+            arr = self.array
+        if output_on_device:
+            d_data = parray.zeros_like(csr.data)
+            d_indices = parray.zeros_like(csr.indices)
+            d_indptr = parray.zeros_like(csr.indptr)
+            output = (d_data, d_indices, d_indptr)
+        else:
+            output = None
+        data, indices, indptr = csr.sparsify(arr, output=output)
+        if output_on_device:
+            data = data.get()
+            indices = indices.get()
+            indptr = indptr.get()
+        # Compare
+        nnz = self.ref_nnz
+        self.assertTrue(
+            np.allclose(data[:nnz], self.ref_data),
+            "something wrong with sparsified data (%s)"
+            % current_config
+        )
+        self.assertTrue(
+            np.allclose(indices[:nnz], self.ref_indices),
+            "something wrong with sparsified indices (%s)"
+            % current_config
+        )
+        self.assertTrue(
+            np.allclose(indptr, self.ref_indptr),
+            "something wrong with sparsified indices pointers (indptr) (%s)"
+            % current_config
+        )
+
+
+    def test_desparsification(self):
+        for input_on_device, output_on_device in self._test_configs:
+            self._test_desparsification(input_on_device, output_on_device)
+
+
+    def _test_desparsification(self, input_on_device, output_on_device):
+        current_config = "input on device: %s, output on device: %s" % (
+            str(input_on_device), str(output_on_device)
+        )
+        # De-sparsify on device
+        csr = CSR(self.array.shape, max_nnz=self.ref_nnz)
+        if input_on_device:
+            data = parray.to_device(csr.queue, self.ref_data)
+            indices = parray.to_device(csr.queue, self.ref_indices)
+            indptr = parray.to_device(csr.queue, self.ref_indptr)
+        else:
+            data = self.ref_data
+            indices = self.ref_indices
+            indptr = self.ref_indptr
+        if output_on_device:
+            d_arr = parray.zeros_like(csr.array)
+            output = d_arr
+        else:
+            output = None
+        arr = csr.densify(data, indices, indptr, output=output)
+        if output_on_device:
+            arr = arr.get()
+        # Compare
+        self.assertTrue(
+            np.allclose(arr.reshape(self.array.shape), self.array),
+            "something wrong with densified data (%s)"
+            % current_config
+        )
+
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(
+        unittest.defaultTestLoader.loadTestsFromTestCase(TestCSR)
+    )
+    return suite
+
+
+if __name__ == '__main__':
+    unittest.main(defaultTest="suite")
+
+