diff options
Diffstat (limited to 'src/silx/opencl')
34 files changed, 8069 insertions, 0 deletions
diff --git a/src/silx/opencl/__init__.py b/src/silx/opencl/__init__.py new file mode 100644 index 0000000..fbd1f88 --- /dev/null +++ b/src/silx/opencl/__init__.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Project: S I L X project +# https://github.com/silx-kit/silx +# +# Copyright (C) 2012-2018 European Synchrotron Radiation Facility, Grenoble, France +# +# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu) +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +"""This package provides OpenCl-based optimized processing functions. + +For more processing functions, see the silx.math and silx.image packages. 
+ +See silx documentation: http://www.silx.org/doc/silx/latest/ +""" + +__author__ = "Jerome Kieffer" +__contact__ = "Jerome.Kieffer@ESRF.eu" +__license__ = "MIT" +__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "15/03/2017" +__status__ = "stable" + +import logging + + +logger = logging.getLogger(__name__) + + +from .common import * diff --git a/src/silx/opencl/backprojection.py b/src/silx/opencl/backprojection.py new file mode 100644 index 0000000..65a9836 --- /dev/null +++ b/src/silx/opencl/backprojection.py @@ -0,0 +1,397 @@ +#!/usr/bin/env python +# coding: utf-8 +# /*########################################################################## +# +# Copyright (c) 2016 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+# +# ###########################################################################*/ +"""Module for (filtered) backprojection on the GPU""" + +from __future__ import absolute_import, print_function, with_statement, division + +__authors__ = ["A. Mirone, P. Paleo"] +__license__ = "MIT" +__date__ = "25/01/2019" + +import logging +import numpy as np + +from .common import pyopencl +from .processing import EventDescription, OpenclProcessing, BufferDescription +from .sinofilter import SinoFilter +from .sinofilter import fourier_filter as fourier_filter_ +from ..utils.deprecation import deprecated + +if pyopencl: + mf = pyopencl.mem_flags + import pyopencl.array as parray +else: + raise ImportError("Please install pyopencl in order to use opencl backprojection") +logger = logging.getLogger(__name__) + + +def _sizeof(Type): + """ + return the size (in bytes) of a scalar type, like the C behavior + """ + return np.dtype(Type).itemsize + + +def _idivup(a, b): + """ + return the integer division, plus one if `a` is not a multiple of `b` + """ + return (a + (b - 1)) // b + + +class Backprojection(OpenclProcessing): + """A class for performing the backprojection using OpenCL""" + kernel_files = ["backproj.cl", "array_utils.cl"] + + def __init__(self, sino_shape, slice_shape=None, axis_position=None, + angles=None, filter_name=None, ctx=None, devicetype="all", + platformid=None, deviceid=None, profile=False, + extra_options=None): + """Constructor of the OpenCL (filtered) backprojection + + :param sino_shape: shape of the sinogram. The sinogram is in the format + (n_b, n_a) where n_b is the number of detector bins + and n_a is the number of angles. + :param slice_shape: Optional, shape of the reconstructed slice. By + default, it is a square slice where the dimension + is the "x dimension" of the sinogram (number of + bins). + :param axis_position: Optional, axis position. Default is + `(shape[1]-1)/2.0`. + :param angles: Optional, a list of custom angles in radian. 
+ :param filter_name: Optional, name of the filter for FBP. Default is + the Ram-Lak filter. + :param ctx: actual working context, left to None for automatic + initialization from device type or platformid/deviceid + :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL" + :param platformid: integer with the platform_identifier, as given by + clinfo + :param deviceid: Integer with the device identifier, as given by clinfo + :param profile: switch on profiling to be able to profile at the kernel + level, store profiling elements (makes code slightly + slower) + :param extra_options: Advanced extra options in the form of a dict. + Current options are: cutoff, use_numpy_fft + """ + # OS X enforces a workgroup size of 1 when the kernel has + # synchronization barriers if sys.platform.startswith('darwin'): + # assuming no discrete GPU + # raise NotImplementedError("Backprojection is not implemented on CPU for OS X yet") + + OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype, + platformid=platformid, deviceid=deviceid, + profile=profile) + + self._init_geometry(sino_shape, slice_shape, angles, axis_position, + extra_options) + self._allocate_memory() + self._compute_angles() + self._init_kernels() + self._init_filter(filter_name) + + def _init_geometry(self, sino_shape, slice_shape, angles, axis_position, + extra_options): + """Geometry Initialization + + :param sino_shape: shape of the sinogram. The sinogram is in the format + (n_b, n_a) where n_b is the number of detector bins + and n_a is the number of angles. + :param slice_shape: shape of the reconstructed slice. By + default, it is a square slice where the dimension + is the "x dimension" of the sinogram (number of + bins). + :param angles: list of projection angles in radian. 
+ :param axis_position: axis position + :param dict extra_options: Advanced extra options + """ + self.shape = sino_shape + self.num_bins = np.int32(sino_shape[1]) + self.num_projs = np.int32(sino_shape[0]) + self.angles = angles + if slice_shape is None: + self.slice_shape = (self.num_bins, self.num_bins) + else: + self.slice_shape = slice_shape + self.dimrec_shape = ( + _idivup(self.slice_shape[0], 32) * 32, + _idivup(self.slice_shape[1], 32) * 32 + ) + if axis_position: + self.axis_pos = np.float32(axis_position) + else: + self.axis_pos = np.float32((sino_shape[1] - 1.) / 2) + self.axis_array = None # TODO: add axis correction front-end + self._init_extra_options(extra_options) + + def _init_extra_options(self, extra_options): + """Backprojection extra option initialization + + :param dict extra_options: Advanced extra options + """ + self.extra_options = { + "cutoff": 1., + "use_numpy_fft": False, + # It is axis_pos - (num_bins-1)/2 in PyHST + "gpu_offset_x": 0., #self.axis_pos - (self.num_bins - 1) / 2., + "gpu_offset_y": 0., #self.axis_pos - (self.num_bins - 1) / 2. 
+ } + if extra_options is not None: + self.extra_options.update(extra_options) + + def _allocate_memory(self): + # Host memory + self.slice = np.zeros(self.dimrec_shape, dtype=np.float32) + self._use_textures = self.check_textures_availability() + + # Device memory + self.buffers = [ + BufferDescription("_d_slice", self.dimrec_shape, np.float32, mf.READ_WRITE), + BufferDescription("d_sino", self.shape, np.float32, mf.READ_WRITE), # before transferring to texture (if available) + BufferDescription("d_cos", (self.num_projs,), np.float32, mf.READ_ONLY), + BufferDescription("d_sin", (self.num_projs,), np.float32, mf.READ_ONLY), + BufferDescription("d_axes", (self.num_projs,), np.float32, mf.READ_ONLY), + ] + self.allocate_buffers(use_array=True) + self.d_sino = self.cl_mem["d_sino"] # shorthand + + # Texture memory (if relevant) + if self._use_textures: + self._allocate_textures() + + # Local memory + self.local_mem = 256 * 3 * _sizeof(np.float32) # constant for all image sizes + + def _compute_angles(self): + if self.angles is None: + self.angles = np.linspace(0, np.pi, self.num_projs, False) + h_cos = np.cos(self.angles).astype(np.float32) + h_sin = np.sin(self.angles).astype(np.float32) + self.cl_mem["d_cos"][:] = h_cos[:] + self.cl_mem["d_sin"][:] = h_sin[:] + if self.axis_array: + self.cl_mem["d_axes"][:] = self.axis_array.astype(np.float32)[:] + else: + self.cl_mem["d_axes"][:] = np.ones(self.num_projs, dtype="f") * self.axis_pos + + def _init_kernels(self): + compile_options = None + if not(self._use_textures): + compile_options = "-DDONT_USE_TEXTURES" + OpenclProcessing.compile_kernels( + self, + self.kernel_files, + compile_options=compile_options + ) + # check that workgroup can actually be (16, 16) + self.compiletime_workgroup_size = self.kernels.max_workgroup_size("backproj_cpu_kernel") + # Workgroup and ndrange sizes are always the same + self.wg = (16, 16) + self.ndrange = ( + _idivup(int(self.dimrec_shape[1]), 32) * self.wg[0], + 
_idivup(int(self.dimrec_shape[0]), 32) * self.wg[1] + ) + # Prepare arguments for the kernel call + if not(self._use_textures): + d_sino_ref = self.d_sino.data + else: + d_sino_ref = self.d_sino_tex + self._backproj_kernel_args = ( + # num of projections (int32) + self.num_projs, + # num of bins (int32) + self.num_bins, + # axis position (float32) + self.axis_pos, + # d_slice (__global float32*) + self.cl_mem["_d_slice"].data, + # d_sino (__read_only image2d_t or float*) + d_sino_ref, + # gpu_offset_x (float32) + np.float32(self.extra_options["gpu_offset_x"]), + # gpu_offset_y (float32) + np.float32(self.extra_options["gpu_offset_y"]), + # d_cos (__global float32*) + self.cl_mem["d_cos"].data, + # d_sin (__global float32*) + self.cl_mem["d_sin"].data, + # d_axis (__global float32*) + self.cl_mem["d_axes"].data, + # shared mem (__local float32*) + self._get_local_mem() + ) + + def _allocate_textures(self): + """ + Allocate the texture for the sinogram. + """ + self.d_sino_tex = self.allocate_texture(self.shape) + + def _init_filter(self, filter_name): + """Filter initialization + + :param str filter_name: filter name + """ + self.filter_name = filter_name or "ram-lak" + self.sino_filter = SinoFilter( + self.shape, + ctx=self.ctx, + filter_name=self.filter_name, + extra_options=self.extra_options, + ) + + def _get_local_mem(self): + return pyopencl.LocalMemory(self.local_mem) # constant for all image sizes + + def _cpy2d_to_slice(self, dst): + ndrange = (int(self.slice_shape[1]), int(self.slice_shape[0])) + slice_shape_ocl = np.int32(ndrange) + wg = None + kernel_args = ( + dst.data, + self.cl_mem["_d_slice"].data, + np.int32(self.slice_shape[1]), + np.int32(self.dimrec_shape[1]), + np.int32((0, 0)), + np.int32((0, 0)), + slice_shape_ocl + ) + return self.kernels.cpy2d(self.queue, ndrange, wg, *kernel_args) + + def _transfer_to_texture(self, sino): + if isinstance(sino, parray.Array): + return self._transfer_device_to_texture(sino) + sino2 = sino + if 
not(sino.flags["C_CONTIGUOUS"] and sino.dtype == np.float32): + sino2 = np.ascontiguousarray(sino, dtype=np.float32) + if not(self._use_textures): + ev = pyopencl.enqueue_copy( + self.queue, + self.d_sino.data, + sino2 + ) + what = "transfer filtered sino H->D buffer" + ev.wait() + else: + ev = pyopencl.enqueue_copy( + self.queue, + self.d_sino_tex, + sino2, + origin=(0, 0), + region=self.shape[::-1] + ) + what = "transfer filtered sino H->D texture" + return EventDescription(what, ev) + + def _transfer_device_to_texture(self, d_sino): + if not(self._use_textures): + if id(self.d_sino) == id(d_sino): + return + ev = pyopencl.enqueue_copy( + self.queue, + self.d_sino.data, + d_sino + ) + what = "transfer filtered sino D->D buffer" + ev.wait() + else: + ev = pyopencl.enqueue_copy( + self.queue, + self.d_sino_tex, + d_sino.data, + offset=0, + origin=(0, 0), + region=self.shape[::-1] + ) + what = "transfer filtered sino D->D texture" + return EventDescription(what, ev) + + def backprojection(self, sino, output=None): + """Perform the backprojection on an input sinogram + + :param sino: sinogram. + :param output: optional, output slice. + If provided, the result will be written in this array. + :return: backprojection of sinogram + """ + events = [] + with self.sem: + events.append(self._transfer_to_texture(sino)) + # Call the backprojection kernel + if not(self._use_textures): + kernel_to_call = self.kernels.backproj_cpu_kernel + else: + kernel_to_call = self.kernels.backproj_kernel + kernel_to_call( + self.queue, + self.ndrange, + self.wg, + *self._backproj_kernel_args + ) + # Return + if output is None: + res = self.cl_mem["_d_slice"].get() + res = res[:self.slice_shape[0], :self.slice_shape[1]] + else: + res = output + self._cpy2d_to_slice(output) + + # /with self.sem + if self.profile: + self.events += events + + return res + + def filtered_backprojection(self, sino, output=None): + """ + Compute the filtered backprojection (FBP) on a sinogram. 
+ + :param sino: sinogram (`np.ndarray` or `pyopencl.array.Array`) + with the shape (n_projections, n_bins) + :param output: output (`np.ndarray` or `pyopencl.array.Array`). + If nothing is provided, a new numpy array is returned. + """ + # Filter + self.sino_filter(sino, output=self.d_sino) + # Backproject + res = self.backprojection(self.d_sino, output=output) + return res + + __call__ = filtered_backprojection + + + # ------------------- + # - Compatibility - + # ------------------- + + @deprecated(replacement="Backprojection.sino_filter", since_version="0.10") + def filter_projections(self, sino, rescale=True): + self.sino_filter(sino, output=self.d_sino) + + + +def fourier_filter(sino, filter_=None, fft_size=None): + return fourier_filter_(sino, filter_=filter_, fft_size=fft_size) + diff --git a/src/silx/opencl/codec/__init__.py b/src/silx/opencl/codec/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/src/silx/opencl/codec/__init__.py diff --git a/src/silx/opencl/codec/byte_offset.py b/src/silx/opencl/codec/byte_offset.py new file mode 100644 index 0000000..9a52427 --- /dev/null +++ b/src/silx/opencl/codec/byte_offset.py @@ -0,0 +1,439 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Project: Sift implementation in Python + OpenCL +# https://github.com/silx-kit/silx +# +# Copyright (C) 2013-2020 European Synchrotron Radiation Facility, Grenoble, France +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the 
Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +""" +This module provides a class for CBF byte offset compression/decompression. +""" + +from __future__ import division, print_function, with_statement + +__authors__ = ["Jérôme Kieffer"] +__contact__ = "jerome.kieffer@esrf.eu" +__license__ = "MIT" +__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "11/10/2018" +__status__ = "production" + + +import functools +import os +import numpy +from ..common import ocl, pyopencl +from ..processing import BufferDescription, EventDescription, OpenclProcessing + +import logging +logger = logging.getLogger(__name__) + +if pyopencl: + import pyopencl.version + if pyopencl.version.VERSION < (2016, 0): + from pyopencl.scan import GenericScanKernel, GenericDebugScanKernel + else: + from pyopencl.algorithm import GenericScanKernel + from pyopencl.scan import GenericDebugScanKernel +else: + logger.warning("No PyOpenCL, no byte-offset, please see fabio") + + +class ByteOffset(OpenclProcessing): + """Perform the byte offset compression/decompression on the GPU + + See :class:`OpenclProcessing` for optional arguments description. + + :param int raw_size: + Size of the raw stream for decompression. + It can be (slightly) larger than the array. 
+ :param int dec_size: + Size of the decompression output array + (mandatory for decompression) + """ + + def __init__(self, raw_size=None, dec_size=None, + ctx=None, devicetype="all", + platformid=None, deviceid=None, + block_size=None, profile=False): + OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype, + platformid=platformid, deviceid=deviceid, + block_size=block_size, profile=profile) + if self.block_size is None: + self.block_size = self.device.max_work_group_size + wg = self.block_size + + buffers = [BufferDescription("counter", 1, numpy.int32, None)] + + if raw_size is None: + self.raw_size = -1 + self.padded_raw_size = -1 + else: + self.raw_size = int(raw_size) + self.padded_raw_size = int((self.raw_size + wg - 1) & ~(wg - 1)) + buffers += [ + BufferDescription("raw", self.padded_raw_size, numpy.int8, None), + BufferDescription("mask", self.padded_raw_size, numpy.int32, None), + BufferDescription("values", self.padded_raw_size, numpy.int32, None), + BufferDescription("exceptions", self.padded_raw_size, numpy.int32, None) + ] + + if dec_size is None: + self.dec_size = None + else: + self.dec_size = numpy.int32(dec_size) + buffers += [ + BufferDescription("data_float", self.dec_size, numpy.float32, None), + BufferDescription("data_int", self.dec_size, numpy.int32, None) + ] + + self.allocate_buffers(buffers, use_array=True) + + self.compile_kernels([os.path.join("codec", "byte_offset")]) + self.kernels.__setattr__("scan", self._init_double_scan()) + self.kernels.__setattr__("compression_scan", + self._init_compression_scan()) + + def _init_double_scan(self): + """"generates a double scan on indexes and values in one operation""" + arguments = "__global int *value", "__global int *index" + int2 = pyopencl.tools.get_or_register_dtype("int2") + input_expr = "index[i]>0 ? 
(int2)(0, 0) : (int2)(value[i], 1)" + scan_expr = "a+b" + neutral = "(int2)(0,0)" + output_statement = "value[i] = item.s0; index[i+1] = item.s1;" + + if self.block_size > 256: + knl = GenericScanKernel(self.ctx, + dtype=int2, + arguments=arguments, + input_expr=input_expr, + scan_expr=scan_expr, + neutral=neutral, + output_statement=output_statement) + else: # MacOS on CPU + knl = GenericDebugScanKernel(self.ctx, + dtype=int2, + arguments=arguments, + input_expr=input_expr, + scan_expr=scan_expr, + neutral=neutral, + output_statement=output_statement) + return knl + + def decode(self, raw, as_float=False, out=None): + """This function actually performs the decompression by calling the kernels + + :param numpy.ndarray raw: The compressed data as a 1D numpy array of char. + :param bool as_float: True to decompress as float32, + False (default) to decompress as int32 + :param pyopencl.array out: pyopencl array in which to place the result. + :return: The decompressed image as an pyopencl array. 
+ :rtype: pyopencl.array + """ + assert self.dec_size is not None, \ + "dec_size is a mandatory ByteOffset init argument for decompression" + + events = [] + with self.sem: + len_raw = numpy.int32(len(raw)) + if len_raw > self.padded_raw_size: + wg = self.block_size + self.raw_size = int(len(raw)) + self.padded_raw_size = (self.raw_size + wg - 1) & ~(wg - 1) + logger.info("increase raw buffer size to %s", self.padded_raw_size) + buffers = { + "raw": pyopencl.array.empty(self.queue, self.padded_raw_size, dtype=numpy.int8), + "mask": pyopencl.array.empty(self.queue, self.padded_raw_size, dtype=numpy.int32), + "exceptions": pyopencl.array.empty(self.queue, self.padded_raw_size, dtype=numpy.int32), + "values": pyopencl.array.empty(self.queue, self.padded_raw_size, dtype=numpy.int32), + } + self.cl_mem.update(buffers) + else: + wg = self.block_size + + evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["raw"].data, + raw, + is_blocking=False) + events.append(EventDescription("copy raw H -> D", evt)) + evt = self.kernels.fill_int_mem(self.queue, (self.padded_raw_size,), (wg,), + self.cl_mem["mask"].data, + numpy.int32(self.padded_raw_size), + numpy.int32(0), + numpy.int32(0)) + events.append(EventDescription("memset mask", evt)) + evt = self.kernels.fill_int_mem(self.queue, (1,), (1,), + self.cl_mem["counter"].data, + numpy.int32(1), + numpy.int32(0), + numpy.int32(0)) + events.append(EventDescription("memset counter", evt)) + evt = self.kernels.mark_exceptions(self.queue, (self.padded_raw_size,), (wg,), + self.cl_mem["raw"].data, + len_raw, + numpy.int32(self.raw_size), + self.cl_mem["mask"].data, + self.cl_mem["values"].data, + self.cl_mem["counter"].data, + self.cl_mem["exceptions"].data) + events.append(EventDescription("mark exceptions", evt)) + nb_exceptions = numpy.empty(1, dtype=numpy.int32) + evt = pyopencl.enqueue_copy(self.queue, nb_exceptions, self.cl_mem["counter"].data, + is_blocking=False) + events.append(EventDescription("copy counter D -> H", evt)) + 
evt.wait() + nbexc = int(nb_exceptions[0]) + if nbexc == 0: + logger.info("nbexc %i", nbexc) + else: + evt = self.kernels.treat_exceptions(self.queue, (nbexc,), (1,), + self.cl_mem["raw"].data, + len_raw, + self.cl_mem["mask"].data, + self.cl_mem["exceptions"].data, + self.cl_mem["values"].data + ) + events.append(EventDescription("treat_exceptions", evt)) + + #self.cl_mem["copy_values"] = self.cl_mem["values"].copy() + #self.cl_mem["copy_mask"] = self.cl_mem["mask"].copy() + evt = self.kernels.scan(self.cl_mem["values"], + self.cl_mem["mask"], + queue=self.queue, + size=int(len_raw), + wait_for=(evt,)) + events.append(EventDescription("double scan", evt)) + #evt.wait() + if out is not None: + if out.dtype == numpy.float32: + copy_results = self.kernels.copy_result_float + else: + copy_results = self.kernels.copy_result_int + else: + if as_float: + out = self.cl_mem["data_float"] + copy_results = self.kernels.copy_result_float + else: + out = self.cl_mem["data_int"] + copy_results = self.kernels.copy_result_int + evt = copy_results(self.queue, (self.padded_raw_size,), (wg,), + self.cl_mem["values"].data, + self.cl_mem["mask"].data, + len_raw, + self.dec_size, + out.data + ) + events.append(EventDescription("copy_results", evt)) + #evt.wait() + if self.profile: + self.events += events + return out + + __call__ = decode + + def _init_compression_scan(self): + """Initialize CBF compression scan kernels""" + preamble = """ + int compressed_size(int diff) { + int abs_diff = abs(diff); + + if (abs_diff < 128) { + return 1; + } + else if (abs_diff < 32768) { + return 3; + } + else { + return 7; + } + } + + void write(const int index, + const int diff, + global char *output) { + int abs_diff = abs(diff); + + if (abs_diff < 128) { + output[index] = (char) diff; + } + else if (abs_diff < 32768) { + output[index] = -128; + output[index + 1] = (char) (diff >> 0); + output[index + 2] = (char) (diff >> 8); + } + else { + output[index] = -128; + output[index + 1] = 0; + 
output[index + 2] = -128; + output[index + 3] = (char) (diff >> 0); + output[index + 4] = (char) (diff >> 8); + output[index + 5] = (char) (diff >> 16); + output[index + 6] = (char) (diff >> 24); + } + } + """ + arguments = "__global const int *data, __global char *compressed, __global int *size" + input_expr = "compressed_size((i == 0) ? data[0] : (data[i] - data[i - 1]))" + scan_expr = "a+b" + neutral = "0" + output_statement = """ + if (prev_item == 0) { // 1st thread store compressed data size + size[0] = last_item; + } + write(prev_item, (i == 0) ? data[0] : (data[i] - data[i - 1]), compressed); + """ + + if self.block_size >= 64: + knl = GenericScanKernel(self.ctx, + dtype=numpy.int32, + preamble=preamble, + arguments=arguments, + input_expr=input_expr, + scan_expr=scan_expr, + neutral=neutral, + output_statement=output_statement) + else: # MacOS on CPU + knl = GenericDebugScanKernel(self.ctx, + dtype=numpy.int32, + preamble=preamble, + arguments=arguments, + input_expr=input_expr, + scan_expr=scan_expr, + neutral=neutral, + output_statement=output_statement) + return knl + + def encode(self, data, out=None): + """Compress data to CBF. + + :param data: The data to compress as a numpy array + (or a pyopencl Array) of int32. + :type data: Union[numpy.ndarray, pyopencl.array.Array] + :param pyopencl.array out: + pyopencl array of int8 in which to store the result. + The array should be large enough to store the compressed data. + :return: The compressed data as a pyopencl array. + If out is provided, this array shares the backing buffer, + but has the exact size of the compressed data and the queue + of the ByteOffset instance. 
+ :rtype: pyopencl.array + :raises ValueError: if out array is not large enough + """ + + events = [] + with self.sem: + if isinstance(data, pyopencl.array.Array): + d_data = data # Uses provided array + + else: # Copy data to device + data = numpy.ascontiguousarray(data, dtype=numpy.int32).ravel() + + # Make sure data array exists and is large enough + if ("data_input" not in self.cl_mem or + self.cl_mem["data_input"].size < data.size): + logger.info("increase data input buffer size to %s", data.size) + self.cl_mem.update({ + "data_input": pyopencl.array.empty(self.queue, + data.size, + dtype=numpy.int32)}) + d_data = self.cl_mem["data_input"] + + evt = pyopencl.enqueue_copy( + self.queue, d_data.data, data, is_blocking=False) + events.append(EventDescription("copy data H -> D", evt)) + + # Make sure compressed array exists and is large enough + compressed_size = d_data.size * 7 + if ("compressed" not in self.cl_mem or + self.cl_mem["compressed"].size < compressed_size): + logger.info("increase compressed buffer size to %s", compressed_size) + self.cl_mem.update({ + "compressed": pyopencl.array.empty(self.queue, + compressed_size, + dtype=numpy.int8)}) + d_compressed = self.cl_mem["compressed"] + d_size = self.cl_mem["counter"] # Shared with decompression + + evt = self.kernels.compression_scan(d_data, d_compressed, d_size) + events.append(EventDescription("compression scan", evt)) + byte_count = int(d_size.get()[0]) + + if out is None: + # Create out array from a sub-region of the compressed buffer + out = pyopencl.array.Array( + self.queue, + shape=(byte_count,), + dtype=numpy.int8, + allocator=functools.partial( + d_compressed.base_data.get_sub_region, + d_compressed.offset)) + + elif out.size < byte_count: + raise ValueError( + "Provided output buffer is not large enough: " + "requires %d bytes, got %d" % (byte_count, out.size)) + + else: # out.size >= byte_count + # Create an array with a sub-region of out and this class queue + out = pyopencl.array.Array( + 
self.queue, + shape=(byte_count,), + dtype=numpy.int8, + allocator=functools.partial(out.base_data.get_sub_region, + out.offset)) + + evt = pyopencl.enqueue_copy(self.queue, out.data, d_compressed.data, + byte_count=byte_count) + events.append( + EventDescription("copy D -> D: internal -> out", evt)) + + if self.profile: + self.events += events + + return out + + def encode_to_bytes(self, data): + """Compresses data to CBF and returns compressed data as bytes. + + Usage: + + Provided an image (`image`) stored as a numpy array of int32, + first, create a byte offset compression/decompression object: + + >>> from silx.opencl.codec.byte_offset import ByteOffset + >>> byte_offset_codec = ByteOffset() + + Then, compress an image into bytes: + + >>> compressed = byte_offset_codec.encode_to_bytes(image) + + :param data: The data to compress as a numpy array + (or a pyopencl Array) of int32. + :type data: Union[numpy.ndarray, pyopencl.array.Array] + :return: The compressed data as bytes. + :rtype: bytes + """ + compressed_array = self.encode(data) + return compressed_array.get().tobytes() diff --git a/src/silx/opencl/codec/setup.py b/src/silx/opencl/codec/setup.py new file mode 100644 index 0000000..4a5c1e5 --- /dev/null +++ b/src/silx/opencl/codec/setup.py @@ -0,0 +1,43 @@ +# coding: utf-8 +# +# Copyright (C) 2016-2017 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# + +from __future__ import division + +__contact__ = "jerome.kieffer@esrf.eu" +__license__ = "MIT" +__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France" +__authors__ = ["J. Kieffer"] +__date__ = "13/10/2017" + +from numpy.distutils.misc_util import Configuration + + +def configuration(parent_package='', top_path=None): + config = Configuration('codec', parent_package, top_path) + config.add_subpackage('test') + return config + + +if __name__ == "__main__": + from numpy.distutils.core import setup + setup(configuration=configuration) diff --git a/src/silx/opencl/codec/test/__init__.py b/src/silx/opencl/codec/test/__init__.py new file mode 100644 index 0000000..325c2c7 --- /dev/null +++ b/src/silx/opencl/codec/test/__init__.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- +# +# Project: silx +# https://github.com/silx-kit/silx +# +# Copyright (C) 2013-2017 European Synchrotron Radiation Facility, Grenoble, France +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial 
portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. diff --git a/src/silx/opencl/codec/test/test_byte_offset.py b/src/silx/opencl/codec/test/test_byte_offset.py new file mode 100644 index 0000000..4b2d5a3 --- /dev/null +++ b/src/silx/opencl/codec/test/test_byte_offset.py @@ -0,0 +1,303 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Project: Byte-offset decompression in OpenCL +# https://github.com/silx-kit/silx +# +# Copyright (C) 2013-2020 European Synchrotron Radiation Facility, +# Grenoble, France +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +""" +Test suite for byte-offset decompression +""" + +from __future__ import division, print_function + +__authors__ = ["Jérôme Kieffer"] +__contact__ = "jerome.kieffer@esrf.eu" +__license__ = "MIT" +__copyright__ = "2013 European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "02/03/2021" + +import sys +import time +import logging +import numpy +from silx.opencl.common import ocl, pyopencl +from silx.opencl.codec import byte_offset +import fabio +import unittest +logger = logging.getLogger(__name__) + + +@unittest.skipUnless(ocl and pyopencl, + "PyOpenCl is missing") +class TestByteOffset(unittest.TestCase): + + @staticmethod + def _create_test_data(shape, nexcept, lam=200): + """Create test (image, compressed stream) pair. 
+ + :param shape: Shape of test image + :param int nexcept: Number of exceptions in the image + :param lam: Expectation of interval argument for numpy.random.poisson + :return: (reference image array, compressed stream) + """ + size = numpy.prod(shape) + ref = numpy.random.poisson(lam, numpy.prod(shape)) + exception_loc = numpy.random.randint(0, size, size=nexcept) + exception_value = numpy.random.randint(0, 1000000, size=nexcept) + ref[exception_loc] = exception_value + ref.shape = shape + + raw = fabio.compression.compByteOffset(ref) + return ref, raw + + def test_decompress(self): + """ + tests the byte offset decompression on GPU + """ + ref, raw = self._create_test_data(shape=(91, 97), nexcept=229) + # ref, raw = self._create_test_data(shape=(7, 9), nexcept=0) + + size = numpy.prod(ref.shape) + + try: + bo = byte_offset.ByteOffset(raw_size=len(raw), dec_size=size, profile=True) + except (RuntimeError, pyopencl.RuntimeError) as err: + logger.warning(err) + if sys.platform == "darwin": + raise unittest.SkipTest("Byte-offset decompression is known to be buggy on MacOS-CPU") + else: + raise err + print(bo.block_size) + + t0 = time.time() + res_cy = fabio.compression.decByteOffset(raw) + t1 = time.time() + res_cl = bo.decode(raw) + t2 = time.time() + delta_cy = abs(ref.ravel() - res_cy).max() + delta_cl = abs(ref.ravel() - res_cl.get()).max() + + logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.", + 1000.0 * (t1 - t0), + 1000.0 * (t2 - t1)) + bo.log_profile() + # print(ref) + # print(res_cl.get()) + self.assertEqual(delta_cy, 0, "Checks fabio works") + self.assertEqual(delta_cl, 0, "Checks opencl works") + + def test_many_decompress(self, ntest=10): + """ + tests the byte offset decompression on GPU, many images to ensure there + is not leaking in memory + """ + shape = (991, 997) + size = numpy.prod(shape) + ref, raw = self._create_test_data(shape=shape, nexcept=0, lam=100) + + try: + bo = byte_offset.ByteOffset(len(raw), size, profile=True) + 
except (RuntimeError, pyopencl.RuntimeError) as err: + logger.warning(err) + if sys.platform == "darwin": + raise unittest.SkipTest("Byte-offset decompression is known to be buggy on MacOS-CPU") + else: + raise err + t0 = time.time() + res_cy = fabio.compression.decByteOffset(raw) + t1 = time.time() + res_cl = bo(raw) + t2 = time.time() + delta_cy = abs(ref.ravel() - res_cy).max() + delta_cl = abs(ref.ravel() - res_cl.get()).max() + self.assertEqual(delta_cy, 0, "Checks fabio works") + self.assertEqual(delta_cl, 0, "Checks opencl works") + logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.", + 1000.0 * (t1 - t0), + 1000.0 * (t2 - t1)) + + for i in range(ntest): + ref, raw = self._create_test_data(shape=shape, nexcept=2729, lam=200) + + t0 = time.time() + res_cy = fabio.compression.decByteOffset(raw) + t1 = time.time() + res_cl = bo(raw) + t2 = time.time() + delta_cy = abs(ref.ravel() - res_cy).max() + delta_cl = abs(ref.ravel() - res_cl.get()).max() + self.assertEqual(delta_cy, 0, "Checks fabio works #%i" % i) + self.assertEqual(delta_cl, 0, "Checks opencl works #%i" % i) + + logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.", + 1000.0 * (t1 - t0), + 1000.0 * (t2 - t1)) + bo.log_profile(stats=True) + + def test_encode(self): + """Test byte offset compression""" + ref, raw = self._create_test_data(shape=(2713, 2719), nexcept=2729) + + try: + bo = byte_offset.ByteOffset(len(raw), ref.size, profile=True) + except (RuntimeError, pyopencl.RuntimeError) as err: + logger.warning(err) + raise err + + t0 = time.time() + compressed_array = bo.encode(ref) + t1 = time.time() + + compressed_stream = compressed_array.get().tobytes() + self.assertEqual(raw, compressed_stream) + + logger.debug("Global execution time: OpenCL: %.3fms.", + 1000.0 * (t1 - t0)) + bo.log_profile() + + def test_encode_to_array(self): + """Test byte offset compression while providing an out array""" + + ref, raw = self._create_test_data(shape=(2713, 2719), nexcept=2729) + 
+ try: + bo = byte_offset.ByteOffset(profile=True) + except (RuntimeError, pyopencl.RuntimeError) as err: + logger.warning(err) + raise err + # Test with out buffer too small + out = pyopencl.array.empty(bo.queue, (10,), numpy.int8) + with self.assertRaises(ValueError): + bo.encode(ref, out) + + # Test with out buffer too big + out = pyopencl.array.empty(bo.queue, (len(raw) + 10,), numpy.int8) + + compressed_array = bo.encode(ref, out) + + # Get size from returned array + compressed_size = compressed_array.size + self.assertEqual(compressed_size, len(raw)) + + # Get data from out array, read it from bo object queue + out_bo_queue = out.with_queue(bo.queue) + compressed_stream = out_bo_queue.get().tobytes()[:compressed_size] + self.assertEqual(raw, compressed_stream) + + def test_encode_to_bytes(self): + """Test byte offset compression to bytes""" + ref, raw = self._create_test_data(shape=(2713, 2719), nexcept=2729) + + try: + bo = byte_offset.ByteOffset(profile=True) + except (RuntimeError, pyopencl.RuntimeError) as err: + logger.warning(err) + raise err + + t0 = time.time() + res_fabio = fabio.compression.compByteOffset(ref) + t1 = time.time() + compressed_stream = bo.encode_to_bytes(ref) + t2 = time.time() + + self.assertEqual(raw, compressed_stream) + + logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.", + 1000.0 * (t1 - t0), + 1000.0 * (t2 - t1)) + bo.log_profile() + + def test_encode_to_bytes_from_array(self): + """Test byte offset compression to bytes from a pyopencl array. 
+ """ + ref, raw = self._create_test_data(shape=(2713, 2719), nexcept=2729) + + try: + bo = byte_offset.ByteOffset(profile=True) + except (RuntimeError, pyopencl.RuntimeError) as err: + logger.warning(err) + raise err + + d_ref = pyopencl.array.to_device( + bo.queue, ref.astype(numpy.int32).ravel()) + + t0 = time.time() + res_fabio = fabio.compression.compByteOffset(ref) + t1 = time.time() + compressed_stream = bo.encode_to_bytes(d_ref) + t2 = time.time() + + self.assertEqual(raw, compressed_stream) + + logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.", + 1000.0 * (t1 - t0), + 1000.0 * (t2 - t1)) + bo.log_profile() + + def test_many_encode(self, ntest=10): + """Test byte offset compression with many image""" + shape = (991, 997) + ref, raw = self._create_test_data(shape=shape, nexcept=0, lam=100) + + try: + bo = byte_offset.ByteOffset(profile=False) + except (RuntimeError, pyopencl.RuntimeError) as err: + logger.warning(err) + raise err + + bo_durations = [] + + t0 = time.time() + res_fabio = fabio.compression.compByteOffset(ref) + t1 = time.time() + compressed_stream = bo.encode_to_bytes(ref) + t2 = time.time() + bo_durations.append(1000.0 * (t2 - t1)) + + self.assertEqual(raw, compressed_stream) + logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.", + 1000.0 * (t1 - t0), + 1000.0 * (t2 - t1)) + + for i in range(ntest): + ref, raw = self._create_test_data(shape=shape, nexcept=2729, lam=200) + + t0 = time.time() + res_fabio = fabio.compression.compByteOffset(ref) + t1 = time.time() + compressed_stream = bo.encode_to_bytes(ref) + t2 = time.time() + bo_durations.append(1000.0 * (t2 - t1)) + + self.assertEqual(raw, compressed_stream) + logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.", + 1000.0 * (t1 - t0), + 1000.0 * (t2 - t1)) + + logger.debug("OpenCL execution time: Mean: %fms, Min: %fms, Max: %fms", + numpy.mean(bo_durations), + numpy.min(bo_durations), + numpy.max(bo_durations)) diff --git 
a/src/silx/opencl/common.py b/src/silx/opencl/common.py new file mode 100644 index 0000000..888b1da --- /dev/null +++ b/src/silx/opencl/common.py @@ -0,0 +1,694 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Project: S I L X project +# https://github.com/silx-kit/silx +# +# Copyright (C) 2012-2021 European Synchrotron Radiation Facility, Grenoble, France +# +# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu) +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+# + +__author__ = "Jerome Kieffer" +__contact__ = "Jerome.Kieffer@ESRF.eu" +__license__ = "MIT" +__copyright__ = "2012-2017 European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "29/09/2021" +__status__ = "stable" +__all__ = ["ocl", "pyopencl", "mf", "release_cl_buffers", "allocate_cl_buffers", + "measure_workgroup_size", "kernel_workgroup_size"] + +import os +import logging + +import numpy + +from .utils import get_opencl_code + +logger = logging.getLogger(__name__) + +if os.environ.get("SILX_OPENCL") in ["0", "False"]: + logger.info("Use of OpenCL has been disabled from environment variable: SILX_OPENCL=0") + pyopencl = None +else: + try: + import pyopencl + except ImportError: + logger.warning("Unable to import pyOpenCl. Please install it from: https://pypi.org/project/pyopencl") + pyopencl = None + else: + try: + pyopencl.get_platforms() + except pyopencl.LogicError: + logger.warning("The module pyOpenCL has been imported but can't be used here") + pyopencl = None + else: + import pyopencl.array as array + mf = pyopencl.mem_flags + +if pyopencl is None: + + # Define default mem flags + class mf(object): + WRITE_ONLY = 1 + READ_ONLY = 1 + READ_WRITE = 1 +else: + mf = pyopencl.mem_flags + +FLOP_PER_CORE = {"GPU": 64, # GPU, Fermi at least perform 64 flops per cycle/multicore, G80 were at 24 or 48 ... + "CPU": 4, # CPU, at least intel's have 4 operation per cycle + "ACC": 8} # ACC: the Xeon-phi (MIC) appears to be able to process 8 Flops per hyperthreaded-core + +# Sources : https://en.wikipedia.org/wiki/CUDA +NVIDIA_FLOP_PER_CORE = {(1, 0): 24, # Guessed ! + (1, 1): 24, # Measured on G98 [Quadro NVS 295] + (1, 2): 24, # Guessed ! + (1, 3): 24, # measured on a GT285 (GT200) + (2, 0): 64, # Measured on a 580 (GF110) + (2, 1): 96, # Measured on Quadro2000 GF106GL + (3, 0): 384, # Guessed! + (3, 5): 384, # Measured on K20 + (3, 7): 384, # K80: Guessed! 
+ (5, 0): 256, # Maxwell 4 warps/SM 2 flops/ CU + (5, 2): 256, # Titan-X + (5, 3): 256, # TX1 + (6, 0): 128, # GP100 + (6, 1): 128, # GP104 + (6, 2): 128, # ? + (7, 0): 128, # Volta # measured on Telsa V100 + (7, 2): 128, # Volta ? + (7, 5): 128, # Turing # measured on RTX 6000 + (8, 0): 128, # Ampere # measured on Tesla A100 + (8, 6): 256, # Ampere # measured on RTX A5000 + } + +AMD_FLOP_PER_CORE = 160 # Measured on a M7820 10 core, 700MHz 1120GFlops + + +class Device(object): + """ + Simple class that contains the structure of an OpenCL device + """ + + def __init__(self, name="None", dtype=None, version=None, driver_version=None, + extensions="", memory=None, available=None, + cores=None, frequency=None, flop_core=None, idx=0, workgroup=1): + """ + Simple container with some important data for the OpenCL device description. + + :param name: name of the device + :param dtype: device type: CPU/GPU/ACC... + :param version: driver version + :param driver_version: + :param extensions: List of opencl extensions + :param memory: maximum memory available on the device + :param available: is the device deactivated or not + :param cores: number of SM/cores + :param frequency: frequency of the device + :param flop_core: Flopating Point operation per core per cycle + :param idx: index of the device within the platform + :param workgroup: max workgroup size + """ + self.name = name.strip() + self.type = dtype + self.version = version + self.driver_version = driver_version + self.extensions = extensions.split() + self.memory = memory + self.available = available + self.cores = cores + self.frequency = frequency + self.id = idx + self.max_work_group_size = workgroup + if not flop_core: + flop_core = FLOP_PER_CORE.get(dtype, 1) + if cores and frequency: + self.flops = cores * frequency * flop_core + else: + self.flops = flop_core + + def __repr__(self): + return "%s" % self.name + + def pretty_print(self): + """ + Complete device description + + :return: string + """ + lst = 
["Name\t\t:\t%s" % self.name, + "Type\t\t:\t%s" % self.type, + "Memory\t\t:\t%.3f MB" % (self.memory / 2.0 ** 20), + "Cores\t\t:\t%s CU" % self.cores, + "Frequency\t:\t%s MHz" % self.frequency, + "Speed\t\t:\t%.3f GFLOPS" % (self.flops / 1000.), + "Version\t\t:\t%s" % self.version, + "Available\t:\t%s" % self.available] + return os.linesep.join(lst) + + def set_unavailable(self): + """Use this method to flag a faulty device + """ + self.available = False + + +class Platform(object): + """ + Simple class that contains the structure of an OpenCL platform + """ + + def __init__(self, name="None", vendor="None", version=None, extensions=None, idx=0): + """ + Class containing all descriptions of a platform and all devices description within that platform. + + :param name: platform name + :param vendor: name of the brand/vendor + :param version: + :param extensions: list of the extension provided by the platform to all of its devices + :param idx: index of the platform + """ + self.name = name.strip() + self.vendor = vendor.strip() + self.version = version + self.extensions = extensions.split() + self.devices = [] + self.id = idx + + def __repr__(self): + return "%s" % self.name + + def add_device(self, device): + """ + Add new device to the platform + + :param device: Device instance + """ + self.devices.append(device) + + def get_device(self, key): + """ + Return a device according to key + + :param key: identifier for a device, either it's id (int) or it's name + :type key: int or str + """ + out = None + try: + devid = int(key) + except ValueError: + for a_dev in self.devices: + if a_dev.name == key: + out = a_dev + else: + if len(self.devices) > devid > 0: + out = self.devices[devid] + return out + + +def _measure_workgroup_size(device_or_context, fast=False): + """Mesure the maximal work group size of the given device + + DEPRECATED since not perfectly correct ! 
+ + :param device_or_context: instance of pyopencl.Device or pyopencl.Context + or 2-tuple (platformid,deviceid) + :param fast: ask the kernel the valid value, don't probe it + :return: maximum size for the workgroup + """ + if isinstance(device_or_context, pyopencl.Device): + try: + ctx = pyopencl.Context(devices=[device_or_context]) + except pyopencl._cl.LogicError as error: + platform = device_or_context.platform + platformid = pyopencl.get_platforms().index(platform) + deviceid = platform.get_devices().index(device_or_context) + ocl.platforms[platformid].devices[deviceid].set_unavailable() + raise RuntimeError("Unable to create context on %s/%s: %s" % (platform, device_or_context, error)) + else: + device = device_or_context + elif isinstance(device_or_context, pyopencl.Context): + ctx = device_or_context + device = device_or_context.devices[0] + elif isinstance(device_or_context, (tuple, list)) and len(device_or_context) == 2: + ctx = ocl.create_context(platformid=device_or_context[0], + deviceid=device_or_context[1]) + device = ctx.devices[0] + else: + raise RuntimeError("""given parameter device_or_context is not an + instanciation of a device or a context""") + shape = device.max_work_group_size + # get the context + + assert ctx is not None + queue = pyopencl.CommandQueue(ctx) + + max_valid_wg = 1 + data = numpy.random.random(shape).astype(numpy.float32) + d_data = pyopencl.array.to_device(queue, data) + d_data_1 = pyopencl.array.empty_like(d_data) + d_data_1.fill(numpy.float32(1.0)) + + program = pyopencl.Program(ctx, get_opencl_code("addition")).build() + if fast: + max_valid_wg = program.addition.get_work_group_info(pyopencl.kernel_work_group_info.WORK_GROUP_SIZE, device) + else: + maxi = int(round(numpy.log2(shape))) + for i in range(maxi + 1): + d_res = pyopencl.array.empty_like(d_data) + wg = 1 << i + try: + evt = program.addition( + queue, (shape,), (wg,), + d_data.data, d_data_1.data, d_res.data, numpy.int32(shape)) + evt.wait() + except Exception 
as error: + logger.info("%s on device %s for WG=%s/%s", error, device.name, wg, shape) + program = queue = d_res = d_data_1 = d_data = None + break + else: + res = d_res.get() + good = numpy.allclose(res, data + 1) + if good: + if wg > max_valid_wg: + max_valid_wg = wg + else: + logger.warning("ArithmeticError on %s for WG=%s/%s", wg, device.name, shape) + + return max_valid_wg + + +def _is_nvidia_gpu(vendor, devtype): + return (vendor == "NVIDIA Corporation") and (devtype == "GPU") + + +class OpenCL(object): + """ + Simple class that wraps the structure ocl_tools_extended.h + + This is a static class. + ocl should be the only instance and shared among all python modules. + """ + + platforms = [] + nb_devices = 0 + context_cache = {} # key: 2-tuple of int, value: context + if pyopencl: + platform = device = pypl = devtype = extensions = pydev = None + for idx, platform in enumerate(pyopencl.get_platforms()): + pypl = Platform(platform.name, platform.vendor, platform.version, platform.extensions, idx) + for idd, device in enumerate(platform.get_devices()): + #################################################### + # Nvidia does not report int64 atomics (we are using) ... + # this is a hack around as any nvidia GPU with double-precision supports int64 atomics + #################################################### + extensions = device.extensions + if (pypl.vendor == "NVIDIA Corporation") and ('cl_khr_fp64' in extensions): + extensions += ' cl_khr_int64_base_atomics cl_khr_int64_extended_atomics' + try: + devtype = pyopencl.device_type.to_string(device.type).upper() + except ValueError: + # pocl does not describe itself as a CPU ! 
+ devtype = "CPU" + if len(devtype) > 3: + if "GPU" in devtype: + devtype = "GPU" + elif "ACC" in devtype: + devtype = "ACC" + elif "CPU" in devtype: + devtype = "CPU" + else: + devtype = devtype[:3] + if _is_nvidia_gpu(device.vendor, devtype) and ("compute_capability_major_nv" in dir(device)): + try: + comput_cap = device.compute_capability_major_nv, device.compute_capability_minor_nv + except pyopencl.LogicError: + flop_core = FLOP_PER_CORE["GPU"] + else: + flop_core = NVIDIA_FLOP_PER_CORE.get(comput_cap, FLOP_PER_CORE["GPU"]) + elif (pypl.vendor == "Advanced Micro Devices, Inc.") and (devtype == "GPU"): + flop_core = AMD_FLOP_PER_CORE + elif devtype == "CPU": + flop_core = FLOP_PER_CORE.get(devtype, 1) + else: + flop_core = 1 + workgroup = device.max_work_group_size + if (devtype == "CPU") and (pypl.vendor == "Apple"): + logger.info("For Apple's OpenCL on CPU: Measuring actual valid max_work_goup_size.") + workgroup = _measure_workgroup_size(device, fast=True) + if (devtype == "GPU") and os.environ.get("GPU") == "False": + # Environment variable to disable GPU devices + continue + pydev = Device(device.name, devtype, device.version, device.driver_version, extensions, + device.global_mem_size, bool(device.available), device.max_compute_units, + device.max_clock_frequency, flop_core, idd, workgroup) + pypl.add_device(pydev) + nb_devices += 1 + platforms.append(pypl) + del platform, device, pypl, devtype, extensions, pydev + + def __repr__(self): + out = ["OpenCL devices:"] + for platformid, platform in enumerate(self.platforms): + deviceids = ["(%s,%s) %s" % (platformid, deviceid, dev.name) + for deviceid, dev in enumerate(platform.devices)] + out.append("[%s] %s: " % (platformid, platform.name) + ", ".join(deviceids)) + return os.linesep.join(out) + + def get_platform(self, key): + """ + Return a platform according + + :param key: identifier for a platform, either an Id (int) or it's name + :type key: int or str + """ + out = None + try: + platid = int(key) + 
except ValueError: + for a_plat in self.platforms: + if a_plat.name == key: + out = a_plat + else: + if len(self.platforms) > platid > 0: + out = self.platforms[platid] + return out + + def select_device(self, dtype="ALL", memory=None, extensions=None, best=True, **kwargs): + """ + Select a device based on few parameters (at the end, keep the one with most memory) + + :param dtype: "gpu" or "cpu" or "all" .... + :param memory: minimum amount of memory (int) + :param extensions: list of extensions to be present + :param best: shall we look for the + :returns: A tuple of plateform ID and device ID, else None if nothing + found + """ + if extensions is None: + extensions = [] + if "type" in kwargs: + dtype = kwargs["type"].upper() + else: + dtype = dtype.upper() + if len(dtype) > 3: + dtype = dtype[:3] + best_found = None + for platformid, platform in enumerate(self.platforms): + for deviceid, device in enumerate(platform.devices): + if not device.available: + continue + if (dtype in ["ALL", "DEF"]) or (device.type == dtype): + if (memory is None) or (memory <= device.memory): + found = True + for ext in extensions: + if ext not in device.extensions: + found = False + if found: + if not best: + return platformid, deviceid + else: + if not best_found: + best_found = platformid, deviceid, device.flops + elif best_found[2] < device.flops: + best_found = platformid, deviceid, device.flops + if best_found: + return best_found[0], best_found[1] + + # Nothing found + return None + + def create_context(self, devicetype="ALL", useFp64=False, platformid=None, + deviceid=None, cached=True, memory=None, extensions=None): + """ + Choose a device and initiate a context. + + Devicetypes can be GPU,gpu,CPU,cpu,DEF,ACC,ALL. + Suggested are GPU,CPU. + For each setting to work there must be such an OpenCL device and properly installed. + E.g.: If Nvidia driver is installed, GPU will succeed but CPU will fail. + The AMD SDK kit is required for CPU via OpenCL. 
+ :param devicetype: string in ["cpu","gpu", "all", "acc"] + :param useFp64: boolean specifying if double precision will be used: deprecated use extensions=["cl_khr_fp64"] + :param platformid: integer + :param deviceid: integer + :param cached: True if we want to cache the context + :param memory: minimum amount of memory of the device + :param extensions: list of extensions to be present + :return: OpenCL context on the selected device + """ + if extensions is None: + extensions = [] + if useFp64: + logger.warning("Deprecation: please select your device using the extension name!, i.e. extensions=['cl_khr_fp64']") + extensions.append('cl_khr_fp64') + + if (platformid is not None) and (deviceid is not None): + platformid = int(platformid) + deviceid = int(deviceid) + elif "PYOPENCL_CTX" in os.environ: + pyopencl_ctx = [int(i) if i.isdigit() else 0 for i in os.environ["PYOPENCL_CTX"].split(":")] + pyopencl_ctx += [0] * (2 - len(pyopencl_ctx)) # pad with 0 + platformid, deviceid = pyopencl_ctx + else: + ids = ocl.select_device(type=devicetype, extensions=extensions) + if ids: + platformid, deviceid = ids + ctx = None + if (platformid is not None) and (deviceid is not None): + if (platformid, deviceid) in self.context_cache: + ctx = self.context_cache[(platformid, deviceid)] + else: + try: + ctx = pyopencl.Context(devices=[pyopencl.get_platforms()[platformid].get_devices()[deviceid]]) + except pyopencl._cl.LogicError as error: + self.platforms[platformid].devices[deviceid].set_unavailable() + logger.warning("Unable to create context on %s/%s: %s", platformid, deviceid, error) + ctx = None + else: + if cached: + self.context_cache[(platformid, deviceid)] = ctx + if ctx is None: + logger.warning("Last chance to get an OpenCL device ... 
probably not the one requested") + ctx = pyopencl.create_some_context(interactive=False) + return ctx + + def device_from_context(self, context): + """ + Retrieves the Device from the context + + :param context: OpenCL context + :return: instance of Device + """ + odevice = context.devices[0] + oplat = odevice.platform + device_id = oplat.get_devices().index(odevice) + platform_id = pyopencl.get_platforms().index(oplat) + return self.platforms[platform_id].devices[device_id] + + +if pyopencl: + ocl = OpenCL() + if ocl.nb_devices == 0: + ocl = None +else: + ocl = None + + +def release_cl_buffers(cl_buffers): + """ + :param cl_buffers: the buffer you want to release + :type cl_buffers: dict(str, pyopencl.Buffer) + + This method release the memory of the buffers store in the dict + """ + for key, buffer_ in cl_buffers.items(): + if buffer_ is not None: + if isinstance(buffer_, pyopencl.array.Array): + try: + buffer_.data.release() + except pyopencl.LogicError: + logger.error("Error while freeing buffer %s", key) + else: + try: + buffer_.release() + except pyopencl.LogicError: + logger.error("Error while freeing buffer %s", key) + cl_buffers[key] = None + return cl_buffers + + +def allocate_cl_buffers(buffers, device=None, context=None): + """ + :param buffers: the buffers info use to create the pyopencl.Buffer + :type buffers: list(std, flag, numpy.dtype, int) + :param device: one of the context device + :param context: opencl contextdevice + :return: a dict containing the instanciated pyopencl.Buffer + :rtype: dict(str, pyopencl.Buffer) + + This method instanciate the pyopencl.Buffer from the buffers + description. 
+ """ + mem = {} + if device is None: + device = ocl.device_from_context(context) + + # check if enough memory is available on the device + ualloc = 0 + for _, _, dtype, size in buffers: + ualloc += numpy.dtype(dtype).itemsize * size + memory = device.memory + logger.info("%.3fMB are needed on device which has %.3fMB", + ualloc / 1.0e6, memory / 1.0e6) + if ualloc >= memory: + memError = "Fatal error in allocate_buffers." + memError += "Not enough device memory for buffers" + memError += "(%lu requested, %lu available)" % (ualloc, memory) + raise MemoryError(memError) # noqa + + # do the allocation + try: + for name, flag, dtype, size in buffers: + mem[name] = pyopencl.Buffer(context, flag, + numpy.dtype(dtype).itemsize * size) + except pyopencl.MemoryError as error: + release_cl_buffers(mem) + raise MemoryError(error) + + return mem + + +def allocate_texture(ctx, shape, hostbuf=None, support_1D=False): + """ + Allocate an OpenCL image ("texture"). + + :param ctx: OpenCL context + :param shape: Shape of the image. Note that pyopencl and OpenCL < 1.2 + do not support 1D images, so 1D images are handled as 2D with one row + :param support_1D: force the image to be 1D if the shape has only one dim + """ + if len(shape) == 1 and not(support_1D): + shape = (1,) + shape + return pyopencl.Image( + ctx, + pyopencl.mem_flags.READ_ONLY | pyopencl.mem_flags.USE_HOST_PTR, + pyopencl.ImageFormat( + pyopencl.channel_order.INTENSITY, + pyopencl.channel_type.FLOAT + ), + hostbuf=numpy.zeros(shape[::-1], dtype=numpy.float32) + ) + + +def check_textures_availability(ctx): + """ + Check whether textures are supported on the current OpenCL context. 
+ + :param ctx: OpenCL context + """ + try: + dummy_texture = allocate_texture(ctx, (16, 16)) + # Need to further access some attributes (pocl) + dummy_height = dummy_texture.height + textures_available = True + del dummy_texture, dummy_height + except (pyopencl.RuntimeError, pyopencl.LogicError): + textures_available = False + # Nvidia Fermi GPUs (compute capability 2.X) do not support opencl read_imagef + # There is no way to detect this until a kernel is compiled + try: + cc = ctx.devices[0].compute_capability_major_nv + textures_available &= (cc >= 3) + except (pyopencl.LogicError, AttributeError): # probably not a Nvidia GPU + pass + # + return textures_available + + +def measure_workgroup_size(device): + """Measure the actual size of the workgroup + + :param device: device or context or 2-tuple with indexes + :return: the actual measured workgroup size + + if device is "all", returns a dict with all devices with their ids as keys. + """ + if (ocl is None) or (device is None): + return None + + if isinstance(device, tuple) and (len(device) == 2): + # this is probably a tuple (platformid, deviceid) + device = ocl.create_context(platformid=device[0], deviceid=device[1]) + + if device == "all": + res = {} + for pid, platform in enumerate(ocl.platforms): + for did, _devices in enumerate(platform.devices): + tup = (pid, did) + res[tup] = measure_workgroup_size(tup) + else: + res = _measure_workgroup_size(device) + return res + + +def query_kernel_info(program, kernel, what="WORK_GROUP_SIZE"): + """Extract the compile time information from a kernel + + :param program: OpenCL program + :param kernel: kernel or name of the kernel + :param what: what is the query about ? + :return: int or 3-int for the workgroup size. 
+ + Possible information available are: + * 'COMPILE_WORK_GROUP_SIZE': Returns the work-group size specified inside the kernel (__attribute__((reqd_work_gr oup_size(X, Y, Z)))) + * 'GLOBAL_WORK_SIZE': maximum global size that can be used to execute a kernel #OCL2.1! + * 'LOCAL_MEM_SIZE': amount of local memory in bytes being used by the kernel + * 'PREFERRED_WORK_GROUP_SIZE_MULTIPLE': preferred multiple of workgroup size for launch. This is a performance hint. + * 'PRIVATE_MEM_SIZE' Returns the minimum amount of private memory, in bytes, used by each workitem in the kernel + * 'WORK_GROUP_SIZE': maximum work-group size that can be used to execute a kernel on a specific device given by device + + Further information on: + https://www.khronos.org/registry/OpenCL/sdk/1.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html + + """ + assert isinstance(program, pyopencl.Program) + if not isinstance(kernel, pyopencl.Kernel): + kernel_name = kernel + assert kernel in (k.function_name for k in program.all_kernels()), "the kernel exists" + kernel = program.__getattr__(kernel_name) + + device = program.devices[0] + query_wg = getattr(pyopencl.kernel_work_group_info, what) + return kernel.get_work_group_info(query_wg, device) + + +def kernel_workgroup_size(program, kernel): + """Extract the compile time maximum workgroup size + + :param program: OpenCL program + :param kernel: kernel or name of the kernel + :return: the maximum acceptable workgroup size for the given kernel + """ + return query_kernel_info(program, kernel, what="WORK_GROUP_SIZE") diff --git a/src/silx/opencl/conftest.py b/src/silx/opencl/conftest.py new file mode 100644 index 0000000..1fdc516 --- /dev/null +++ b/src/silx/opencl/conftest.py @@ -0,0 +1,5 @@ +import pytest + +@pytest.mark.usefixtures("use_opencl") +def setup_module(module): + pass diff --git a/src/silx/opencl/convolution.py b/src/silx/opencl/convolution.py new file mode 100644 index 0000000..15ef931 --- /dev/null +++ b/src/silx/opencl/convolution.py @@ 
-0,0 +1,442 @@ +#!/usr/bin/env python +# coding: utf-8 +# /*########################################################################## +# +# Copyright (c) 2019 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# ###########################################################################*/ +"""Module for convolution on CPU/GPU.""" + +from __future__ import absolute_import, print_function, with_statement, division + +__authors__ = ["P. Paleo"] +__license__ = "MIT" +__date__ = "01/08/2019" + +import numpy as np +from copy import copy # python2 +from .common import pyopencl as cl +import pyopencl.array as parray +from .processing import OpenclProcessing, EventDescription +from .utils import ConvolutionInfos + +class Convolution(OpenclProcessing): + """ + A class for performing convolution on CPU/GPU with OpenCL. 
+ """ + + def __init__(self, shape, kernel, axes=None, mode=None, ctx=None, + devicetype="all", platformid=None, deviceid=None, + profile=False, extra_options=None): + """Constructor of OpenCL Convolution. + + :param shape: shape of the array. + :param kernel: convolution kernel (1D, 2D or 3D). + :param axes: axes along which the convolution is performed, + for batched convolutions. + :param mode: Boundary handling mode. Available modes are: + "reflect": cba|abcd|dcb + "nearest": aaa|abcd|ddd + "wrap": bcd|abcd|abc + "constant": 000|abcd|000 + Default is "reflect". + :param ctx: actual working context, left to None for automatic + initialization from device type or platformid/deviceid + :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL" + :param platformid: integer with the platform_identifier, as given by + clinfo + :param deviceid: Integer with the device identifier, as given by clinfo + :param profile: switch on profiling to be able to profile at the kernel + level, store profiling elements (makes code slightly + slower) + :param extra_options: Advanced options (dict). 
Current options are: + "allocate_input_array": True, + "allocate_output_array": True, + "allocate_tmp_array": True, + "dont_use_textures": False, + """ + OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype, + platformid=platformid, deviceid=deviceid, + profile=profile) + + self._configure_extra_options(extra_options) + self._determine_use_case(shape, kernel, axes) + self._allocate_memory(mode) + self._init_kernels() + + def _configure_extra_options(self, extra_options): + self.extra_options = { + "allocate_input_array": True, + "allocate_output_array": True, + "allocate_tmp_array": True, + "dont_use_textures": False, + } + extra_opts = extra_options or {} + self.extra_options.update(extra_opts) + self.use_textures = not(self.extra_options["dont_use_textures"]) + self.use_textures &= self.check_textures_availability() + + def _get_dimensions(self, shape, kernel): + self.shape = shape + self.data_ndim = self._check_dimensions(shape=shape, name="Data") + self.kernel_ndim = self._check_dimensions(arr=kernel, name="Kernel") + Nx = shape[-1] + if self.data_ndim >= 2: + Ny = shape[-2] + else: + Ny = 1 + if self.data_ndim >= 3: + Nz = shape[-3] + else: + Nz = 1 + self.Nx = np.int32(Nx) + self.Ny = np.int32(Ny) + self.Nz = np.int32(Nz) + + def _determine_use_case(self, shape, kernel, axes): + """ + Determine the convolution use case from the input/kernel shape, and axes. 
+ """ + self._get_dimensions(shape, kernel) + if self.kernel_ndim > self.data_ndim: + raise ValueError("Kernel dimensions cannot exceed data dimensions") + data_ndim = self.data_ndim + kernel_ndim = self.kernel_ndim + self.kernel = kernel.astype("f") + + convol_infos = ConvolutionInfos() + k = (data_ndim, kernel_ndim) + if k not in convol_infos.use_cases: + raise ValueError( + "Cannot find a use case for data ndim = %d and kernel ndim = %d" + % (data_ndim, kernel_ndim) + ) + possible_use_cases = convol_infos.use_cases[k] + + self.use_case_name = None + for uc_name, uc_params in possible_use_cases.items(): + if axes in convol_infos.allowed_axes[uc_name]: + self.use_case_name = uc_name + self.use_case_desc = uc_params["name"] + #~ self.use_case_kernels = uc_params["kernels"].copy() + self.use_case_kernels = copy(uc_params["kernels"]) # TODO use the above line once we get rid of python2 + if self.use_case_name is None: + raise ValueError( + "Cannot find a use case for data ndim = %d, kernel ndim = %d and axes=%s" + % (data_ndim, kernel_ndim, str(axes)) + ) + # TODO implement this use case + if self.use_case_name == "batched_separable_2D_1D_3D": + raise NotImplementedError( + "The use case %s is not implemented" + % self.use_case_name + ) + # + self.axes = axes + # Replace "axes=None" with an actual value (except for ND-ND) + allowed_axes = convol_infos.allowed_axes[self.use_case_name] + if len(allowed_axes) > 1: + # The default choice might impact perfs + self.axes = allowed_axes[0] or allowed_axes[1] + self.separable = self.use_case_name.startswith("separable") + self.batched = self.use_case_name.startswith("batched") + # Update kernel names when using textures + if self.use_textures: + for i, kern_name in enumerate(self.use_case_kernels): + self.use_case_kernels[i] = kern_name + "_tex" + + def _allocate_memory(self, mode): + self.mode = mode or "reflect" + option_array_names = { + "allocate_input_array": "data_in", + "allocate_output_array": "data_out", + 
"allocate_tmp_array": "data_tmp", + } + # Nonseparable transforms do not need tmp array + if not(self.separable): + self.extra_options["allocate_tmp_array"] = False + # Allocate arrays + for option_name, array_name in option_array_names.items(): + if self.extra_options[option_name]: + value = parray.empty(self.queue, self.shape, np.float32) + value.fill(np.float32(0.0)) + else: + value = None + setattr(self, array_name, value) + + if isinstance(self.kernel, np.ndarray): + self.d_kernel = parray.to_device(self.queue, self.kernel) + else: + if not(isinstance(self.kernel, parray.Array)): + raise ValueError("kernel must be either numpy array or pyopencl array") + self.d_kernel = self.kernel + self._old_input_ref = None + self._old_output_ref = None + if self.use_textures: + self._allocate_textures() + self._c_modes_mapping = { + "periodic": 2, + "wrap": 2, + "nearest": 1, + "replicate": 1, + "reflect": 0, + "constant": 3, + } + mp = self._c_modes_mapping + if self.mode.lower() not in mp: + raise ValueError( + """ + Mode %s is not available for textures. 
Available modes are: + %s + """ + % (self.mode, str(mp.keys())) + ) + # TODO + if not(self.use_textures) and self.mode.lower() == "constant": + raise NotImplementedError( + "mode='constant' is not implemented without textures yet" + ) + # + self._c_conv_mode = mp[self.mode] + + def _allocate_textures(self): + self.data_in_tex = self.allocate_texture(self.shape) + self.d_kernel_tex = self.allocate_texture(self.kernel.shape) + self.transfer_to_texture(self.d_kernel, self.d_kernel_tex) + + def _init_kernels(self): + if self.kernel_ndim > 1: + if np.abs(np.diff(self.kernel.shape)).max() > 0: + raise NotImplementedError( + "Non-separable convolution with non-square kernels is not implemented yet" + ) + compile_options = [str("-DUSED_CONV_MODE=%d" % self._c_conv_mode)] + if self.use_textures: + kernel_files = ["convolution_textures.cl"] + compile_options.extend([ + str("-DIMAGE_DIMS=%d" % self.data_ndim), + str("-DFILTER_DIMS=%d" % self.kernel_ndim), + ]) + d_kernel_ref = self.d_kernel_tex + else: + kernel_files = ["convolution.cl"] + d_kernel_ref = self.d_kernel.data + self.compile_kernels( + kernel_files=kernel_files, + compile_options=compile_options + ) + self.ndrange = self.shape[::-1] + self.wg = None + kernel_args = [ + self.queue, + self.ndrange, self.wg, + None, + None, + d_kernel_ref, + np.int32(self.kernel.shape[0]), + self.Nx, self.Ny, self.Nz + ] + if self.kernel_ndim == 2: + kernel_args.insert(6, np.int32(self.kernel.shape[1])) + if self.kernel_ndim == 3: + kernel_args.insert(6, np.int32(self.kernel.shape[2])) + kernel_args.insert(7, np.int32(self.kernel.shape[1])) + self.kernel_args = tuple(kernel_args) + # If self.data_tmp is allocated, separable transforms can be performed + # by a series of batched transforms, without any copy, by swapping refs. 
+ self.swap_pattern = None + if self.separable: + if self.data_tmp is not None: + self.swap_pattern = { + 2: [ + ("data_in", "data_tmp"), + ("data_tmp", "data_out") + ], + 3: [ + ("data_in", "data_out"), + ("data_out", "data_tmp"), + ("data_tmp", "data_out"), + ], + } + else: + # TODO + raise NotImplementedError("For now, data_tmp has to be allocated") + + def _get_swapped_arrays(self, i): + """ + Get the input and output arrays to use when using a "swap pattern". + Swapping refs enables to avoid copies between temp. array and output. + For example, a separable 2D->1D convolution on 2D data reads: + data_tmp = convol(data_input, kernel, axis=1) # step i=0 + data_out = convol(data_tmp, kernel, axis=0) # step i=1 + + :param i: current step number of the separable convolution + """ + if self.use_textures: + # copy is needed when using texture, as data_out is a Buffer + if i > 0: + self.transfer_to_texture(self.data_out, self.data_in_tex) + return self.data_in_tex, self.data_out + n_batchs = len(self.axes) + in_ref, out_ref = self.swap_pattern[n_batchs][i] + d_in = getattr(self, in_ref) + d_out = getattr(self, out_ref) + return d_in, d_out + + def _configure_kernel_args(self, opencl_kernel_args, input_ref, output_ref): + # TODO more elegant + if isinstance(input_ref, parray.Array): + input_ref = input_ref.data + if isinstance(output_ref, parray.Array): + output_ref = output_ref.data + if input_ref is not None or output_ref is not None: + opencl_kernel_args = list(opencl_kernel_args) + if input_ref is not None: + opencl_kernel_args[3] = input_ref + if output_ref is not None: + opencl_kernel_args[4] = output_ref + opencl_kernel_args = tuple(opencl_kernel_args) + return opencl_kernel_args + + @staticmethod + def _check_dimensions(arr=None, shape=None, name="", dim_min=1, dim_max=3): + if shape is not None: + ndim = len(shape) + elif arr is not None: + ndim = arr.ndim + else: + raise ValueError("Please provide either arr= or shape=") + if ndim < dim_min or ndim > dim_max: 
+ raise ValueError("%s dimensions should be between %d and %d" + % (name, dim_min, dim_max) + ) + return ndim + + def _check_array(self, arr): + # TODO allow cl.Buffer + if not(isinstance(arr, parray.Array) or isinstance(arr, np.ndarray)): + raise TypeError("Expected either pyopencl.array.Array or numpy.ndarray") + # TODO composition with ImageProcessing/cast + if arr.dtype != np.float32: + raise TypeError("Data must be float32") + if arr.shape != self.shape: + raise ValueError("Expected data shape = %s" % str(self.shape)) + + def _set_arrays(self, array, output=None): + # When using textures: copy + if self.use_textures: + self.transfer_to_texture(array, self.data_in_tex) + data_in_ref = self.data_in_tex + else: + # Otherwise: copy H->D or update references. + if isinstance(array, np.ndarray): + self.data_in[:] = array[:] + else: + self._old_input_ref = self.data_in + self.data_in = array + data_in_ref = self.data_in + if output is not None: + if not(isinstance(output, np.ndarray)): + self._old_output_ref = self.data_out + self.data_out = output + # Update OpenCL kernel arguments with new array references + self.kernel_args = self._configure_kernel_args( + self.kernel_args, + data_in_ref, + self.data_out + ) + + def _separable_convolution(self): + assert len(self.axes) == len(self.use_case_kernels) + # Separable: one kernel call per data dimension + for i, axis in enumerate(self.axes): + in_ref, out_ref = self._get_swapped_arrays(i) + self._batched_convolution(axis, input_ref=in_ref, output_ref=out_ref) + + def _batched_convolution(self, axis, input_ref=None, output_ref=None): + # Batched: one kernel call in total + opencl_kernel = self.kernels.get_kernel(self.use_case_kernels[axis]) + opencl_kernel_args = self._configure_kernel_args( + self.kernel_args, + input_ref, + output_ref + ) + ev = opencl_kernel(*opencl_kernel_args) + if self.profile: + self.events.append(EventDescription("batched convolution", ev)) + + def _nd_convolution(self): + assert 
len(self.use_case_kernels) == 1 + opencl_kernel = self.kernels.get_kernel(self.use_case_kernels[0]) + ev = opencl_kernel(*self.kernel_args) + if self.profile: + self.events.append(EventDescription("ND convolution", ev)) + + def _recover_arrays_references(self): + if self._old_input_ref is not None: + self.data_in = self._old_input_ref + self._old_input_ref = None + if self._old_output_ref is not None: + self.data_out = self._old_output_ref + self._old_output_ref = None + self.kernel_args = self._configure_kernel_args( + self.kernel_args, + self.data_in, + self.data_out + ) + + def _get_output(self, output): + if output is None: + res = self.data_out.get() + else: + res = output + if isinstance(output, np.ndarray): + output[:] = self.data_out[:] + self._recover_arrays_references() + return res + + def convolve(self, array, output=None): + """ + Convolve an array with the class kernel. + + :param array: Input array. Can be numpy.ndarray or pyopencl.array.Array. + :param output: Output array. Can be numpy.ndarray or pyopencl.array.Array. 
+ """ + self._check_array(array) + self._set_arrays(array, output=output) + if self.axes is not None: + if self.separable: + self._separable_convolution() + elif self.batched: + assert len(self.axes) == 1 + self._batched_convolution(self.axes[0]) + # else: ND-ND convol + else: + # ND-ND convol + self._nd_convolution() + + res = self._get_output(output) + return res + + + __call__ = convolve + + diff --git a/src/silx/opencl/image.py b/src/silx/opencl/image.py new file mode 100644 index 0000000..65e2d5e --- /dev/null +++ b/src/silx/opencl/image.py @@ -0,0 +1,387 @@ +# -*- coding: utf-8 -*- +# +# Project: silx +# https://github.com/silx-kit/silx +# +# Copyright (C) 2012-2017 European Synchrotron Radiation Facility, Grenoble, France +# +# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# . +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# . +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+ +"""A general purpose library for manipulating 2D images in 1 or 3 colors + +""" +from __future__ import absolute_import, print_function, with_statement, division + + +__author__ = "Jerome Kieffer" +__license__ = "MIT" +__date__ = "12/02/2018" +__copyright__ = "2012-2017, ESRF, Grenoble" +__contact__ = "jerome.kieffer@esrf.fr" + +import os +import logging +import numpy +from collections import OrderedDict +from math import floor, ceil, sqrt, log + +from .common import pyopencl, kernel_workgroup_size +from .processing import EventDescription, OpenclProcessing, BufferDescription + +if pyopencl: + mf = pyopencl.mem_flags +logger = logging.getLogger(__name__) + + +class ImageProcessing(OpenclProcessing): + + kernel_files = ["cast", "map", "max_min", "histogram"] + + converter = {numpy.dtype(numpy.uint8): "u8_to_float", + numpy.dtype(numpy.int8): "s8_to_float", + numpy.dtype(numpy.uint16): "u16_to_float", + numpy.dtype(numpy.int16): "s16_to_float", + numpy.dtype(numpy.uint32): "u32_to_float", + numpy.dtype(numpy.int32): "s32_to_float", + } + + def __init__(self, shape=None, ncolors=1, template=None, + ctx=None, devicetype="all", platformid=None, deviceid=None, + block_size=None, memory=None, profile=False): + """Constructor of the ImageProcessing class + + :param ctx: actual working context, left to None for automatic + initialization from device type or platformid/deviceid + :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL" + :param platformid: integer with the platform_identifier, as given by clinfo + :param deviceid: Integer with the device identifier, as given by clinfo + :param block_size: preferred workgroup size, may vary depending on the + out come of the compilation + :param memory: minimum memory available on device + :param profile: switch on profiling to be able to profile at the kernel + level, store profiling elements (makes code slightly slower) + """ + OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype, + 
platformid=platformid, deviceid=deviceid, + block_size=block_size, memory=memory, profile=profile) + if template is not None: + shape = template.shape + if len(shape) > 2: + self.ncolors = shape[2] + self.shape = shape[:2] + else: + self.ncolors = 1 + self.shape = shape + else: + self.ncolors = ncolors + self.shape = shape + assert shape is not None + self.buffer_shape = self.shape if self.ncolors == 1 else self.shape + (self.ncolors,) + kernel_files = [os.path.join("image", i) for i in self.kernel_files] + self.compile_kernels(kernel_files, + compile_options="-DNB_COLOR=%i" % self.ncolors) + if self.ncolors == 1: + img_shape = self.shape + else: + img_shape = self.shape + (self.ncolors,) + + buffers = [BufferDescription("image0_d", img_shape, numpy.float32, None), + BufferDescription("image1_d", img_shape, numpy.float32, None), + BufferDescription("image2_d", img_shape, numpy.float32, None), + BufferDescription("max_min_d", 2, numpy.float32, None), + BufferDescription("cnt_d", 1, numpy.int32, None), ] + # Temporary buffer for max-min reduction + self.wg_red = kernel_workgroup_size(self.program, self.kernels.max_min_reduction_stage1) + if self.wg_red > 1: + self.wg_red = min(self.wg_red, + numpy.int32(1 << int(floor(log(sqrt(numpy.prod(self.shape)), 2))))) + tmp = BufferDescription("tmp_max_min_d", 2 * self.wg_red, numpy.float32, None) + buffers.append(tmp) + self.allocate_buffers(buffers, use_array=True) + self.cl_mem["cnt_d"].fill(0) + + def __repr__(self): + return "ImageProcessing for shape=%s, %i colors initalized on %s" % \ + (self.shape, self.ncolors, self.ctx.devices[0].name) + + def _get_in_out_buffers(self, img=None, copy=True, out=None, + out_dtype=None, out_size=None): + """Internal method used to select the proper buffers before processing. 
+ + :param img: expects a numpy array or a pyopencl.array of dim 2 or 3 + :param copy: set to False to directly re-use a pyopencl array + :param out: provide an output buffer to store the result + :param out_dtype: enforce the type of the output buffer (optional) + :param out_size: enforce the size of the output buffer (optional) + :return: input_buffer, output_buffer + + Nota: this is not locked. + """ + events = [] + if out is not None and isinstance(out, pyopencl.array.Array): + if (out_size or out_dtype) is not None: + if out_size is not None: + assert out.size > out_size + if out_dtype is not None: + assert out_dtype == out.dtype + else: # assume it is same size and type as weoking buffer + assert out.shape == self.buffer_shape + assert out.dtype == numpy.float32 + out.finish() + output_array = out + else: + if out_dtype != numpy.float32 and out_size: + name = "%s_%s_d" % (numpy.dtype(out_dtype), out_size) + if name not in self.cl_mem: + output_array = self.cl_mem[name] = pyopencl.array.empty(self.queue, (out_size,), out_dtype) + else: + output_array = self.cl_mem[name] + else: + output_array = self.cl_mem["image2_d"] + + if img is None: + input_array = self.cl_mem["image1_d"] + if isinstance(img, pyopencl.array.Array): + if copy: + evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["image1_d"].data, img.data) + input_array = self.cl_mem["image1_d"] + events.append(EventDescription("copy D->D", evt)) + else: + img.finish() + input_array = img + evt = None + else: + # assume this is numpy + if img.dtype.itemsize > 4: + logger.warning("Casting to float32 on CPU") + evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["image1_d"].data, numpy.ascontiguousarray(img, numpy.float32)) + input_array = self.cl_mem["image1_d"] + events.append(EventDescription("cast+copy H->D", evt)) + else: + evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["image1_d"].data, numpy.ascontiguousarray(img)) + input_array = self.cl_mem["image1_d"] + events.append(EventDescription("copy 
H->D", evt)) + if self.profile: + self.events += events + return input_array, output_array + + def to_float(self, img, copy=True, out=None): + """ Takes any array and convert it to a float array for ease of processing. + + :param img: expects a numpy array or a pyopencl.array of dim 2 or 3 + :param copy: set to False to directly re-use a pyopencl array + :param out: provide an output buffer to store the result + """ + assert img.shape == self.buffer_shape + + events = [] + with self.sem: + input_array, output_array = self._get_in_out_buffers(img, copy, out) + if (img.dtype.itemsize > 4) or (img.dtype == numpy.float32): + # copy device -> device, already there as float32 + ev = pyopencl.enqueue_copy(self.queue, output_array.data, input_array.data) + events.append(EventDescription("copy D->D", ev)) + else: + # Cast to float: + name = self.converter[img.dtype] + kernel = self.kernels.get_kernel(name) + ev = kernel(self.queue, (self.shape[1], self.shape[0]), None, + input_array.data, output_array.data, + numpy.int32(self.shape[1]), numpy.int32(self.shape[0]) + ) + events.append(EventDescription("cast %s" % name, ev)) + + if self.profile: + self.events += events + if out is None: + res = output_array.get() + return res + else: + output_array.finish() + return output_array + + def normalize(self, img, mini=0.0, maxi=1.0, copy=True, out=None): + """Scale the intensity of the image so that the minimum is 0 and the + maximum is 1.0 (or any value suggested). + + :param img: numpy array or pyopencl array of dim 2 or 3 and of type float + :param mini: Expected minimum value + :param maxi: expected maxiumum value + :param copy: set to False to use directly the input buffer + :param out: provides an output buffer. 
prevents a copy D->H + + This uses a min/max reduction in two stages plus a map operation + """ + assert img.shape == self.buffer_shape + events = [] + with self.sem: + input_array, output_array = self._get_in_out_buffers(img, copy, out) + size = numpy.int32(numpy.prod(self.shape)) + if self.wg_red == 1: + # Probably on MacOS CPU WG==1 --> serial code. + kernel = self.kernels.get_kernel("max_min_serial") + evt = kernel(self.queue, (1,), (1,), + input_array.data, + size, + self.cl_mem["max_min_d"].data) + ed = EventDescription("max_min_serial", evt) + events.append(ed) + else: + stage1 = self.kernels.max_min_reduction_stage1 + stage2 = self.kernels.max_min_reduction_stage2 + local_mem = pyopencl.LocalMemory(int(self.wg_red * 8)) + k1 = stage1(self.queue, (int(self.wg_red ** 2),), (int(self.wg_red),), + input_array.data, + self.cl_mem["tmp_max_min_d"].data, + size, + local_mem) + k2 = stage2(self.queue, (int(self.wg_red),), (int(self.wg_red),), + self.cl_mem["tmp_max_min_d"].data, + self.cl_mem["max_min_d"].data, + local_mem) + + events += [EventDescription("max_min_stage1", k1), + EventDescription("max_min_stage2", k2)] + + evt = self.kernels.normalize_image(self.queue, (self.shape[1], self.shape[0]), None, + input_array.data, output_array.data, + numpy.int32(self.shape[1]), numpy.int32(self.shape[0]), + self.cl_mem["max_min_d"].data, + numpy.float32(mini), numpy.float32(maxi)) + events.append(EventDescription("normalize", evt)) + if self.profile: + self.events += events + + if out is None: + res = output_array.get() + return res + else: + output_array.finish() + return output_array + + def histogram(self, img=None, nbins=255, range=None, + log_scale=False, copy=True, out=None): + """Compute the histogram of a set of data. + + :param img: input image. If None, use the one already on the device + :param nbins: number of bins + :param range: the lower and upper range of the bins. If not provided, + range is simply ``(a.min(), a.max())``. 
Values outside the + range are ignored. The first element of the range must be + less than or equal to the second. + :param log_scale: perform the binning in lograrithmic scale. + Open to extension + :param copy: unset to directly use the input buffer without copy + :param out: use a provided array for offering the result + :return: histogram (size=nbins), edges (size=nbins+1) + API similar to numpy + """ + assert img.shape == self.buffer_shape + + input_array = self.to_float(img, copy=copy, out=self.cl_mem["image0_d"]) + events = [] + with self.sem: + input_array, output_array = self._get_in_out_buffers(input_array, copy=False, + out=out, + out_dtype=numpy.int32, + out_size=nbins) + + if range is None: + # measure actually the bounds + size = numpy.int32(numpy.prod(self.shape)) + if self.wg_red == 1: + # Probably on MacOS CPU WG==1 --> serial code. + kernel = self.kernels.get_kernel("max_min_serial") + + evt = kernel(self.queue, (1,), (1,), + input_array.data, + size, + self.cl_mem["max_min_d"].data) + events.append(EventDescription("max_min_serial", evt)) + else: + stage1 = self.kernels.max_min_reduction_stage1 + stage2 = self.kernels.max_min_reduction_stage2 + local_mem = pyopencl.LocalMemory(int(self.wg_red * 2 * numpy.dtype("float32").itemsize)) + k1 = stage1(self.queue, (int(self.wg_red ** 2),), (int(self.wg_red),), + input_array.data, + self.cl_mem["tmp_max_min_d"].data, + size, + local_mem) + k2 = stage2(self.queue, (int(self.wg_red),), (int(self.wg_red),), + self.cl_mem["tmp_max_min_d"].data, + self.cl_mem["max_min_d"].data, + local_mem) + + events += [EventDescription("max_min_stage1", k1), + EventDescription("max_min_stage2", k2)] + maxi, mini = self.cl_mem["max_min_d"].get() + else: + mini = numpy.float32(min(range)) + maxi = numpy.float32(max(range)) + device = self.ctx.devices[0] + nb_engines = device.max_compute_units + tmp_size = nb_engines * nbins + name = "tmp_int32_%s_d" % (tmp_size) + if name not in self.cl_mem: + tmp_array = self.cl_mem[name] = 
pyopencl.array.empty(self.queue, (tmp_size,), numpy.int32) + else: + tmp_array = self.cl_mem[name] + + edge_name = "tmp_float32_%s_d" % (nbins + 1) + if edge_name not in self.cl_mem: + edges_array = self.cl_mem[edge_name] = pyopencl.array.empty(self.queue, (nbins + 1,), numpy.float32) + else: + edges_array = self.cl_mem[edge_name] + + shared = pyopencl.LocalMemory(numpy.dtype(numpy.int32).itemsize * nbins) + + # Handle log-scale + if log_scale: + map_operation = numpy.int32(1) + else: + map_operation = numpy.int32(0) + kernel = self.kernels.get_kernel("histogram") + wg = min(device.max_work_group_size, + 1 << (int(ceil(log(nbins, 2)))), + self.kernels.max_workgroup_size(kernel)) + evt = kernel(self.queue, (wg * nb_engines,), (wg,), + input_array.data, + numpy.int32(input_array.size), + mini, + maxi, + map_operation, + output_array.data, + edges_array.data, + numpy.int32(nbins), + tmp_array.data, + self.cl_mem["cnt_d"].data, + shared) + events.append(EventDescription("histogram", evt)) + + if self.profile: + self.events += events + + if out is None: + res = output_array.get() + return res, edges_array.get() + else: + output_array.finish() + return output_array, edges_array diff --git a/src/silx/opencl/linalg.py b/src/silx/opencl/linalg.py new file mode 100644 index 0000000..a64122a --- /dev/null +++ b/src/silx/opencl/linalg.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python +# coding: utf-8 +# /*########################################################################## +# +# Copyright (c) 2016 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following 
conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# ###########################################################################*/ +"""Module for basic linear algebra in OpenCL""" + +from __future__ import absolute_import, print_function, with_statement, division + +__authors__ = ["P. Paleo"] +__license__ = "MIT" +__date__ = "01/08/2019" + +import numpy as np + +from .common import pyopencl +from .processing import EventDescription, OpenclProcessing + +import pyopencl.array as parray +cl = pyopencl + + +class LinAlg(OpenclProcessing): + + kernel_files = ["linalg.cl"] + + def __init__(self, shape, do_checks=False, ctx=None, devicetype="all", platformid=None, deviceid=None, profile=False): + """ + Create a "Linear Algebra" plan for a given image shape. + + :param shape: shape of the image (num_rows, num_columns) + :param do_checks (optional): if True, memory and data type checks are performed when possible. 
+ :param ctx: actual working context, left to None for automatic + initialization from device type or platformid/deviceid + :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL" + :param platformid: integer with the platform_identifier, as given by clinfo + :param deviceid: Integer with the device identifier, as given by clinfo + :param profile: switch on profiling to be able to profile at the kernel level, + store profiling elements (makes code slightly slower) + + """ + OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype, + platformid=platformid, deviceid=deviceid, + profile=profile) + + self.d_gradient = parray.empty(self.queue, shape, np.complex64) + self.d_gradient.fill(np.complex64(0.0)) + self.d_image = parray.empty(self.queue, shape, np.float32) + self.d_image.fill(np.float32(0.0)) + self.add_to_cl_mem({ + "d_gradient": self.d_gradient, + "d_image": self.d_image + }) + + self.wg2D = None + self.shape = shape + self.ndrange2D = ( + int(self.shape[1]), + int(self.shape[0]) + ) + self.do_checks = bool(do_checks) + OpenclProcessing.compile_kernels(self, self.kernel_files) + + @staticmethod + def check_array(array, dtype, shape, arg_name): + if array.shape != shape or array.dtype != dtype: + raise ValueError("%s should be a %s array of type %s" %(arg_name, str(shape), str(dtype))) + + def get_data_references(self, src, dst, default_src_ref, default_dst_ref): + """ + From various types of src and dst arrays, + returns the references to the underlying data (Buffer) that will be used by the OpenCL kernels. + # TODO documentation + + This function will make a copy host->device if the input is on host (eg. 
numpy array) + """ + if dst is not None: + if isinstance(dst, cl.array.Array): + dst_ref = dst.data + elif isinstance(dst, cl.Buffer): + dst_ref = dst + else: + raise ValueError("dst should be either pyopencl.array.Array or pyopencl.Buffer") + else: + dst_ref = default_dst_ref + + if isinstance(src, cl.array.Array): + src_ref = src.data + elif isinstance(src, cl.Buffer): + src_ref = src + else: # assuming numpy.ndarray + evt = cl.enqueue_copy(self.queue, default_src_ref, src) + self.events.append(EventDescription("copy H->D", evt)) + src_ref = default_src_ref + return src_ref, dst_ref + + def gradient(self, image, dst=None, return_to_host=False): + """ + Compute the spatial gradient of an image. + The gradient is computed with first-order difference (not central difference). + + :param image: image to compute the gradient from. It can be either a numpy.ndarray, a pyopencl Array or Buffer. + :param dst: optional, reference to a destination pyopencl Array or Buffer. It must be of complex64 data type. + :param return_to_host: optional, set to True if you want the result to be transferred back to host. + + if dst is provided, it should be of type numpy.complex64 ! + """ + n_y, n_x = np.int32(self.shape) + if self.do_checks: + self.check_array(image, np.float32, self.shape, "image") + if dst is not None: + self.check_array(dst, np.complex64, self.shape, "dst") + img_ref, grad_ref = self.get_data_references(image, dst, self.d_image.data, self.d_gradient.data) + + # Prepare the kernel call + kernel_args = [ + img_ref, + grad_ref, + n_x, + n_y + ] + # Call the gradient kernel + evt = self.kernels.kern_gradient2D( + self.queue, + self.ndrange2D, + self.wg2D, + *kernel_args + ) + self.events.append(EventDescription("gradient2D", evt)) + # TODO: should the wait be done in any case ? 
+ # In the case where dst=None, the wait() is mandatory since a user will be doing arithmetic on dst afterwards + if dst is None: + evt.wait() + + if return_to_host: + if dst is not None: + res_tmp = self.d_gradient.get() + else: + res_tmp = np.zeros(self.shape, dtype=np.complex64) + cl.enqueue_copy(self.queue, res_tmp, grad_ref) + res = np.zeros((2,) + self.shape, dtype=np.float32) + res[0] = np.copy(res_tmp.real) + res[1] = np.copy(res_tmp.imag) + return res + else: + return dst + + def divergence(self, gradient, dst=None, return_to_host=False): + """ + Compute the spatial divergence of an image. + The divergence is designed to be the (negative) adjoint of the gradient. + + :param gradient: gradient-like array to compute the divergence from. It can be either a numpy.ndarray, a pyopencl Array or Buffer. + :param dst: optional, reference to a destination pyopencl Array or Buffer. It must be of complex64 data type. + :param return_to_host: optional, set to True if you want the result to be transferred back to host. + + if dst is provided, it should be of type numpy.complex64 ! + """ + n_y, n_x = np.int32(self.shape) + # numpy.ndarray gradients are expected to be (2, n_y, n_x) + if isinstance(gradient, np.ndarray): + gradient2 = np.zeros(self.shape, dtype=np.complex64) + gradient2.real = np.copy(gradient[0]) + gradient2.imag = np.copy(gradient[1]) + gradient = gradient2 + elif self.do_checks: + self.check_array(gradient, np.complex64, self.shape, "gradient") + if dst is not None: + self.check_array(dst, np.float32, self.shape, "dst") + grad_ref, img_ref = self.get_data_references(gradient, dst, self.d_gradient.data, self.d_image.data) + + # Prepare the kernel call + kernel_args = [ + grad_ref, + img_ref, + n_x, + n_y + ] + # Call the gradient kernel + evt = self.kernels.kern_divergence2D( + self.queue, + self.ndrange2D, + self.wg2D, + *kernel_args + ) + self.events.append(EventDescription("divergence2D", evt)) + # TODO: should the wait be done in any case ? 
+ # In the case where dst=None, the wait() is mandatory since a user will be doing arithmetic on dst afterwards + if dst is None: + evt.wait() + + if return_to_host: + if dst is not None: + res = self.d_image.get() + else: + res = np.zeros(self.shape, dtype=np.float32) + cl.enqueue_copy(self.queue, res, img_ref) + return res + else: + return dst diff --git a/src/silx/opencl/medfilt.py b/src/silx/opencl/medfilt.py new file mode 100644 index 0000000..d4e425b --- /dev/null +++ b/src/silx/opencl/medfilt.py @@ -0,0 +1,269 @@ +# -*- coding: utf-8 -*- +# +# Project: Azimuthal integration +# https://github.com/silx-kit/pyFAI +# +# Copyright (C) 2012-2017 European Synchrotron Radiation Facility, Grenoble, France +# +# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# . +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# . +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +"""A module for performing the 1d, 2d and 3d median filter ... 
+ +The target is to mimic the signature of scipy.signal.medfilt and scipy.medfilt2 + +The first implementation targets 2D implementation where this operation is costly (~10s/2kx2k image) +""" +from __future__ import absolute_import, print_function, with_statement, division + + +__author__ = "Jerome Kieffer" +__license__ = "MIT" +__date__ = "12/09/2017" +__copyright__ = "2012-2017, ESRF, Grenoble" +__contact__ = "jerome.kieffer@esrf.fr" + +import logging +import numpy +from collections import OrderedDict + +from .common import pyopencl, kernel_workgroup_size +from .processing import EventDescription, OpenclProcessing, BufferDescription + +if pyopencl: + mf = pyopencl.mem_flags +else: + raise ImportError("pyopencl is not installed") +logger = logging.getLogger(__name__) + + +class MedianFilter2D(OpenclProcessing): + """A class for doing median filtering using OpenCL""" + buffers = [ + BufferDescription("result", 1, numpy.float32, mf.WRITE_ONLY), + BufferDescription("image_raw", 1, numpy.float32, mf.READ_ONLY), + BufferDescription("image", 1, numpy.float32, mf.READ_WRITE), + ] + kernel_files = ["preprocess.cl", "bitonic.cl", "medfilt.cl"] + mapping = {numpy.int8: "s8_to_float", + numpy.uint8: "u8_to_float", + numpy.int16: "s16_to_float", + numpy.uint16: "u16_to_float", + numpy.uint32: "u32_to_float", + numpy.int32: "s32_to_float"} + + def __init__(self, shape, kernel_size=(3, 3), + ctx=None, devicetype="all", platformid=None, deviceid=None, + block_size=None, profile=False + ): + """Constructor of the OpenCL 2D median filtering class + + :param shape: shape of the images to treat + :param kernel size: 2-tuple of odd values + :param ctx: actual working context, left to None for automatic + initialization from device type or platformid/deviceid + :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL" + :param platformid: integer with the platform_identifier, as given by clinfo + :param deviceid: Integer with the device identifier, as given by clinfo + 
:param block_size: preferred workgroup size, may vary depending on the outpcome of the compilation + :param profile: switch on profiling to be able to profile at the kernel level, + store profiling elements (makes code slightly slower) + """ + OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype, + platformid=platformid, deviceid=deviceid, + block_size=block_size, profile=profile) + self.shape = shape + self.size = self.shape[0] * self.shape[1] + self.kernel_size = self.calc_kernel_size(kernel_size) + self.workgroup_size = (self.calc_wg(self.kernel_size), 1) # 3D kernel + self.buffers = [BufferDescription(i.name, i.size * self.size, i.dtype, i.flags) + for i in self.__class__.buffers] + + self.allocate_buffers() + self.local_mem = self._get_local_mem(self.workgroup_size[0]) + OpenclProcessing.compile_kernels(self, self.kernel_files, "-D NIMAGE=%i" % self.size) + self.set_kernel_arguments() + + def set_kernel_arguments(self): + """Parametrize all kernel arguments + """ + for val in self.mapping.values(): + self.cl_kernel_args[val] = OrderedDict(((i, self.cl_mem[i]) for i in ("image_raw", "image"))) + self.cl_kernel_args["medfilt2d"] = OrderedDict((("image", self.cl_mem["image"]), + ("result", self.cl_mem["result"]), + ("local", self.local_mem), + ("khs1", numpy.int32(self.kernel_size[0] // 2)), # Kernel half-size along dim1 (lines) + ("khs2", numpy.int32(self.kernel_size[1] // 2)), # Kernel half-size along dim2 (columns) + ("height", numpy.int32(self.shape[0])), # Image size along dim1 (lines) + ("width", numpy.int32(self.shape[1])))) +# ('debug', self.cl_mem["debug"]))) # Image size along dim2 (columns)) + + def _get_local_mem(self, wg): + return pyopencl.LocalMemory(wg * 32) # 4byte per float, 8 element per thread + + def send_buffer(self, data, dest): + """Send a numpy array to the device, including the cast on the device if possible + + :param data: numpy array with data + :param dest: name of the buffer as registered in the class + """ + + dest_type = 
numpy.dtype([i.dtype for i in self.buffers if i.name == dest][0]) + events = [] + if (data.dtype == dest_type) or (data.dtype.itemsize > dest_type.itemsize): + copy_image = pyopencl.enqueue_copy(self.queue, self.cl_mem[dest], numpy.ascontiguousarray(data, dest_type)) + events.append(EventDescription("copy H->D %s" % dest, copy_image)) + else: + copy_image = pyopencl.enqueue_copy(self.queue, self.cl_mem["image_raw"], numpy.ascontiguousarray(data)) + kernel = getattr(self.program, self.mapping[data.dtype.type]) + cast_to_float = kernel(self.queue, (self.size,), None, self.cl_mem["image_raw"], self.cl_mem[dest]) + events += [EventDescription("copy H->D %s" % dest, copy_image), EventDescription("cast to float", cast_to_float)] + if self.profile: + self.events += events + + def calc_wg(self, kernel_size): + """calculate and return the optimal workgroup size for the first dimension, taking into account + the 8-height band + + :param kernel_size: 2-tuple of int, shape of the median window + :return: optimal workgroup size + """ + needed_threads = ((kernel_size[0] + 7) // 8) * kernel_size[1] + if needed_threads < 8: + wg = 8 + elif needed_threads < 32: + wg = 32 + else: + wg = 1 << (int(needed_threads).bit_length()) + return wg + + def medfilt2d(self, image, kernel_size=None): + """Actually apply the median filtering on the image + + :param image: numpy array with the image + :param kernel_size: 2-tuple if + :return: median-filtered 2D image + + + Nota: for window size 1x1 -> 7x7 up to 49 / 64 elements in 8 threads, 8elt/th + 9x9 -> 15x15 up to 225 / 256 elements in 32 threads, 8elt/th + 17x17 -> 21x21 up to 441 / 512 elements in 64 threads, 8elt/th + + TODO: change window size on the fly, + + + """ + events = [] + if kernel_size is None: + kernel_size = self.kernel_size + else: + kernel_size = self.calc_kernel_size(kernel_size) + kernel_half_size = kernel_size // numpy.int32(2) + # this is the workgroup size + wg = self.calc_wg(kernel_size) + + # check for valid work 
group size: + amws = kernel_workgroup_size(self.program, "medfilt2d") + logger.warning("max actual workgroup size: %s, expected: %s", amws, wg) + if wg > amws: + raise RuntimeError("Workgroup size is too big for medfilt2d: %s>%s" % (wg, amws)) + + localmem = self._get_local_mem(wg) + + assert image.ndim == 2, "Treat only 2D images" + assert image.shape[0] <= self.shape[0], "height is OK" + assert image.shape[1] <= self.shape[1], "width is OK" + + with self.sem: + self.send_buffer(image, "image") + + kwargs = self.cl_kernel_args["medfilt2d"] + kwargs["local"] = localmem + kwargs["khs1"] = kernel_half_size[0] + kwargs["khs2"] = kernel_half_size[1] + kwargs["height"] = numpy.int32(image.shape[0]) + kwargs["width"] = numpy.int32(image.shape[1]) +# for k, v in kwargs.items(): +# print("%s: %s (%s)" % (k, v, type(v))) + mf2d = self.kernels.medfilt2d(self.queue, + (wg, image.shape[1]), + (wg, 1), *list(kwargs.values())) + events.append(EventDescription("median filter 2d", mf2d)) + + result = numpy.empty(image.shape, numpy.float32) + ev = pyopencl.enqueue_copy(self.queue, result, self.cl_mem["result"]) + events.append(EventDescription("copy D->H result", ev)) + ev.wait() + if self.profile: + self.events += events + return result + __call__ = medfilt2d + + @staticmethod + def calc_kernel_size(kernel_size): + """format the kernel size to be a 2-length numpy array of int32 + """ + kernel_size = numpy.asarray(kernel_size, dtype=numpy.int32) + if kernel_size.shape == (): + kernel_size = numpy.repeat(kernel_size.item(), 2).astype(numpy.int32) + for size in kernel_size: + if (size % 2) != 1: + raise ValueError("Each element of kernel_size should be odd.") + return kernel_size + + +class _MedFilt2d(object): + median_filter = None + + @classmethod + def medfilt2d(cls, ary, kernel_size=3): + """Median filter a 2-dimensional array. + + Apply a median filter to the `input` array using a local window-size + given by `kernel_size` (must be odd). 
+ + :param ary: A 2-dimensional input array. + :param kernel_size: A scalar or a list of length 2, giving the size of the + median filter window in each dimension. Elements of + `kernel_size` should be odd. If `kernel_size` is a scalar, + then this scalar is used as the size in each dimension. + Default is a kernel of size (3, 3). + :return: An array the same size as input containing the median filtered + result. always work on float32 values + + About the padding: + + * The filling mode in scipy.signal.medfilt2d is zero-padding + * This implementation is equivalent to: + scipy.ndimage.filters.median_filter(ary, kernel_size, mode="nearest") + + """ + image = numpy.atleast_2d(ary) + shape = numpy.array(image.shape) + if cls.median_filter is None: + cls.median_filter = MedianFilter2D(image.shape, kernel_size) + elif (numpy.array(cls.median_filter.shape) < shape).any(): + # enlarger the buffer size + new_shape = numpy.maximum(numpy.array(cls.median_filter.shape), shape) + ctx = cls.median_filter.ctx + cls.median_filter = MedianFilter2D(new_shape, kernel_size, ctx=ctx) + return cls.median_filter.medfilt2d(image, kernel_size=kernel_size) + +medfilt2d = _MedFilt2d.medfilt2d diff --git a/src/silx/opencl/processing.py b/src/silx/opencl/processing.py new file mode 100644 index 0000000..8b81f7f --- /dev/null +++ b/src/silx/opencl/processing.py @@ -0,0 +1,447 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Project: S I L X project +# https://github.com/silx-kit/silx +# +# Copyright (C) 2012-2018 European Synchrotron Radiation Facility, Grenoble, France +# +# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu) +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to 
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#

"""
Common OpenCL abstract base classes for different processing
"""

__author__ = "Jerome Kieffer"
__contact__ = "Jerome.Kieffer@ESRF.eu"
__license__ = "MIT"
__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
__date__ = "02/03/2021"
__status__ = "stable"

import sys
import os
import logging
import gc
from collections import namedtuple, OrderedDict
import numpy
import threading
from .common import ocl, pyopencl, release_cl_buffers, query_kernel_info, allocate_texture, check_textures_availability
from .utils import concatenate_cl_kernel
import platform

# name/size/dtype/flags of a device buffer to allocate
BufferDescription = namedtuple("BufferDescription", ["name", "size", "dtype", "flags"])
# (label, pyopencl event) pair kept for profiling
EventDescription = namedtuple("EventDescription", ["name", "event"])

logger = logging.getLogger(__name__)


class KernelContainer(object):
    """This object holds a copy of all kernels of a program, accessible as attributes"""

    def __init__(self, program):
        """Constructor of the class

        :param program: the OpenCL program as generated by PyOpenCL
        """
        self._program = program
        for kernel in program.all_kernels():
            self.__setattr__(kernel.function_name, kernel)

    def get_kernels(self):
        "return the dictionary with all kernels"
        # underscore-prefixed attributes (like _program) are not kernels
        return dict(item for item in self.__dict__.items()
                    if not item[0].startswith("_"))

    def get_kernel(self, name):
        "get a kernel from its name"
        logger.debug("KernelContainer.get_kernel(%s)", name)
        return self.__dict__.get(name)

    def max_workgroup_size(self, kernel_name):
        "Retrieve the compile time WORK_GROUP_SIZE for a given kernel"
        if isinstance(kernel_name, pyopencl.Kernel):
            kernel = kernel_name
        else:
            kernel = self.get_kernel(kernel_name)
        return query_kernel_info(self._program, kernel, "WORK_GROUP_SIZE")

    def min_workgroup_size(self, kernel_name):
        "Retrieve the compile time PREFERRED_WORK_GROUP_SIZE_MULTIPLE for a given kernel"
        if isinstance(kernel_name, pyopencl.Kernel):
            kernel = kernel_name
        else:
            kernel = self.get_kernel(kernel_name)
        return query_kernel_info(self._program, kernel, "PREFERRED_WORK_GROUP_SIZE_MULTIPLE")


class OpenclProcessing(object):
    """Abstract class for different types of OpenCL processing.

    This class provides:
    * Generation of the context, queues, profiling mode
    * Additional function to allocate/free all buffers declared as static attributes of the class
    * Functions to compile kernels, cache them and clean them
    * helper functions to clone the object
    """
    # Example of how to create an output buffer of 10 floats
    buffers = [BufferDescription("output", 10, numpy.float32, None),
               ]
    # list of kernel source files to be concatenated before compilation of the program
    kernel_files = []

    def __init__(self, ctx=None, devicetype="all", platformid=None, deviceid=None,
                 block_size=None, memory=None, profile=False):
        """Constructor of the abstract OpenCL processing class

        :param ctx: actual working context, left to None for automatic
                    initialization from device type or platformid/deviceid
        :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL"
        :param platformid: integer with the platform_identifier, as given by clinfo
        :param deviceid: Integer with the device identifier, as given by clinfo
        :param block_size: preferred workgroup size, may vary depending on the
                           outcome of the compilation
        :param memory: minimum memory available on device
        :param profile: switch on profiling to be able to profile at the kernel
                        level, store profiling elements (makes code slightly slower)
        """
        self.sem = threading.Semaphore()
        self._X87_VOLATILE = None
        self.profile = None  # actual value set by set_profiling() below
        self.events = []  # list of EventDescription, kept for profiling
        self.cl_mem = {}  # dict with all buffers allocated
        self.cl_program = None  # The actual OpenCL program
        self.cl_kernel_args = {}  # dict with all kernel arguments
        self.queue = None
        if ctx:
            self.ctx = ctx
        else:
            self.ctx = ocl.create_context(devicetype=devicetype,
                                          platformid=platformid, deviceid=deviceid,
                                          memory=memory)
        device_name = self.ctx.devices[0].name.strip()
        platform_name = self.ctx.devices[0].platform.name.strip()
        # local renamed from `platform` to avoid shadowing the platform module
        ocl_platform = ocl.get_platform(platform_name)
        self.device = ocl_platform.get_device(device_name)

        self.set_profiling(profile)
        self.block_size = block_size
        self.program = None
        self.kernels = None

    def check_textures_availability(self):
        "Return True if the device associated to the context supports textures"
        return check_textures_availability(self.ctx)

    def __del__(self):
        """Destructor: release all buffers and programs
        """
        try:
            self.reset_log()
            self.free_kernels()
            self.free_buffers()
            if self.queue is not None:
                self.queue.finish()
        except Exception as err:
            # interpreter may be shutting down: log, never raise from __del__
            logger.warning("%s: %s", type(err), err)
        self.queue = None
        self.device = None
        self.ctx = None
        gc.collect()

    def allocate_buffers(self, buffers=None, use_array=False):
        """
        Allocate OpenCL buffers required for a specific configuration

        :param buffers: a list of BufferDescriptions, leave to None for
                        parametrized buffers.
        :param use_array: allocate memory as pyopencl.array.Array
                          instead of pyopencl.Buffer

        Note that an OpenCL context also requires some memory, as well
        as Event and other OpenCL functionalities which cannot and are
        not taken into account here. The memory required by a context
        varies depending on the device. Typical for GTX580 is 65Mb but
        for a 9300m is ~15Mb In addition, a GPU will always have at
        least 3-5Mb of memory in use. Unfortunately, OpenCL does NOT
        have a built-in way to check the actual free memory on a
        device, only the total memory.
        """
        if buffers is None:
            buffers = self.buffers

        with self.sem:
            mem = {}

            # check if enough memory is available on the device
            ualloc = 0
            for buf in buffers:
                ualloc += numpy.dtype(buf.dtype).itemsize * numpy.prod(buf.size)
            logger.info("%.3fMB are needed on device: %s, which has %.3fMB",
                        ualloc / 1.0e6, self.device, self.device.memory / 1.0e6)

            if ualloc >= self.device.memory:
                raise MemoryError("Fatal error in allocate_buffers. Not enough "
                                  " device memory for buffers (%lu requested, %lu available)"
                                  % (ualloc, self.device.memory))

            # do the allocation
            try:
                if use_array:
                    for buf in buffers:
                        mem[buf.name] = pyopencl.array.empty(self.queue, buf.size, buf.dtype)
                else:
                    for buf in buffers:
                        size = numpy.dtype(buf.dtype).itemsize * numpy.prod(buf.size)
                        mem[buf.name] = pyopencl.Buffer(self.ctx, buf.flags, int(size))
            except pyopencl.MemoryError as error:
                # release the partially-allocated set before propagating
                release_cl_buffers(mem)
                raise MemoryError(error)

        self.cl_mem.update(mem)

    def add_to_cl_mem(self, parrays):
        """
        Add pyopencl.array, which are allocated by pyopencl, to self.cl_mem.
        This should be used before calling allocate_buffers().

        :param parrays: a dictionary of `pyopencl.array.Array` or `pyopencl.Buffer`
        """
        self.cl_mem.update(parrays)

    def check_workgroup_size(self, kernel_name):
        "Calculate the maximum workgroup size from given kernel after compilation"
        return self.kernels.max_workgroup_size(kernel_name)

    def free_buffers(self):
        """free all device memory allocated on the device
        """
        with self.sem:
            for key, buf in list(self.cl_mem.items()):
                if buf is not None:
                    if isinstance(buf, pyopencl.array.Array):
                        try:
                            buf.data.release()
                        except pyopencl.LogicError:
                            logger.error("Error while freeing buffer %s", key)
                    else:
                        try:
                            buf.release()
                        except pyopencl.LogicError:
                            logger.error("Error while freeing buffer %s", key)
                    self.cl_mem[key] = None

    def compile_kernels(self, kernel_files=None, compile_options=None):
        """Call the OpenCL compiler

        :param kernel_files: list of path to the kernel
                             (by default use the one declared in the class)
        :param compile_options: string of compile options
        """
        # concatenate all needed source files into a single openCL module
        kernel_files = kernel_files or self.kernel_files
        kernel_src = concatenate_cl_kernel(kernel_files)

        compile_options = compile_options or self.get_compiler_options()
        logger.info("Compiling file %s with options %s", kernel_files, compile_options)
        try:
            self.program = pyopencl.Program(self.ctx, kernel_src).build(options=compile_options)
        except (pyopencl.MemoryError, pyopencl.LogicError) as error:
            raise MemoryError(error)
        else:
            self.kernels = KernelContainer(self.program)

    def free_kernels(self):
        """Free all kernels
        """
        for kernel in self.cl_kernel_args:
            self.cl_kernel_args[kernel] = []
        self.kernels = None
        self.program = None

    def set_profiling(self, value=True):
        """Switch On/Off the profiling flag of the command queue to allow debugging

        :param value: set to True to enable profiling, or to False to disable it.
                      Without profiling, the processing is marginally faster

        Profiling information can then be retrieved with the 'log_profile' method
        """
        if bool(value) != self.profile:
            with self.sem:
                self.profile = bool(value)
                if self.queue is not None:
                    # finish the pending work before replacing the queue
                    self.queue.finish()
                if self.profile:
                    self.queue = pyopencl.CommandQueue(self.ctx,
                                                       properties=pyopencl.command_queue_properties.PROFILING_ENABLE)
                else:
                    self.queue = pyopencl.CommandQueue(self.ctx)

    def profile_add(self, event, desc):
        """
        Add an OpenCL event to the events lists, if profiling is enabled.

        :param event: pyopencl event to be recorded
        :param desc: event description (label shown by log_profile)
        """
        if self.profile:
            self.events.append(EventDescription(desc, event))

    def allocate_texture(self, shape, hostbuf=None, support_1D=False):
        "Allocate an OpenCL image ('texture') of the given shape on self.ctx"
        return allocate_texture(self.ctx, shape, hostbuf=hostbuf, support_1D=support_1D)

    def transfer_to_texture(self, arr, tex_ref):
        """
        Transfer an array to a texture.

        :param arr: Input array. Can be a numpy array or a pyopencl array.
        :param tex_ref: texture reference (pyopencl._cl.Image).
        """
        copy_args = [self.queue, tex_ref, arr]
        shp = arr.shape
        if arr.ndim == 1:
            # pyopencl and OpenCL < 1.2 do not support image1d_t:
            # force a 2D image with one row in this case
            shp = (1,) + shp
        # BUGFIX: origin must have the same dimensionality as region; the
        # previous code used the pre-promotion ndim, yielding a 1-element
        # origin with a 2-element region for 1D inputs.
        copy_kwargs = {"origin": (0,) * len(shp), "region": shp[::-1]}
        if not isinstance(arr, numpy.ndarray):  # assuming pyopencl.array.Array
            # D->D copy
            copy_args[2] = arr.data
            copy_kwargs["offset"] = 0
        ev = pyopencl.enqueue_copy(*copy_args, **copy_kwargs)
        self.profile_add(ev, "Transfer to texture")

    def log_profile(self, stats=False):
        """If we are in profiling mode, prints out all timing for every single OpenCL call

        :param stats: if True, prints the statistics on each kernel instead of all execution timings
        :return: list of lines to print
        """
        total_time = 0.0
        out = [""]
        if stats:
            stats = OrderedDict()
            out.append(f"OpenCL kernel profiling statistics in milliseconds for: {self.__class__.__name__}")
            out.append(f"{'Kernel name':>50} (count): min   median   max     mean     std")
        else:
            stats = None
            out.append(f"Profiling info for OpenCL: {self.__class__.__name__}")

        if self.profile:
            for e in self.events:
                if "__len__" in dir(e) and len(e) >= 2:
                    name = e[0]
                    pr = e[1].profile
                    t0 = pr.start
                    t1 = pr.end
                    et = 1e-6 * (t1 - t0)  # ns -> ms
                    total_time += et
                    if stats is None:
                        out.append(f"{name:>50} : {et:.3f}ms")
                    else:
                        if name in stats:
                            stats[name].append(et)
                        else:
                            stats[name] = [et]
        if stats is not None:
            for k, v in stats.items():
                n = numpy.array(v)
                out.append(f"{k:>50} ({len(v):5}): {n.min():8.3f} {numpy.median(n):8.3f} {n.max():8.3f} {n.mean():8.3f} {n.std():8.3f}")
        out.append("_" * 80)
        out.append(f"{'Total OpenCL execution time':>50} : {total_time:.3f}ms")

        logger.info(os.linesep.join(out))
        return out

    def reset_log(self):
        """
        Resets the profiling timers
        """
        with self.sem:
            self.events = []

    @property
    def x87_volatile_option(self):
        """Extra compiler flag needed when running 32-bit OpenCL with PoCL
        (x87 FPU), where Kahan summation requires `volatile`. Lazily computed
        and cached in self._X87_VOLATILE; empty string when not applicable."""
        if self._X87_VOLATILE is None:
            if (platform.machine() in ("i386", "i686", "x86_64", "AMD64") and
                    (tuple.__itemsize__ == 4) and
                    self.ctx.devices[0].platform.name == 'Portable Computing Language'):
                self._X87_VOLATILE = "-DX87_VOLATILE=volatile"
            else:
                self._X87_VOLATILE = ""
        return self._X87_VOLATILE

    def get_compiler_options(self, x87_volatile=False):
        """Provide the default OpenCL compiler options

        :param x87_volatile: needed for Kahan summation
        :return: string with compiler option
        """
        option_list = []
        if x87_volatile:
            option_list.append(self.x87_volatile_option)
        return " ".join(i for i in option_list if i)

# This should be implemented by concrete class
# def __copy__(self):
#     """Shallow copy of the object
#
#     :return: copy of the object
#     """
#     return self.__class__((self._data, self._indices, self._indptr),
#                           self.size, block_size=self.BLOCK_SIZE,
#                           platformid=self.platform.id,
#                           deviceid=self.device.id,
#                           checksum=self.on_device.get("data"),
#                           profile=self.profile, empty=self.empty)
#
# def __deepcopy__(self, memo=None):
#     """deep copy of the object
#
#     :return: deepcopy of the object
#     """
#     if memo is None:
#         memo = {}
#     new_csr = self._data.copy(), self._indices.copy(), self._indptr.copy()
#     memo[id(self._data)] = new_csr[0]
#     memo[id(self._indices)] = new_csr[1]
#     memo[id(self._indptr)] = new_csr[2]
#     new_obj = self.__class__(new_csr, self.size,
#                              block_size=self.BLOCK_SIZE,
#                              platformid=self.platform.id,
#                              deviceid=self.device.id,
#                              checksum=self.on_device.get("data"),
#                              profile=self.profile, empty=self.empty)
#     memo[id(self)] = new_obj
#     return new_obj
European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# ###########################################################################*/ +"""Module for tomographic projector on the GPU""" + +from __future__ import absolute_import, print_function, with_statement, division + +__authors__ = ["A. Mirone, P. 
class Projection(OpenclProcessing):
    """
    A class for performing a tomographic projection (Radon Transform) using
    OpenCL
    """
    kernel_files = ["proj.cl", "array_utils.cl"]
    # Emitted at import time: this class is experimental
    logger.warning("Forward projector is untested and unsupported for now")

    def __init__(self, slice_shape, angles, axis_position=None,
                 detector_width=None, normalize=False, ctx=None,
                 devicetype="all", platformid=None, deviceid=None,
                 profile=False):
        """Constructor of the OpenCL projector.

        :param slice_shape: shape of the slice: (num_rows, num_columns).
        :param angles: Either an integer number of angles, or a list of custom
                       angles values in radian.
        :param axis_position: Optional, axis position. Default is
                              `(shape[1]-1)/2.0`.
        :param detector_width: Optional, detector width in pixels.
                               If detector_width > slice_shape[1], the
                               projection data will be surrounded with zeros.
                               Using detector_width < slice_shape[1] might
                               result in a local tomography setup.
        :param normalize: Optional, normalization. If set, the sinograms are
                          multiplied by the factor pi/(2*nprojs).
        :param ctx: actual working context, left to None for automatic
                    initialization from device type or platformid/deviceid
        :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL"
        :param platformid: integer with the platform_identifier, as given by
                           clinfo
        :param deviceid: Integer with the device identifier, as given by clinfo
        :param profile: switch on profiling to be able to profile at the kernel
                        level, store profiling elements (makes code slightly
                        slower)
        """
        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
                                  platformid=platformid, deviceid=deviceid,
                                  profile=profile)
        self.shape = slice_shape
        self.axis_pos = axis_position
        self.angles = angles
        self.dwidth = detector_width
        self.normalize = normalize

        # Default values
        if self.axis_pos is None:
            self.axis_pos = (self.shape[1] - 1) / 2.
        if self.dwidth is None:
            self.dwidth = self.shape[1]
        if not np.iterable(self.angles):
            # integer (or None) -> generate evenly spaced angles on [0, pi)
            if self.angles is None:
                self.nprojs = self.shape[0]
            else:
                self.nprojs = self.angles
            self.angles = np.linspace(start=0,
                                      stop=np.pi,
                                      num=self.nprojs,
                                      endpoint=False).astype(dtype=np.float32)
        else:
            self.nprojs = len(self.angles)
        self.offset_x = -np.float32((self.shape[1] - 1) / 2. - self.axis_pos)  # TODO: custom
        self.offset_y = -np.float32((self.shape[0] - 1) / 2. - self.axis_pos)  # TODO: custom
        # Reset axis_pos once offsets are computed
        self.axis_pos0 = np.float64((self.shape[1] - 1) / 2.)

        # Workgroup, ndrange and shared size
        self.dimgrid_x = _idivup(self.dwidth, 16)
        self.dimgrid_y = _idivup(self.nprojs, 16)
        self._dimrecx = np.int32(self.dimgrid_x * 16)
        self._dimrecy = np.int32(self.dimgrid_y * 16)
        self.local_mem = 16 * 7 * _sizeof(np.float32)
        self.wg = (16, 16)
        self.ndrange = (
            int(self.dimgrid_x) * self.wg[0],  # int(): pyopencl <= 2015.1
            int(self.dimgrid_y) * self.wg[1]  # int(): pyopencl <= 2015.1
        )

        self._use_textures = self.check_textures_availability()

        # Allocate memory
        self.buffers = [
            BufferDescription("_d_sino", self._dimrecx * self._dimrecy, np.float32, mf.READ_WRITE),
            BufferDescription("d_angles", self._dimrecy, np.float32, mf.READ_ONLY),
            BufferDescription("d_beginPos", self._dimrecy * 2, np.int32, mf.READ_ONLY),
            BufferDescription("d_strideJoseph", self._dimrecy * 2, np.int32, mf.READ_ONLY),
            BufferDescription("d_strideLine", self._dimrecy * 2, np.int32, mf.READ_ONLY),
        ]
        d_axis_corrections = parray.empty(self.queue, self.nprojs, np.float32)
        d_axis_corrections.fill(np.float32(0.0))
        self.add_to_cl_mem({"d_axis_corrections": d_axis_corrections})
        # The slice is padded with a one-pixel border on each side
        self._tmp_extended_img = np.zeros((self.shape[0] + 2, self.shape[1] + 2),
                                          dtype=np.float32)
        if not self._use_textures:
            self.allocate_slice()
        else:
            self.allocate_textures()
        self.allocate_buffers()
        self._ex_sino = np.zeros((self._dimrecy, self._dimrecx),
                                 dtype=np.float32)
        if not self._use_textures:
            # enqueue_fill_buffer has issues if OpenCL 1.2 is not present,
            # hence the pyopencl.array fill
            self.cl_mem["d_slice"].fill(0.)
        # Precomputations
        self.compute_angles()
        self.proj_precomputations()
        self.cl_mem["d_axis_corrections"].fill(0.)
        # Shorthand
        self._d_sino = self.cl_mem["_d_sino"]

        compile_options = None
        if not self._use_textures:
            compile_options = "-DDONT_USE_TEXTURES"
        OpenclProcessing.compile_kernels(
            self,
            self.kernel_files,
            compile_options=compile_options
        )
        # check that workgroup can actually be (16, 16)
        self.compiletime_workgroup_size = self.kernels.max_workgroup_size("forward_kernel_cpu")

    def compute_angles(self):
        """Pad the angles to the gridded size and upload them to the device.

        The last valid angle is replicated in the padding region so the
        extra rows produce harmless projections.
        """
        angles2 = np.zeros(self._dimrecy, dtype=np.float32)  # dimrecy != num_projs
        angles2[:self.nprojs] = np.copy(self.angles)
        angles2[self.nprojs:] = angles2[self.nprojs - 1]
        self.angles2 = angles2
        pyopencl.enqueue_copy(self.queue, self.cl_mem["d_angles"], angles2)

    def allocate_slice(self):
        """Allocate the device buffer holding the zero-padded slice
        (used when textures are not available)."""
        ary = parray.empty(self.queue, (self.shape[1] + 2, self.shape[1] + 2), np.float32)
        ary.fill(0)
        self.add_to_cl_mem({"d_slice": ary})

    def allocate_textures(self):
        """Allocate the 2D image (texture) holding the zero-padded slice."""
        self.d_image_tex = pyopencl.Image(
            self.ctx,
            mf.READ_ONLY | mf.USE_HOST_PTR,
            pyopencl.ImageFormat(
                pyopencl.channel_order.INTENSITY,
                pyopencl.channel_type.FLOAT
            ), hostbuf=np.ascontiguousarray(self._tmp_extended_img.T),
        )

    def transfer_to_texture(self, image):
        """Transfer a host image to the device slice storage.

        :param image: 2D numpy array; converted to a C-contiguous float32
                      array if it is not one already.
        :return: the copy event (or the result of :meth:`transfer_to_slice`
                 when textures are not used)
        """
        image2 = image
        if not (image.flags["C_CONTIGUOUS"] and image.dtype == np.float32):
            # force the dtype as well: the device storage is float32, so
            # copying e.g. float64 bytes verbatim would corrupt the data
            image2 = np.ascontiguousarray(image, dtype=np.float32)
        if not self._use_textures:
            # TODO: create NoneEvent
            return self.transfer_to_slice(image2)
        else:
            # (1, 1) origin: skip the one-pixel zero border
            return pyopencl.enqueue_copy(
                self.queue,
                self.d_image_tex,
                image2,
                origin=(1, 1),
                region=image.shape[::-1]
            )

    def transfer_device_to_texture(self, d_image):
        """Copy a device buffer into the device slice storage.

        :param d_image: device buffer holding the slice data
        """
        if not self._use_textures:
            # TODO this copy should not be necessary
            return self.cpy2d_to_slice(d_image)
        else:
            return pyopencl.enqueue_copy(
                self.queue,
                self.d_image_tex,
                d_image,
                offset=0,
                origin=(1, 1),
                region=(int(self.shape[1]), int(self.shape[0]))  # self.shape[::-1] # pyopencl <= 2015.2
            )

    def transfer_to_slice(self, image):
        """Upload a host image into ``d_slice``, adding the one-pixel
        zero border expected by the kernels."""
        image2 = np.zeros((image.shape[0] + 2, image.shape[1] + 2), dtype=np.float32)
        image2[1:-1, 1:-1] = image.astype(np.float32)
        self.cl_mem["d_slice"].set(image2)

    def proj_precomputations(self):
        """Precompute, per projection angle, the ray traversal start
        position and strides used by the Joseph projection kernel, and
        upload them to the device.

        The four cases select the dominant traversal direction per
        angular quadrant (0.70710678 ~= cos(pi/4)).
        """
        beginPos = np.zeros((2, self._dimrecy), dtype=np.int32)
        strideJoseph = np.zeros((2, self._dimrecy), dtype=np.int32)
        strideLine = np.zeros((2, self._dimrecy), dtype=np.int32)
        cos_angles = np.cos(self.angles2)
        sin_angles = np.sin(self.angles2)
        dimslice = self.shape[1]

        M1 = np.abs(cos_angles) > 0.70710678
        M1b = np.logical_not(M1)
        M2 = cos_angles > 0
        M2b = np.logical_not(M2)
        M3 = sin_angles > 0
        M3b = np.logical_not(M3)
        case1 = M1 * M2
        case2 = M1 * M2b
        case3 = M1b * M3
        case4 = M1b * M3b

        beginPos[0][case1] = 0
        beginPos[1][case1] = 0
        strideJoseph[0][case1] = 1
        strideJoseph[1][case1] = 0
        strideLine[0][case1] = 0
        strideLine[1][case1] = 1

        beginPos[0][case2] = dimslice - 1
        beginPos[1][case2] = dimslice - 1
        strideJoseph[0][case2] = -1
        strideJoseph[1][case2] = 0
        strideLine[0][case2] = 0
        strideLine[1][case2] = -1

        beginPos[0][case3] = dimslice - 1
        beginPos[1][case3] = 0
        strideJoseph[0][case3] = 0
        strideJoseph[1][case3] = 1
        strideLine[0][case3] = -1
        strideLine[1][case3] = 0

        beginPos[0][case4] = 0
        beginPos[1][case4] = dimslice - 1
        strideJoseph[0][case4] = 0
        strideJoseph[1][case4] = -1
        strideLine[0][case4] = 1
        strideLine[1][case4] = 0

        pyopencl.enqueue_copy(self.queue, self.cl_mem["d_beginPos"], beginPos)
        pyopencl.enqueue_copy(self.queue, self.cl_mem["d_strideJoseph"], strideJoseph)
        pyopencl.enqueue_copy(self.queue, self.cl_mem["d_strideLine"], strideLine)

    def _get_local_mem(self):
        """Return the kernel's local memory block (constant for all image sizes)."""
        return pyopencl.LocalMemory(self.local_mem)

    def cpy2d_to_sino(self, dst):
        """Rectangular device-to-device copy of the sinogram into ``dst``.

        :param dst: destination pyopencl array of shape (nprojs, dwidth)
        :return: the copy event
        """
        ndrange = (int(self.dwidth), int(self.nprojs))  # pyopencl < 2015.2
        sino_shape_ocl = np.int32(ndrange)
        wg = None
        kernel_args = (
            dst.data,
            self._d_sino,
            np.int32(self.dwidth),
            np.int32(self._dimrecx),
            np.int32((0, 0)),
            np.int32((0, 0)),
            sino_shape_ocl
        )
        return self.kernels.cpy2d(self.queue, ndrange, wg, *kernel_args)

    def cpy2d_to_slice(self, src):
        """Copy a Nx * Ny slice to self.d_slice which is (Nx+2)*(Ny+2),
        leaving the one-pixel zero border untouched.

        :param src: source device buffer
        :return: the copy event
        """
        ndrange = (int(self.shape[1]), int(self.shape[0]))  # self.shape[::-1] # pyopencl < 2015.2
        wg = None
        slice_shape_ocl = np.int32(ndrange)
        kernel_args = (
            self.cl_mem["d_slice"].data,
            src,
            np.int32(self.shape[1] + 2),
            np.int32(self.shape[1]),
            np.int32((1, 1)),
            np.int32((0, 0)),
            slice_shape_ocl
        )
        return self.kernels.cpy2d(self.queue, ndrange, wg, *kernel_args)

    def projection(self, image=None, dst=None):
        """Perform the projection on an input image

        :param image: Image to project; if None, use the slice already on
                      the device.
        :param dst: Optional destination device array; if None, the result
                    is copied back to the host.
        :return: A sinogram (numpy array if dst is None, else dst)
        """
        events = []
        with self.sem:
            if image is not None:
                assert image.ndim == 2, "Treat only 2D images"
                assert image.shape[0] == self.shape[0], \
                    "image has %d rows, expected %d" % (image.shape[0], self.shape[0])
                assert image.shape[1] == self.shape[1], \
                    "image has %d columns, expected %d" % (image.shape[1], self.shape[1])
                if self._use_textures:
                    self.transfer_to_texture(image)
                    slice_ref = self.d_image_tex
                else:
                    self.transfer_to_slice(image)
                    slice_ref = self.cl_mem["d_slice"].data
            else:
                if not self._use_textures:
                    slice_ref = self.cl_mem["d_slice"].data
                else:
                    slice_ref = self.d_image_tex

            kernel_args = (
                self._d_sino,
                slice_ref,
                np.int32(self.shape[1]),
                np.int32(self.dwidth),
                self.cl_mem["d_angles"],
                np.float32(self.axis_pos0),
                self.cl_mem["d_axis_corrections"].data,  # TODO custom
                self.cl_mem["d_beginPos"],
                self.cl_mem["d_strideJoseph"],
                self.cl_mem["d_strideLine"],
                np.int32(self.nprojs),
                self._dimrecx,
                self._dimrecy,
                self.offset_x,
                self.offset_y,
                np.int32(1),  # josephnoclip, 1 by default
                np.int32(self.normalize)
            )

            # Call the kernel
            if not self._use_textures:
                event_pj = self.kernels.forward_kernel_cpu(
                    self.queue,
                    self.ndrange,
                    self.wg,
                    *kernel_args
                )
            else:
                event_pj = self.kernels.forward_kernel(
                    self.queue,
                    self.ndrange,
                    self.wg,
                    *kernel_args
                )
            events.append(EventDescription("projection", event_pj))
            if dst is None:
                self._ex_sino[:] = 0
                ev = pyopencl.enqueue_copy(self.queue, self._ex_sino, self._d_sino)
                events.append(EventDescription("copy D->H result", ev))
                ev.wait()
                # crop the gridded sinogram to its meaningful region
                res = np.copy(self._ex_sino[:self.nprojs, :self.dwidth])
            else:
                ev = self.cpy2d_to_sino(dst)
                events.append(EventDescription("copy D->D result", ev))
                ev.wait()
                res = dst
        # /with self.sem
        if self.profile:
            self.events += events
        return res

    __call__ = projection
class ReconstructionAlgorithm(OpenclProcessing):
    """Common base of the iterative tomographic reconstruction algorithms.

    :param sino_shape: shape of the sinogram. The sinogram is in the format
                       (n_b, n_a) where n_b is the number of detector bins
                       and n_a is the number of angles.
    :param slice_shape: Optional, shape of the reconstructed slice.
                        By default, it is a square slice where the dimension
                        is the "x dimension" of the sinogram (number of bins).
    :param axis_position: Optional, axis position. Default is
                          `(shape[1]-1)/2.0`.
    :param angles: Optional, a list of custom angles in radian.
    :param ctx: actual working context, left to None for automatic
                initialization from device type or platformid/deviceid
    :param platformid: integer with the platform_identifier, as given by clinfo
    :param deviceid: Integer with the device identifier, as given by clinfo
    :param profile: switch on profiling to be able to profile at the kernel
                    level, store profiling elements (makes code slightly slower)
    """

    def __init__(self, sino_shape, slice_shape=None, axis_position=None, angles=None,
                 ctx=None, devicetype="all", platformid=None, deviceid=None,
                 profile=False):
        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
                                  platformid=platformid, deviceid=deviceid,
                                  profile=profile)

        # The backprojector is built first: it determines the slice
        # geometry (slice_shape, angles, num_bins) that the projector
        # is then configured with.
        self.backprojector = Backprojection(
            sino_shape,
            slice_shape=slice_shape,
            axis_position=axis_position,
            angles=angles,
            ctx=self.ctx,
            profile=profile
        )
        self.projector = Projection(
            self.backprojector.slice_shape,
            self.backprojector.angles,
            axis_position=axis_position,
            detector_width=self.backprojector.num_bins,
            normalize=False,
            ctx=self.ctx,
            profile=profile
        )
        self.sino_shape = sino_shape
        self.is_cpu = self.backprojector.is_cpu

        def _device_zeros(shape):
            # zero-initialized float32 device array
            arr = parray.empty(self.queue, shape, dtype=np.float32)
            arr.fill(0.0)
            return arr

        # Device arrays: input data, current sinogram estimate,
        # current slice iterate and previous slice iterate
        self.d_data = _device_zeros(sino_shape)
        self.d_sino = _device_zeros(sino_shape)
        self.d_x = _device_zeros(self.backprojector.slice_shape)
        self.d_x_old = _device_zeros(self.backprojector.slice_shape)

        self.add_to_cl_mem({
            "d_data": self.d_data,
            "d_sino": self.d_sino,
            "d_x": self.d_x,
            "d_x_old": self.d_x_old,
        })

    def proj(self, d_slice, d_sino):
        """Forward-project device array ``d_slice`` into ``d_sino``."""
        self.projector.transfer_device_to_texture(d_slice.data)  # .wait()
        self.projector.projection(dst=d_sino)

    def backproj(self, d_sino, d_slice):
        """Backproject device array ``d_sino`` into ``d_slice``."""
        self.backprojector.transfer_device_to_texture(d_sino.data)  # .wait()
        self.backprojector.backprojection(dst=d_slice)
class TV(ReconstructionAlgorithm):
    """Total-Variation regularized reconstruction with the Chambolle-Pock
    primal-dual algorithm.

    :param sino_shape: shape of the sinogram. The sinogram is in the format
                       (n_b, n_a) where n_b is the number of detector bins
                       and n_a is the number of angles.
    :param slice_shape: Optional, shape of the reconstructed slice. By
                        default, it is a square slice where the dimension is
                        the "x dimension" of the sinogram (number of bins).
    :param axis_position: Optional, axis position. Default is
                          `(shape[1]-1)/2.0`.
    :param angles: Optional, a list of custom angles in radian.
    :param ctx: actual working context, left to None for automatic
                initialization from device type or platformid/deviceid
    :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL"
    :param platformid: integer with the platform_identifier, as given by clinfo
    :param deviceid: Integer with the device identifier, as given by clinfo
    :param profile: switch on profiling to be able to profile at the kernel
                    level, store profiling elements (makes code slightly slower)

    .. warning:: This is a beta version of the Chambolle-Pock TV algorithm.
                 Reconstruction fails for at least on CPU (Xeon E3-1245 v5)
                 using the AMD opencl implementation.
    """

    def __init__(self, sino_shape, slice_shape=None, axis_position=None, angles=None,
                 ctx=None, devicetype="all", platformid=None, deviceid=None,
                 profile=False):
        ReconstructionAlgorithm.__init__(self, sino_shape, slice_shape=slice_shape,
                                         axis_position=axis_position, angles=angles,
                                         ctx=ctx, devicetype=devicetype, platformid=platformid,
                                         deviceid=deviceid, profile=profile)
        self.compute_preconditioners()

        # Gradient/divergence operators live in a dedicated LinAlg instance
        self.linalg = LinAlg(self.backprojector.slice_shape, ctx=self.ctx)
        # Positivity constraint on the slice
        self.elwise_clamp = ElementwiseKernel(self.ctx, "float *a", "a[i] = max(a[i], 0.0f);")
        # Projection onto the L-infinity ball of radius Lambda
        self.elwise_proj_linf = ElementwiseKernel(
            self.ctx,
            "float2* a, float Lambda",
            "a[i].x = copysign(min(fabs(a[i].x), Lambda), a[i].x); a[i].y = copysign(min(fabs(a[i].y), Lambda), a[i].y);",
            "elwise_proj_linf"
        )
        # Run gradient once so that linalg.cl_mem["d_gradient"] exists and
        # can serve as a template for the dual variable p
        self.linalg.gradient(self.d_x)
        self.d_p = parray.empty_like(self.linalg.cl_mem["d_gradient"])
        self.d_q = parray.empty_like(self.d_data)
        self.d_g = self.linalg.d_image
        self.d_tmp = parray.empty_like(self.d_x)
        for dual_arr in (self.d_p, self.d_q, self.d_tmp):
            dual_arr.fill(0)
        self.add_to_cl_mem({
            "d_p": self.d_p,
            "d_q": self.d_q,
            "d_tmp": self.d_tmp,
        })

        # Over-relaxation parameter of Chambolle-Pock
        self.theta = 1.0

    def compute_preconditioners(self):
        """Build the diagonal preconditioners "Sigma" and "Tau".

        Each diagonal term is derived from the row/column sums of the
        projector/backprojector, i.e. the projection/backprojection of an
        array of ones [2].

        [2] T. Pock, A. Chambolle,
            Diagonal preconditioning for first order primal-dual algorithms in
            convex optimization,
            International Conference on Computer Vision, 2011
        """
        # Diagonal preconditioner "Sigma" (dual step sizes)
        ones_slice = np.ones(self.backprojector.slice_shape, dtype=np.float32)
        Sigma_k = 1. / self.projector.projection(ones_slice)
        # Guard against division by zero when the rotation axis is excentred
        Sigma_k[np.logical_not(np.isfinite(Sigma_k))] = 1.
        self.d_Sigma_k = parray.to_device(self.queue, Sigma_k)
        self.d_Sigma_kp1 = self.d_Sigma_k + 1  # TODO: memory vs computation
        # For the discrete gradient, sum|D_i,j| = 2 along lines or cols
        self.Sigma_grad = 1 / 2.0

        # Diagonal preconditioner "Tau" (primal step sizes)
        ones_sino = np.ones(self.sino_shape, dtype=np.float32)
        backproj_of_ones = self.backprojector.backprojection(ones_sino)
        Tau = 1. / (backproj_of_ones + 2.)
        self.d_Tau = parray.to_device(self.queue, Tau)

        self.add_to_cl_mem({
            "d_Sigma_k": self.d_Sigma_k,
            "d_Sigma_kp1": self.d_Sigma_kp1,
            "d_Tau": self.d_Tau
        })

    def run(self, data, n_it, Lambda, pos_constraint=False):
        """
        Run n_it iterations of the TV-regularized reconstruction,
        with the regularization parameter Lambda.
        """
        cl.enqueue_copy(self.queue, self.d_data.data, np.ascontiguousarray(data.astype(np.float32)))

        # Short aliases on the device arrays
        d_x = self.d_x
        d_x_old = self.d_x_old
        d_tmp = self.d_tmp
        d_sino = self.d_sino
        d_p = self.d_p
        d_q = self.d_q
        d_g = self.d_g  # aliases self.linalg.d_image

        d_x *= 0
        d_p *= 0
        d_q *= 0

        for _ in range(n_it):
            # --- Primal update: x = x + Tau*div(p) - Tau*Kadj(q)
            d_x_old[:] = d_x[:]
            self.backproj(d_q, d_tmp)
            self.linalg.divergence(d_p)
            # TODO: this in less than three ops (one kernel ?)
            d_g -= d_tmp  # d_g -> L.d_image
            d_g *= self.d_Tau
            d_x += d_g

            if pos_constraint:
                self.elwise_clamp(d_x)

            # --- Dual update (p):
            # p = proj_linf(p + Sigma_grad*gradient(x + theta*(x - x_old)), Lambda)
            d_tmp[:] = d_x[:]
            # FIXME: mul_add is out of place, put an equivalent thing in linalg...
            # d_tmp.mul_add(1 + theta, d_x_old, -theta)
            d_tmp *= 1 + self.theta
            d_tmp -= self.theta * d_x_old
            self.linalg.gradient(d_tmp)
            # TODO: out of place mul_add
            # d_p.mul_add(1, L.cl_mem["d_gradient"], Sigma_grad)
            self.linalg.cl_mem["d_gradient"] *= self.Sigma_grad
            d_p += self.linalg.cl_mem["d_gradient"]
            self.elwise_proj_linf(d_p, Lambda)

            # --- Dual update (q):
            # q = (q + Sigma_k*K(x + theta*(x - x_old)) - Sigma_k*data)/(1.0 + Sigma_k)
            self.proj(d_tmp, d_sino)
            # TODO: this in less instructions
            d_sino -= self.d_data
            d_sino *= self.d_Sigma_k
            d_q += d_sino
            d_q /= self.d_Sigma_kp1
        return d_x

    __call__ = run
def configuration(parent_package='', top_path=None):
    """Return the numpy.distutils configuration of the ``opencl`` package.

    :param parent_package: name of the parent package
    :param top_path: path to the project root
    :return: a ``Configuration`` instance declaring the sub-packages
    """
    config = Configuration('opencl', parent_package, top_path)
    # 'sift' is optional: declare it only when it is present on disk
    # (it may be stripped from some source distributions)
    package_dir = os.path.dirname(os.path.abspath(__file__))
    if os.path.exists(os.path.join(package_dir, 'sift')):
        config.add_subpackage('sift')
    config.add_subpackage('codec')
    config.add_subpackage('test')
    return config


if __name__ == "__main__":
    from numpy.distutils.core import setup
    setup(configuration=configuration)
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# ###########################################################################*/ +"""Module for sinogram filtering on CPU/GPU.""" + +from __future__ import absolute_import, print_function, with_statement, division + +__authors__ = ["P. Paleo"] +__license__ = "MIT" +__date__ = "07/06/2019" + +import numpy as np +from math import pi + + +import pyopencl.array as parray +from .common import pyopencl as cl +from .processing import OpenclProcessing +from ..math.fft.clfft import CLFFT, __have_clfft__ +from ..math.fft.npfft import NPFFT +from ..image.tomography import generate_powers, get_next_power, compute_fourier_filter +from ..utils.deprecation import deprecated + + + +class SinoFilter(OpenclProcessing): + """A class for performing sinogram filtering on GPU using OpenCL. + + This is a convolution in the Fourier space, along one dimension: + + - In 2D: (n_a, d_x): n_a filterings (1D FFT of size d_x) + - In 3D: (n_z, n_a, d_x): n_z*n_a filterings (1D FFT of size d_x) + """ + kernel_files = ["array_utils.cl"] + powers = generate_powers() + + def __init__(self, sino_shape, filter_name=None, ctx=None, + devicetype="all", platformid=None, deviceid=None, + profile=False, extra_options=None): + """Constructor of OpenCL FFT-Convolve. + + :param sino_shape: shape of the sinogram. + :param filter_name: Name of the filter. Defaut is "ram-lak". 
+ :param ctx: actual working context, left to None for automatic + initialization from device type or platformid/deviceid + :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL" + :param platformid: integer with the platform_identifier, as given by + clinfo + :param deviceid: Integer with the device identifier, as given by clinfo + :param profile: switch on profiling to be able to profile at the kernel + level, store profiling elements (makes code slightly + slower) + :param dict extra_options: Advanced extra options. + Current options are: cutoff, use_numpy_fft + """ + OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype, + platformid=platformid, deviceid=deviceid, + profile=profile) + + self._init_extra_options(extra_options) + self._calculate_shapes(sino_shape) + self._init_fft() + self._allocate_memory() + self._compute_filter(filter_name) + self._init_kernels() + + def _calculate_shapes(self, sino_shape): + """ + + :param sino_shape: shape of the sinogram. + """ + self.ndim = len(sino_shape) + if self.ndim == 2: + n_angles, dwidth = sino_shape + else: + raise ValueError("Invalid sinogram number of dimensions: " + "expected 2 dimensions") + self.sino_shape = sino_shape + self.n_angles = n_angles + self.dwidth = dwidth + self.dwidth_padded = get_next_power(2 * self.dwidth, powers=self.powers) + self.sino_padded_shape = (n_angles, self.dwidth_padded) + sino_f_shape = list(self.sino_padded_shape) + sino_f_shape[-1] = sino_f_shape[-1] // 2 + 1 + self.sino_f_shape = tuple(sino_f_shape) + + def _init_extra_options(self, extra_options): + """ + + :param dict extra_options: Advanced extra options. 
+ Current options are: cutoff, + """ + self.extra_options = { + "cutoff": 1., + "use_numpy_fft": False, + } + if extra_options is not None: + self.extra_options.update(extra_options) + + def _init_fft(self): + if __have_clfft__ and not(self.extra_options["use_numpy_fft"]): + self.fft_backend = "opencl" + self.fft = CLFFT( + self.sino_padded_shape, + dtype=np.float32, + axes=(-1,), + ctx=self.ctx, + ) + else: + self.fft_backend = "numpy" + print("The gpyfft module was not found. The Fourier transforms " + "will be done on CPU. For more performances, it is advised " + "to install gpyfft.""") + self.fft = NPFFT( + template=np.zeros(self.sino_padded_shape, "f"), + axes=(-1,), + ) + + def _allocate_memory(self): + self.d_filter_f = parray.zeros(self.queue, (self.sino_f_shape[-1],), np.complex64) + self.is_cpu = (self.device.type == "CPU") + # These are already allocated by FFT() if using the opencl backend + if self.fft_backend == "opencl": + self.d_sino_padded = self.fft.data_in + self.d_sino_f = self.fft.data_out + else: + # When using the numpy backend, arrays are not pre-allocated + self.d_sino_padded = np.zeros(self.sino_padded_shape, "f") + self.d_sino_f = np.zeros(self.sino_f_shape, np.complex64) + # These are needed for rectangular memcpy in certain cases (see below). + self.tmp_sino_device = parray.zeros(self.queue, self.sino_shape, "f") + self.tmp_sino_host = np.zeros(self.sino_shape, "f") + + def _compute_filter(self, filter_name): + """ + + :param str filter_name: filter name + """ + self.filter_name = filter_name or "ram-lak" + filter_f = compute_fourier_filter( + self.dwidth_padded, + self.filter_name, + cutoff=self.extra_options["cutoff"], + )[:self.dwidth_padded // 2 + 1] # R2C + self.set_filter(filter_f, normalize=True) + + def set_filter(self, h_filt, normalize=True): + """ + Set a filter for sinogram filtering. + + :param h_filt: Filter. Each line of the sinogram will be filtered with + this filter. 
It has to be the Real-to-Complex Fourier Transform + of some real filter, padded to 2*sinogram_width. + :param normalize: Whether to normalize the filter with pi/num_angles. + """ + if h_filt.size != self.sino_f_shape[-1]: + raise ValueError( + """ + Invalid filter size: expected %d, got %d. + Please check that the filter is the Fourier R2C transform of + some real 1D filter. + """ + % (self.sino_f_shape[-1], h_filt.size) + ) + if not(np.iscomplexobj(h_filt)): + print("Warning: expected a complex Fourier filter") + self.filter_f = h_filt + if normalize: + self.filter_f *= pi / self.n_angles + self.filter_f = self.filter_f.astype(np.complex64) + self.d_filter_f[:] = self.filter_f[:] + + def _init_kernels(self): + OpenclProcessing.compile_kernels(self, self.kernel_files) + h, w = self.d_sino_f.shape + self.mult_kern_args = (self.queue, (int(w), (int(h))), None, + self.d_sino_f.data, + self.d_filter_f.data, + np.int32(w), + np.int32(h)) + + def check_array(self, arr): + if arr.dtype != np.float32: + raise ValueError("Expected data type = numpy.float32") + if arr.shape != self.sino_shape: + raise ValueError("Expected sinogram shape %s, got %s" % + (self.sino_shape, arr.shape)) + if not(isinstance(arr, np.ndarray) or isinstance(arr, parray.Array)): + raise ValueError("Expected either numpy.ndarray or " + "pyopencl.array.Array") + + def copy2d(self, dst, src, transfer_shape, dst_offset=(0, 0), + src_offset=(0, 0)): + """ + + :param dst: + :param src: + :param transfer_shape: + :param dst_offset: + :param src_offset: + """ + shape = tuple(int(i) for i in transfer_shape[::-1]) + ev = self.kernels.cpy2d(self.queue, shape, None, + dst.data, + src.data, + np.int32(dst.shape[1]), + np.int32(src.shape[1]), + np.int32(dst_offset), + np.int32(src_offset), + np.int32(transfer_shape[::-1])) + ev.wait() + + def copy2d_host(self, dst, src, transfer_shape, dst_offset=(0, 0), + src_offset=(0, 0)): + """ + + :param dst: + :param src: + :param transfer_shape: + :param dst_offset: + 
:param src_offset: + """ + s = transfer_shape + do = dst_offset + so = src_offset + dst[do[0]:do[0] + s[0], do[1]:do[1] + s[1]] = src[so[0]:so[0] + s[0], so[1]:so[1] + s[1]] + + def _prepare_input_sino(self, sino): + """ + :param sino: sinogram + """ + self.check_array(sino) + self.d_sino_padded.fill(0) + if self.fft_backend == "opencl": + # OpenCL backend: FFT/mult/IFFT are done on device. + if isinstance(sino, np.ndarray): + # OpenCL backend + numpy input: copy H->D. + # As pyopencl does not support rectangular copies, we have to + # do a copy H->D in a temporary device buffer, and then call a + # kernel doing the rectangular D-D copy. + self.tmp_sino_device[:] = sino[:] + if self.is_cpu: + self.tmp_sino_device.finish() + d_sino_ref = self.tmp_sino_device + else: + d_sino_ref = sino + # Rectangular copy D->D + self.copy2d(self.d_sino_padded, d_sino_ref, self.sino_shape) + if self.is_cpu: + self.d_sino_padded.finish() # should not be required here + else: + # Numpy backend: FFT/mult/IFFT are done on host. + if not(isinstance(sino, np.ndarray)): + # Numpy backend + pyopencl input: need to copy D->H + self.tmp_sino_host[:] = sino[:] + h_sino_ref = self.tmp_sino_host + else: + h_sino_ref = sino + # Rectangular copy H->H + self.copy2d_host(self.d_sino_padded, h_sino_ref, self.sino_shape) + + def _get_output_sino(self, output): + """ + :param Union[numpy.dtype,None] output: sinogram output. + :return: sinogram + """ + if output is None: + res = np.zeros(self.sino_shape, dtype=np.float32) + else: + res = output + if self.fft_backend == "opencl": + if isinstance(res, np.ndarray): + # OpenCL backend + numpy output: copy D->H + # As pyopencl does not support rectangular copies, we first have + # to call a kernel doing rectangular copy D->D, then do a copy + # D->H. 
+ self.copy2d(dst=self.tmp_sino_device, + src=self.d_sino_padded, + transfer_shape=self.sino_shape) + if self.is_cpu: + self.tmp_sino_device.finish() # should not be required here + res[:] = self.tmp_sino_device.get()[:] + else: + if self.is_cpu: + self.d_sino_padded.finish() + self.copy2d(res, self.d_sino_padded, self.sino_shape) + if self.is_cpu: + res.finish() # should not be required here + else: + if not(isinstance(res, np.ndarray)): + # Numpy backend + pyopencl output: rect copy H->H + copy H->D + self.copy2d_host(dst=self.tmp_sino_host, + src=self.d_sino_padded, + transfer_shape=self.sino_shape) + res[:] = self.tmp_sino_host[:] + else: + # Numpy backend + numpy output: rect copy H->H + self.copy2d_host(res, self.d_sino_padded, self.sino_shape) + return res + + def _do_fft(self): + if self.fft_backend == "opencl": + self.fft.fft(self.d_sino_padded, output=self.d_sino_f) + if self.is_cpu: + self.d_sino_f.finish() + else: + # numpy backend does not support "output=" argument, + # and rfft always return a complex128 result. + res = self.fft.fft(self.d_sino_padded).astype(np.complex64) + self.d_sino_f[:] = res[:] + + def _multiply_fourier(self): + if self.fft_backend == "opencl": + # Everything is on device. Call the multiplication kernel. + ev = self.kernels.inplace_complex_mul_2Dby1D( + *self.mult_kern_args + ) + ev.wait() + if self.is_cpu: + self.d_sino_f.finish() # should not be required here + else: + # Everything is on host. + self.d_sino_f *= self.filter_f + + def _do_ifft(self): + if self.fft_backend == "opencl": + if self.is_cpu: + self.d_sino_padded.fill(0) + self.d_sino_padded.finish() + self.fft.ifft(self.d_sino_f, output=self.d_sino_padded) + if self.is_cpu: + self.d_sino_padded.finish() + else: + # numpy backend does not support "output=" argument, + # and irfft always return a float64 result. 
+ res = self.fft.ifft(self.d_sino_f).astype("f") + self.d_sino_padded[:] = res[:] + + def filter_sino(self, sino, output=None): + """ + + :param sino: sinogram + :param output: + :return: filtered sinogram + """ + # Handle input sinogram + self._prepare_input_sino(sino) + # FFT + self._do_fft() + # multiply with filter in the Fourier domain + self._multiply_fourier() + # iFFT + self._do_ifft() + # return + res = self._get_output_sino(output) + return res + # ~ return output + + __call__ = filter_sino + + + + +# ------------------- +# - Compatibility - +# ------------------- + + +def nextpow2(N): + p = 1 + while p < N: + p *= 2 + return p + + +@deprecated(replacement="Backprojection.sino_filter", since_version="0.10") +def fourier_filter(sino, filter_=None, fft_size=None): + """Simple np based implementation of fourier space filter. + This function is deprecated, please use silx.opencl.sinofilter.SinoFilter. + + :param sino: of shape shape = (num_projs, num_bins) + :param filter: filter function to apply in fourier space + :fft_size: size on which perform the fft. May be larger than the sino array + :return: filtered sinogram + """ + assert sino.ndim == 2 + num_projs, num_bins = sino.shape + if fft_size is None: + fft_size = nextpow2(num_bins * 2 - 1) + else: + assert fft_size >= num_bins + if fft_size == num_bins: + sino_zeropadded = sino.astype(np.float32) + else: + sino_zeropadded = np.zeros((num_projs, fft_size), + dtype=np.complex64) + sino_zeropadded[:, :num_bins] = sino.astype(np.float32) + + if filter_ is None: + h = np.zeros(fft_size, dtype=np.float32) + L2 = fft_size // 2 + 1 + h[0] = 1 / 4. + j = np.linspace(1, L2, L2 // 2, False) + h[1:L2:2] = -1. 
/ (np.pi ** 2 * j ** 2) + h[L2:] = np.copy(h[1:L2 - 1][::-1]) + filter_ = np.fft.fft(h).astype(np.complex64) + + # Linear convolution + sino_f = np.fft.fft(sino, fft_size) + sino_f = sino_f * filter_ + sino_filtered = np.fft.ifft(sino_f)[:, :num_bins].real + + return np.ascontiguousarray(sino_filtered.real, dtype=np.float32) diff --git a/src/silx/opencl/sparse.py b/src/silx/opencl/sparse.py new file mode 100644 index 0000000..514589a --- /dev/null +++ b/src/silx/opencl/sparse.py @@ -0,0 +1,377 @@ +#!/usr/bin/env python +# coding: utf-8 +# /*########################################################################## +# +# Copyright (c) 2019 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+# +# ###########################################################################*/ +"""Module for data sparsification on CPU/GPU.""" + +from __future__ import absolute_import, print_function, with_statement, division + +__authors__ = ["P. Paleo"] +__license__ = "MIT" +__date__ = "07/06/2019" + +import numpy +import pyopencl.array as parray +from collections import namedtuple +from pyopencl.scan import GenericScanKernel +from pyopencl.tools import dtype_to_ctype +from .common import pyopencl as cl +from .processing import OpenclProcessing, EventDescription, BufferDescription +mf = cl.mem_flags + + +CSRData = namedtuple("CSRData", ["data", "indices", "indptr"]) + +def tuple_to_csrdata(arrs): + """ + Converts a 3-tuple to a CSRData namedtuple. + """ + if arrs is None: + return None + return CSRData(data=arrs[0], indices=arrs[1], indptr=arrs[2]) + + + +class CSR(OpenclProcessing): + kernel_files = ["sparse.cl"] + + def __init__(self, shape, dtype="f", max_nnz=None, idx_dtype=numpy.int32, + ctx=None, devicetype="all", platformid=None, deviceid=None, + block_size=None, memory=None, profile=False): + """ + Compute Compressed Sparse Row format of an image (2D matrix). + It is designed to be compatible with scipy.sparse.csr_matrix. + + :param shape: tuple + Matrix shape. + :param dtype: str or numpy.dtype, optional + Numeric data type. By default, sparse matrix data will be float32. + :param max_nnz: int, optional + Maximum number of non-zero elements. By default, the arrays "data" + and "indices" are allocated with prod(shape) elements, but + in practice a much lesser space is needed. + The number of non-zero items cannot be known in advance, but one can + estimate an upper-bound with this parameter to save memory. + + Opencl processing parameters + ----------------------------- + Please refer to the documentation of silx.opencl.processing.OpenclProcessing + for information on the other parameters. 
+ """ + + OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype, + platformid=platformid, deviceid=deviceid, + block_size=block_size, memory=memory, + profile=profile) + self._set_parameters(shape, dtype, max_nnz, idx_dtype) + self._allocate_memory() + self._setup_kernels() + + # -------------------------------------------------------------------------- + # -------------------------- Initialization -------------------------------- + # -------------------------------------------------------------------------- + + def _set_parameters(self, shape, dtype, max_nnz, idx_dtype): + self.shape = shape + self.size = numpy.prod(shape) + self._set_idx_dtype(idx_dtype) + assert len(shape) == 2 # + if max_nnz is None: + self.max_nnz = numpy.prod(shape) # worst case + else: + self.max_nnz = int(max_nnz) + self._set_dtype(dtype) + + + def _set_idx_dtype(self, idx_dtype): + idx_dtype = numpy.dtype(idx_dtype) + if idx_dtype.kind not in ["i", "u"]: + raise ValueError("Not an integer type: %s" % idx_dtype) + # scan value type must have size divisible by 4 bytes + if idx_dtype.itemsize % 4 != 0: + raise ValueError("Due to an internal pyopencl limitation, idx_dtype type must have size divisible by 4 bytes") + self.indice_dtype = idx_dtype # + + + def _set_dtype(self, dtype): + self.dtype = numpy.dtype(dtype) + if self.dtype.kind == "c": + raise ValueError("Complex data is not supported") + if self.dtype == numpy.dtype(numpy.float32): + self._c_zero_str = "0.0f" + elif self.dtype == numpy.dtype(numpy.float64): + self._c_zero_str = "0.0" + else: # assuming integer + self._c_zero_str = "0" + self.c_dtype = dtype_to_ctype(self.dtype) + self.idx_c_dtype = dtype_to_ctype(self.indice_dtype) + + + def _allocate_memory(self): + self.is_cpu = (self.device.type == "CPU") # move to OpenclProcessing ? 
+ self.buffers = [ + BufferDescription("array", (self.size,), self.dtype, mf.READ_ONLY), + BufferDescription("data", (self.max_nnz,), self.dtype, mf.READ_WRITE), + BufferDescription("indices", (self.max_nnz,), self.indice_dtype, mf.READ_WRITE), + BufferDescription("indptr", (self.shape[0]+1,), self.indice_dtype, mf.READ_WRITE), + ] + self.allocate_buffers(use_array=True) + for arr_name in ["array", "data", "indices", "indptr"]: + setattr(self, arr_name, self.cl_mem[arr_name]) + self.cl_mem[arr_name].fill(0) # allocate_buffers() uses empty() + self._old_array = self.array + self._old_data = self.data + self._old_indices = self.indices + self._old_indptr = self.indptr + + + def _setup_kernels(self): + self._setup_compaction_kernel() + self._setup_decompaction_kernel() + + + def _setup_compaction_kernel(self): + kernel_signature = str( + "__global %s *data, \ + __global %s *data_compacted, \ + __global %s *indices, \ + __global %s* indptr \ + """ % (self.c_dtype, self.c_dtype, self.idx_c_dtype, self.idx_c_dtype) + ) + if self.dtype.kind == "f": + map_nonzero_expr = "(fabs(data[i]) > %s) ? 1 : 0" % self._c_zero_str + elif self.dtype.kind in ["u", "i"]: + map_nonzero_expr = "(data[i] != %s) ? 
1 : 0" % self._c_zero_str + else: + raise ValueError("Unknown data type") + + self.scan_kernel = GenericScanKernel( + self.ctx, self.indice_dtype, + arguments=kernel_signature, + input_expr=map_nonzero_expr, + scan_expr="a+b", neutral="0", + output_statement=""" + // item is the running sum of input_expr(i), i.e the cumsum of "nonzero" + if (prev_item != item) { + data_compacted[item-1] = data[i]; + indices[item-1] = GET_INDEX(i); + } + // The last cumsum element of each line of "nonzero" goes to inptr[i] + if ((i+1) % IMAGE_WIDTH == 0) { + indptr[(i/IMAGE_WIDTH)+1] = item; + } + """, + options=["-DIMAGE_WIDTH=%d" % self.shape[1]], + preamble="#define GET_INDEX(i) (i % IMAGE_WIDTH)", + ) + + + def _setup_decompaction_kernel(self): + OpenclProcessing.compile_kernels( + self, + self.kernel_files, + compile_options=[ + "-DIMAGE_WIDTH=%d" % self.shape[1], + "-DDTYPE=%s" % self.c_dtype, + "-DIDX_DTYPE=%s" % self.idx_c_dtype, + ] + ) + device = self.ctx.devices[0] + wg_x = min( + device.max_work_group_size, + 32, + self.kernels.max_workgroup_size("densify_csr") + ) + self._decomp_wg = (wg_x, 1) + self._decomp_grid = (self._decomp_wg[0], self.shape[0]) + + + # -------------------------------------------------------------------------- + # -------------------------- Array utils ----------------------------------- + # -------------------------------------------------------------------------- + + # TODO handle pyopencl Buffer + def check_array(self, arr): + """ + Check that provided array is compatible with current context. + + :param arr: numpy.ndarray or pyopencl.array.Array + 2D array in dense format. + """ + assert arr.size == self.size + assert arr.dtype == self.dtype + + + # TODO handle pyopencl Buffer + def check_sparse_arrays(self, csr_data): + """ + Check that the provided sparse arrays are compatible with the current + context. + + :param arrays: namedtuple CSRData. 
+ It contains the arrays "data", "indices", "indptr" + """ + assert isinstance(csr_data, CSRData) + for arr in [csr_data.data, csr_data.indices, csr_data.indptr]: + assert arr.ndim == 1 + assert csr_data.data.size <= self.max_nnz + assert csr_data.indices.size <= self.max_nnz + assert csr_data.indptr.size == self.shape[0]+1 + assert csr_data.data.dtype == self.dtype + assert csr_data.indices.dtype == self.indice_dtype + assert csr_data.indptr.dtype == self.indice_dtype + + + def set_array(self, arr): + """ + Set the provided array as the current context 2D matrix. + + :param arr: numpy.ndarray or pyopencl.array.Array + 2D array in dense format. + """ + if arr is None: + return + self.check_array(arr) + # GenericScanKernel only supports 1D data + if isinstance(arr, parray.Array): + self._old_array = self.array + self.array = arr + elif isinstance(arr, numpy.ndarray): + self.array[:] = arr.ravel()[:] + else: + raise ValueError("Expected pyopencl array or numpy array") + + + def set_sparse_arrays(self, csr_data): + if csr_data is None: + return + self.check_sparse_arrays(csr_data) + for name, arr in {"data": csr_data.data, "indices": csr_data.indices, "indptr": csr_data.indptr}.items(): + # The current array is a device array. Don't copy, use it directly + if isinstance(arr, parray.Array): + setattr(self, "_old_" + name, getattr(self, name)) + setattr(self, name, arr) + # The current array is a numpy.ndarray: copy H2D + elif isinstance(arr, numpy.ndarray): + getattr(self, name)[:arr.size] = arr[:] + else: + raise ValueError("Unsupported array type: %s" % type(arr)) + + + def _recover_arrays_references(self): + """ + Recover the previous arrays references, and return the references of the + "current" arrays. 
+ """ + array = self.array + data = self.data + indices = self.indices + indptr = self.indptr + for name in ["array", "data", "indices", "indptr"]: + # self.X = self._old_X + setattr(self, name, getattr(self, "_old_" + name)) + return array, (data, indices, indptr) + + + def get_sparse_arrays(self, output): + """ + Get the 2D dense array of the current context. + + :param output: tuple or None + tuple in the form (data, indices, indptr). These arrays have to be + compatible with the current context (size and data type). + The content of these arrays will be overwritten with the result of + the previous computation. + """ + numels = self.max_nnz + if output is None: + data = self.data.get()[:numels] + ind = self.indices.get()[:numels] + indptr = self.indptr.get() + res = (data, ind, indptr) + else: + res = output + return res + + + def get_array(self, output): + if output is None: + res = self.array.get().reshape(self.shape) + else: + res = output + return res + + # -------------------------------------------------------------------------- + # -------------------------- Compaction ------------------------------------ + # -------------------------------------------------------------------------- + + def sparsify(self, arr, output=None): + """ + Convert an image (2D matrix) into a CSR representation. + + :param arr: numpy.ndarray or pyopencl.array.Array + Input array. + :param output: tuple of pyopencl.array.Array, optional + If provided, this must be a tuple of 3 arrays (data, indices, indptr). + The content of each array is overwritten by the computation result. 
+ """ + self.set_array(arr) + self.set_sparse_arrays(tuple_to_csrdata(output)) + evt = self.scan_kernel( + self.array, + self.data, + self.indices, + self.indptr, + ) + #~ evt.wait() + self.profile_add(evt, "sparsification kernel") + res = self.get_sparse_arrays(output) + self._recover_arrays_references() + return res + + # -------------------------------------------------------------------------- + # -------------------------- Decompaction ---------------------------------- + # -------------------------------------------------------------------------- + + def densify(self, data, indices, indptr, output=None): + self.set_sparse_arrays( + CSRData(data=data, indices=indices, indptr=indptr) + ) + self.set_array(output) + evt = self.kernels.densify_csr( + self.queue, + self._decomp_grid, + self._decomp_wg, + self.data.data, + self.indices.data, + self.indptr.data, + self.array.data, + numpy.int32(self.shape[0]), + ) + #~ evt.wait() + self.profile_add(evt, "desparsification kernel") + res = self.get_array(output) + self._recover_arrays_references() + return res + diff --git a/src/silx/opencl/statistics.py b/src/silx/opencl/statistics.py new file mode 100644 index 0000000..a96ee33 --- /dev/null +++ b/src/silx/opencl/statistics.py @@ -0,0 +1,242 @@ +# -*- coding: utf-8 -*- +# +# Project: SILX +# https://github.com/silx-kit/silx +# +# Copyright (C) 2012-2019 European Synchrotron Radiation Facility, Grenoble, France +# +# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# . 
+# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# . +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +"""A module for performing basic statistical analysis (min, max, mean, std) on +large data where numpy is not very efficient. +""" + +__author__ = "Jerome Kieffer" +__license__ = "MIT" +__date__ = "19/05/2021" +__copyright__ = "2012-2019, ESRF, Grenoble" +__contact__ = "jerome.kieffer@esrf.fr" + +import logging +import numpy +from collections import OrderedDict, namedtuple +from math import sqrt + +from .common import pyopencl +from .processing import EventDescription, OpenclProcessing, BufferDescription +from .utils import concatenate_cl_kernel + +if pyopencl: + mf = pyopencl.mem_flags + from pyopencl.reduction import ReductionKernel + try: + from pyopencl import cltypes + except ImportError: + v = pyopencl.array.vec() + float8 = v.float8 + else: + float8 = cltypes.float8 + +else: + raise ImportError("pyopencl is not installed") +logger = logging.getLogger(__name__) + +StatResults = namedtuple("StatResults", ["min", "max", "cnt", "sum", "mean", + "var", "std"]) +zero8 = "(float8)(FLT_MAX, -FLT_MAX, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f)" +# min max cnt cnt_e sum sum_e var var_e + + +class Statistics(OpenclProcessing): + """A class for doing statistical analysis using OpenCL + + :param List[int] size: Shape of input data to treat + :param numpy.dtype dtype: Input data type + :param numpy.ndarray template: Data template to extract size & dtype + :param ctx: 
Actual working context, left to None for automatic + initialization from device type or platformid/deviceid + :param str devicetype: Type of device, can be "CPU", "GPU", "ACC" or "ALL" + :param int platformid: Platform identifier as given by clinfo + :param int deviceid: Device identifier as given by clinfo + :param int block_size: + Preferred workgroup size, may vary depending on the outcome of the compilation + :param bool profile: + Switch on profiling to be able to profile at the kernel level, + store profiling elements (makes code slightly slower) + """ + buffers = [ + BufferDescription("raw", 1, numpy.float32, mf.READ_ONLY), + BufferDescription("converted", 1, numpy.float32, mf.READ_WRITE), + ] + kernel_files = ["preprocess.cl"] + mapping = {numpy.int8: "s8_to_float", + numpy.uint8: "u8_to_float", + numpy.int16: "s16_to_float", + numpy.uint16: "u16_to_float", + numpy.uint32: "u32_to_float", + numpy.int32: "s32_to_float"} + + def __init__(self, size=None, dtype=None, template=None, + ctx=None, devicetype="all", platformid=None, deviceid=None, + block_size=None, profile=False + ): + OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype, + platformid=platformid, deviceid=deviceid, + block_size=block_size, profile=profile) + self.size = size + self.dtype = dtype + if template is not None: + self.size = template.size + self.dtype = template.dtype + + self.buffers = [BufferDescription(i.name, i.size * self.size, i.dtype, i.flags) + for i in self.__class__.buffers] + + self.allocate_buffers(use_array=True) + self.compile_kernels() + self.set_kernel_arguments() + + def set_kernel_arguments(self): + """Parametrize all kernel arguments""" + for val in self.mapping.values(): + self.cl_kernel_args[val] = OrderedDict(((i, self.cl_mem[i]) for i in ("raw", "converted"))) + + def compile_kernels(self): + """Compile the kernel""" + OpenclProcessing.compile_kernels(self, + self.kernel_files, + "-D NIMAGE=%i" % self.size) + compiler_options = 
self.get_compiler_options(x87_volatile=True) + src = concatenate_cl_kernel(("doubleword.cl", "statistics.cl")) + self.reduction_comp = ReductionKernel(self.ctx, + dtype_out=float8, + neutral=zero8, + map_expr="map_statistics(data, i)", + reduce_expr="reduce_statistics(a,b)", + arguments="__global float *data", + preamble=src, + options=compiler_options) + self.reduction_simple = ReductionKernel(self.ctx, + dtype_out=float8, + neutral=zero8, + map_expr="map_statistics(data, i)", + reduce_expr="reduce_statistics_simple(a,b)", + arguments="__global float *data", + preamble=src, + options=compiler_options) + + if "cl_khr_fp64" in self.device.extensions: + self.reduction_double = ReductionKernel(self.ctx, + dtype_out=float8, + neutral=zero8, + map_expr="map_statistics(data, i)", + reduce_expr="reduce_statistics_double(a,b)", + arguments="__global float *data", + preamble=src, + options=compiler_options) + else: + logger.info("Device %s does not support double-precision arithmetics, fall-back on compensated one", self.device) + self.reduction_double = self.reduction_comp + + def send_buffer(self, data, dest): + """ + Send a numpy array to the device, including the cast on the device if + possible + + :param numpy.ndarray data: numpy array with data + :param dest: name of the buffer as registered in the class + """ + logger.info("send data to %s", dest) + dest_type = numpy.dtype([i.dtype for i in self.buffers if i.name == dest][0]) + events = [] + if (data.dtype == dest_type) or (data.dtype.itemsize > dest_type.itemsize): + copy_image = pyopencl.enqueue_copy(self.queue, + self.cl_mem[dest].data, + numpy.ascontiguousarray(data, dest_type)) + events.append(EventDescription("copy H->D %s" % dest, copy_image)) + else: + copy_image = pyopencl.enqueue_copy(self.queue, + self.cl_mem["raw"].data, + numpy.ascontiguousarray(data)) + kernel = getattr(self.program, self.mapping[data.dtype.type]) + cast_to_float = kernel(self.queue, + (self.size,), + None, + self.cl_mem["raw"].data, + 
self.cl_mem[dest].data) + events += [ + EventDescription("copy H->D raw", copy_image), + EventDescription(f"cast to float {dest}", cast_to_float) + ] + if self.profile: + self.events += events + return events + + def process(self, data, comp=True): + """Actually calculate the statics on the data + + :param numpy.ndarray data: numpy array with the image + :param comp: use Kahan compensated arithmetics for the calculation + :return: Statistics named tuple + :rtype: StatResults + """ + if data.ndim != 1: + data = data.ravel() + size = data.size + assert size <= self.size, "size is OK" + events = [] + if comp is True: + comp = "comp" + elif comp is False: + comp = "single" + else: + comp = comp.lower() + with self.sem: + self.send_buffer(data, "converted") + if comp in ("single", "fp32", "float32"): + reduction = self.reduction_simple + elif comp in ("double", "fp64", "float64"): + reduction = self.reduction_double + else: + reduction = self.reduction_comp + res_d, evt = reduction(self.cl_mem["converted"][:self.size], + queue=self.queue, + return_event=True) + events.append(EventDescription(f"statistical reduction {comp}", evt)) + if self.profile: + self.events += events + res_h = res_d.get() + min_ = 1.0 * res_h["s0"] + max_ = 1.0 * res_h["s1"] + count = 1.0 * res_h["s2"] + res_h["s3"] + sum_ = 1.0 * res_h["s4"] + res_h["s5"] + m2 = 1.0 * res_h["s6"] + res_h["s7"] + var = m2 / (count - 1.0) + res = StatResults(min_, + max_, + count, + sum_, + sum_ / count, + var, + sqrt(var)) + return res + + __call__ = process diff --git a/src/silx/opencl/test/__init__.py b/src/silx/opencl/test/__init__.py new file mode 100644 index 0000000..92cda4a --- /dev/null +++ b/src/silx/opencl/test/__init__.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- +# +# Project: silx +# https://github.com/silx-kit/silx +# +# Copyright (C) 2012-2016 European Synchrotron Radiation Facility, Grenoble, France +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software 
and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. diff --git a/src/silx/opencl/test/test_addition.py b/src/silx/opencl/test/test_addition.py new file mode 100644 index 0000000..3b668bf --- /dev/null +++ b/src/silx/opencl/test/test_addition.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Project: Sift implementation in Python + OpenCL +# https://github.com/silx-kit/silx +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +""" +Simple test of an addition +""" + +__authors__ = ["Henri Payno, Jérôme Kieffer"] +__contact__ = "jerome.kieffer@esrf.eu" +__license__ = "MIT" +__copyright__ = "2013 European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "30/11/2020" + +import logging +import numpy +import pytest + +import unittest +from ..common import ocl, _measure_workgroup_size, query_kernel_info +if ocl: + import pyopencl + import pyopencl.array +from ..utils import get_opencl_code +logger = logging.getLogger(__name__) + + +@unittest.skipUnless(ocl, "PyOpenCl is missing") +class TestAddition(unittest.TestCase): + + @classmethod + def setUpClass(cls): + super(TestAddition, cls).setUpClass() + if ocl: + cls.ctx = ocl.create_context() + if logger.getEffectiveLevel() <= logging.INFO: + cls.PROFILE = True + cls.queue = pyopencl.CommandQueue( + cls.ctx, + properties=pyopencl.command_queue_properties.PROFILING_ENABLE) + else: + cls.PROFILE = False + cls.queue = pyopencl.CommandQueue(cls.ctx) + cls.max_valid_wg = 0 + + @classmethod + def tearDownClass(cls): + super(TestAddition, cls).tearDownClass() + print("Maximum valid workgroup size %s on device %s" % (cls.max_valid_wg, cls.ctx.devices[0])) + cls.ctx = None + cls.queue = None + + def setUp(self): + if ocl is None: + return + self.shape = 4096 + self.data = numpy.random.random(self.shape).astype(numpy.float32) + self.d_array_img = pyopencl.array.to_device(self.queue, self.data) + self.d_array_5 = pyopencl.array.empty_like(self.d_array_img) + 
self.d_array_5.fill(-5) + self.program = pyopencl.Program(self.ctx, get_opencl_code("addition")).build() + + def tearDown(self): + self.img = self.data = None + self.d_array_img = self.d_array_5 = self.program = None + + def test_add(self): + """ + tests the addition kernel + """ + maxi = int(round(numpy.log2(self.shape))) + for i in range(maxi): + d_array_result = pyopencl.array.empty_like(self.d_array_img) + wg = 1 << i + try: + evt = self.program.addition(self.queue, (self.shape,), (wg,), + self.d_array_img.data, self.d_array_5.data, d_array_result.data, numpy.int32(self.shape)) + evt.wait() + except Exception as error: + max_valid_wg = self.program.addition.get_work_group_info(pyopencl.kernel_work_group_info.WORK_GROUP_SIZE, self.ctx.devices[0]) + msg = "Error %s on WG=%s: %s" % (error, wg, max_valid_wg) + self.assertLess(max_valid_wg, wg, msg) + break + else: + res = d_array_result.get() + good = numpy.allclose(res, self.data - 5) + if good and wg > self.max_valid_wg: + self.__class__.max_valid_wg = wg + self.assertTrue(good, "calculation is correct for WG=%s" % wg) + + def test_measurement(self): + """ + tests that all devices are working properly ... lengthy and error prone + """ + for platform in ocl.platforms: + for did, device in enumerate(platform.devices): + meas = _measure_workgroup_size((platform.id, device.id)) + self.assertEqual(meas, device.max_work_group_size, + "Workgroup size for %s/%s: %s == %s" % (platform, device, meas, device.max_work_group_size)) + + def test_query(self): + """ + tests that all devices are working properly ... lengthy and error prone + """ + for what in ("COMPILE_WORK_GROUP_SIZE", + "LOCAL_MEM_SIZE", + "PREFERRED_WORK_GROUP_SIZE_MULTIPLE", + "PRIVATE_MEM_SIZE", + "WORK_GROUP_SIZE"): + logger.info("%s: %s", what, query_kernel_info(program=self.program, kernel="addition", what=what)) + + # Not all ICD work properly .... 
+ #self.assertEqual(3, len(query_kernel_info(program=self.program, kernel="addition", what="COMPILE_WORK_GROUP_SIZE")), "3D kernel") + + min_wg = query_kernel_info(program=self.program, kernel="addition", what="PREFERRED_WORK_GROUP_SIZE_MULTIPLE") + max_wg = query_kernel_info(program=self.program, kernel="addition", what="WORK_GROUP_SIZE") + self.assertEqual(max_wg % min_wg, 0, msg="max_wg is a multiple of min_wg") diff --git a/src/silx/opencl/test/test_array_utils.py b/src/silx/opencl/test/test_array_utils.py new file mode 100644 index 0000000..325a6c3 --- /dev/null +++ b/src/silx/opencl/test/test_array_utils.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python +# coding: utf-8 +# /*########################################################################## +# +# Copyright (c) 2016 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+# +# ###########################################################################*/ +"""Test of the OpenCL array_utils""" + +from __future__ import division, print_function + +__authors__ = ["Pierre paleo"] +__license__ = "MIT" +__copyright__ = "2013-2017 European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "14/06/2017" + + +import time +import logging +import numpy as np +import unittest +try: + import mako +except ImportError: + mako = None +from ..common import ocl +if ocl: + import pyopencl as cl + import pyopencl.array as parray + from .. import linalg +from ..utils import get_opencl_code +from silx.test.utils import utilstest + +logger = logging.getLogger(__name__) +try: + from scipy.ndimage.filters import laplace + _has_scipy = True +except ImportError: + _has_scipy = False + + + +@unittest.skipUnless(ocl and mako, "PyOpenCl is missing") +class TestCpy2d(unittest.TestCase): + + def setUp(self): + if ocl is None: + return + self.ctx = ocl.create_context() + if logger.getEffectiveLevel() <= logging.INFO: + self.PROFILE = True + self.queue = cl.CommandQueue( + self.ctx, + properties=cl.command_queue_properties.PROFILING_ENABLE) + else: + self.PROFILE = False + self.queue = cl.CommandQueue(self.ctx) + self.allocate_arrays() + self.program = cl.Program(self.ctx, get_opencl_code("array_utils")).build() + + def allocate_arrays(self): + """ + Allocate various types of arrays for the tests + """ + self.prng_state = np.random.get_state() + # Generate arrays of random shape + self.shape1 = np.random.randint(20, high=512, size=(2,)) + self.shape2 = np.random.randint(20, high=512, size=(2,)) + self.array1 = np.random.rand(*self.shape1).astype(np.float32) + self.array2 = np.random.rand(*self.shape2).astype(np.float32) + self.d_array1 = parray.to_device(self.queue, self.array1) + self.d_array2 = parray.to_device(self.queue, self.array2) + # Generate random offsets + offset1_y = np.random.randint(2, high=min(self.shape1[0], self.shape2[0]) - 10) + 
offset1_x = np.random.randint(2, high=min(self.shape1[1], self.shape2[1]) - 10) + offset2_y = np.random.randint(2, high=min(self.shape1[0], self.shape2[0]) - 10) + offset2_x = np.random.randint(2, high=min(self.shape1[1], self.shape2[1]) - 10) + self.offset1 = (offset1_y, offset1_x) + self.offset2 = (offset2_y, offset2_x) + # Compute the size of the rectangle to transfer + size_y = np.random.randint(2, high=min(self.shape1[0], self.shape2[0]) - max(offset1_y, offset2_y) + 1) + size_x = np.random.randint(2, high=min(self.shape1[1], self.shape2[1]) - max(offset1_x, offset2_x) + 1) + self.transfer_shape = (size_y, size_x) + + def tearDown(self): + self.array1 = None + self.array2 = None + self.d_array1.data.release() + self.d_array2.data.release() + self.d_array1 = None + self.d_array2 = None + self.ctx = None + self.queue = None + + def compare(self, result, reference): + errmax = np.max(np.abs(result - reference)) + logger.info("Max error = %e" % (errmax)) + self.assertTrue(errmax == 0, str("Max error is too high"))#. 
PRNG state was %s" % str(self.prng_state))) + + @unittest.skipUnless(ocl and mako, "pyopencl is missing") + def test_cpy2d(self): + """ + Test rectangular transfer of self.d_array1 to self.d_array2 + """ + # Reference + o1 = self.offset1 + o2 = self.offset2 + T = self.transfer_shape + logger.info("""Testing D->D rectangular copy with (N1_y, N1_x) = %s, + (N2_y, N2_x) = %s: + array2[%d:%d, %d:%d] = array1[%d:%d, %d:%d]""" % + ( + str(self.shape1), str(self.shape2), + o2[0], o2[0] + T[0], + o2[1], o2[1] + T[1], + o1[0], o1[0] + T[0], + o1[1], o1[1] + T[1] + ) + ) + self.array2[o2[0]:o2[0] + T[0], o2[1]:o2[1] + T[1]] = self.array1[o1[0]:o1[0] + T[0], o1[1]:o1[1] + T[1]] + kernel_args = ( + self.d_array2.data, + self.d_array1.data, + np.int32(self.shape2[1]), + np.int32(self.shape1[1]), + np.int32(self.offset2[::-1]), + np.int32(self.offset1[::-1]), + np.int32(self.transfer_shape[::-1]) + ) + wg = None + ndrange = self.transfer_shape[::-1] + self.program.cpy2d(self.queue, ndrange, wg, *kernel_args) + res = self.d_array2.get() + self.compare(res, self.array2) diff --git a/src/silx/opencl/test/test_backprojection.py b/src/silx/opencl/test/test_backprojection.py new file mode 100644 index 0000000..96d56fa --- /dev/null +++ b/src/silx/opencl/test/test_backprojection.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python +# coding: utf-8 +# /*########################################################################## +# +# Copyright (c) 2016 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice 
shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# ###########################################################################*/ +"""Test of the filtered backprojection module""" + +from __future__ import division, print_function + +__authors__ = ["Pierre paleo"] +__license__ = "MIT" +__copyright__ = "2013-2017 European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "19/01/2018" + + +import time +import logging +import numpy as np +import unittest +from math import pi +try: + import mako +except ImportError: + mako = None +from ..common import ocl +if ocl: + from .. import backprojection + from ...image.tomography import compute_fourier_filter +from silx.test.utils import utilstest + +logger = logging.getLogger(__name__) + + +def generate_coords(img_shp, center=None): + """ + Return two 2D arrays containing the indexes of an image. + The zero is at the center of the image. + """ + l_r, l_c = float(img_shp[0]), float(img_shp[1]) + R, C = np.mgrid[:l_r, :l_c] + if center is None: + center0, center1 = l_r / 2., l_c / 2. + else: + center0, center1 = center + R = R + 0.5 - center0 + C = C + 0.5 - center1 + return R, C + + +def clip_circle(img, center=None, radius=None): + """ + Puts zeros outside the inscribed circle of the image support. + """ + R, C = generate_coords(img.shape, center) + M = R * R + C * C + res = np.zeros_like(img) + if radius is None: + radius = img.shape[0] / 2. 
- 1 + mask = M < radius * radius + res[mask] = img[mask] + return res + + +@unittest.skipUnless(ocl and mako, "PyOpenCl is missing") +class TestFBP(unittest.TestCase): + + def setUp(self): + if ocl is None: + return + self.getfiles() + self.fbp = backprojection.Backprojection(self.sino.shape, profile=True) + if self.fbp.compiletime_workgroup_size < 16 * 16: + self.skipTest("Current implementation of OpenCL backprojection is " + "not supported on this platform yet") + # Astra does not use the same backprojector implementation. + # Therefore, we cannot expect results to be the "same" (up to float32 + # numerical error) + self.tol = 5e-2 + if not(self.fbp._use_textures) or self.fbp.device.type == "CPU": + # Precision is less when using CPU + # (either CPU textures or "manual" linear interpolation) + self.tol *= 2 + + def tearDown(self): + self.sino = None + # self.fbp.log_profile() + self.fbp = None + + def getfiles(self): + # load sinogram of 512x512 MRI phantom + self.sino = np.load(utilstest.getfile("sino500.npz"))["data"] + # load reconstruction made with ASTRA FBP (with filter designed in spatial domain) + self.reference_rec = np.load(utilstest.getfile("rec_astra_500.npz"))["data"] + + def measure(self): + "Common measurement of timings" + t1 = time.time() + try: + result = self.fbp.filtered_backprojection(self.sino) + except RuntimeError as msg: + logger.error(msg) + return + t2 = time.time() + return t2 - t1, result + + def compare(self, res): + """ + Compare a result with the reference reconstruction. 
+ Only the valid reconstruction zone (inscribed circle) is taken into + account + """ + res_clipped = clip_circle(res) + ref_clipped = clip_circle(self.reference_rec) + delta = abs(res_clipped - ref_clipped) + bad = delta > 1 + logger.debug("Absolute difference: %s with %s outlier pixels out of %s" + "", delta.max(), bad.sum(), np.prod(bad.shape)) + return delta.max() + + @unittest.skipUnless(ocl and mako, "pyopencl is missing") + def test_fbp(self): + """ + tests FBP + """ + # Test single reconstruction + # -------------------------- + t, res = self.measure() + if t is None: + logger.info("test_fp: skipped") + else: + logger.info("test_backproj: time = %.3fs" % t) + err = self.compare(res) + msg = str("Max error = %e" % err) + logger.info(msg) + self.assertTrue(err < self.tol, "Max error is too high") + + # Test multiple reconstructions + # ----------------------------- + res0 = np.copy(res) + for i in range(10): + res = self.fbp.filtered_backprojection(self.sino) + errmax = np.max(np.abs(res - res0)) + self.assertTrue(errmax < 1.e-6, "Max error is too high") + + @unittest.skipUnless(ocl and mako, "pyopencl is missing") + def test_fbp_filters(self): + """ + Test the different available filters of silx FBP. + """ + avail_filters = [ + "ramlak", "shepp-logan", "cosine", "hamming", + "hann" + ] + # Create a Dirac delta function at a single angle view. + # As the filters are radially invariant: + # - backprojection yields an image where each line is a Dirac. + # - FBP yields an image where each line is the spatial filter + # One can simply filter "dirac" without backprojecting it, but this + # test will also ensure that backprojection behaves well.
+ dirac = np.zeros_like(self.sino) + na, dw = dirac.shape + dirac[0, dw//2] = na / pi * 2 + + for filter_name in avail_filters: + B = backprojection.Backprojection(dirac.shape, filter_name=filter_name) + r = B(dirac) + # Check that radial invariance is kept + std0 = np.max(np.abs(np.std(r, axis=0))) + self.assertTrue( + std0 < 5.e-6, + "Something wrong with FBP(filter=%s)" % filter_name + ) + # Check that the filter is retrieved + r_f = np.fft.fft(np.fft.fftshift(r[0])).real / 2. # filter factor + ref_filter_f = compute_fourier_filter(dw, filter_name) + errmax = np.max(np.abs(r_f - ref_filter_f)) + logger.info("FBP filter %s: max error=%e" % (filter_name, errmax)) + self.assertTrue( + errmax < 1.e-3, + "Something wrong with FBP(filter=%s)" % filter_name + ) + + @unittest.skipUnless(ocl and mako, "pyopencl is missing") + def test_fbp_oddsize(self): + # Generate a 513-sinogram. + # The padded width will be nextpow(513*2). + # silx [0.10, 0.10.1] will give 1029, which makes R2C transform fail. + sino = np.pad(self.sino, ((0, 0), (1, 0)), mode='edge') + B = backprojection.Backprojection(sino.shape, axis_position=self.fbp.axis_pos+1) + res = B(sino) + # Compare with self.reference_rec. Tolerance is high as backprojector + # is not fully shift-invariant. 
+ errmax = np.max(np.abs(clip_circle(res[1:, 1:] - self.reference_rec))) + self.assertLess( + errmax, 1.e-1, + "Something wrong with FBP on odd-sized sinogram" + ) diff --git a/src/silx/opencl/test/test_convolution.py b/src/silx/opencl/test/test_convolution.py new file mode 100644 index 0000000..6a2759d --- /dev/null +++ b/src/silx/opencl/test/test_convolution.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python +# coding: utf-8 +# /*########################################################################## +# +# Copyright (c) 2019 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# ###########################################################################*/ + +""" +Test of the Convolution class. 
+""" + +from __future__ import division, print_function + +__authors__ = ["Pierre Paleo"] +__contact__ = "pierre.paleo@esrf.fr" +__license__ = "MIT" +__copyright__ = "2019 European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "01/08/2019" + +import pytest +import logging +from itertools import product +import numpy as np +from silx.image.utils import gaussian_kernel + +try: + from scipy.ndimage import convolve, convolve1d + from scipy.misc import ascent + + scipy_convolve = convolve + scipy_convolve1d = convolve1d +except ImportError: + scipy_convolve = None +import unittest +from ..common import ocl, check_textures_availability + +if ocl: + import pyopencl as cl + import pyopencl.array as parray + from silx.opencl.convolution import Convolution +logger = logging.getLogger(__name__) + + +class ConvolutionData: + + def __init__(self, param): + self.param = param + self.mode = param["boundary_handling"] + logger.debug( + """ + Testing convolution with boundary_handling=%s, + use_textures=%s, input_device=%s, output_device=%s + """ + % ( + self.mode, + param["use_textures"], + param["input_on_device"], + param["output_on_device"], + ) + ) + + @classmethod + def setUpClass(cls): + cls.image = np.ascontiguousarray(ascent()[:, :511], dtype="f") + cls.data1d = cls.image[0] + cls.data2d = cls.image + cls.data3d = np.tile(cls.image[224:-224, 224:-224], (62, 1, 1)) + cls.kernel1d = gaussian_kernel(1.0) + cls.kernel2d = np.outer(cls.kernel1d, cls.kernel1d) + cls.kernel3d = np.multiply.outer(cls.kernel2d, cls.kernel1d) + cls.ctx = ocl.create_context() + cls.tol = { + "1D": 1e-4, + "2D": 1e-3, + "3D": 1e-3, + } + + @classmethod + def tearDownClass(cls): + cls.data1d = cls.data2d = cls.data3d = cls.image = None + cls.kernel1d = cls.kernel2d = cls.kernel3d = None + + @staticmethod + def compare(arr1, arr2): + return np.max(np.abs(arr1 - arr2)) + + @staticmethod + def print_err(conv): + errmsg = str( + """ + Something wrong with %s + mode=%s, texture=%s + """ + % 
(conv.use_case_desc, conv.mode, conv.use_textures) + ) + return errmsg + + def instantiate_convol(self, shape, kernel, axes=None): + if self.mode == "constant": + if not (self.param["use_textures"]) or ( + self.param["use_textures"] + and not (check_textures_availability(self.ctx)) + ): + pytest.skip("mode=constant not implemented without textures") + C = Convolution( + shape, + kernel, + mode=self.mode, + ctx=self.ctx, + axes=axes, + extra_options={"dont_use_textures": not (self.param["use_textures"])}, + ) + return C + + def get_data_and_kernel(self, test_name): + dims = { + "test_1D": (1, 1), + "test_separable_2D": (2, 1), + "test_separable_3D": (3, 1), + "test_nonseparable_2D": (2, 2), + "test_nonseparable_3D": (3, 3), + } + dim_data = {1: self.data1d, 2: self.data2d, 3: self.data3d} + dim_kernel = { + 1: self.kernel1d, + 2: self.kernel2d, + 3: self.kernel3d, + } + dd, kd = dims[test_name] + return dim_data[dd], dim_kernel[kd] + + def get_reference_function(self, test_name): + ref_func = { + "test_1D": lambda x, y: scipy_convolve1d(x, y, mode=self.mode), + "test_separable_2D": lambda x, y: scipy_convolve1d( + scipy_convolve1d(x, y, mode=self.mode, axis=1), + y, + mode=self.mode, + axis=0, + ), + "test_separable_3D": lambda x, y: scipy_convolve1d( + scipy_convolve1d( + scipy_convolve1d(x, y, mode=self.mode, axis=2), + y, + mode=self.mode, + axis=1, + ), + y, + mode=self.mode, + axis=0, + ), + "test_nonseparable_2D": lambda x, y: scipy_convolve(x, y, mode=self.mode), + "test_nonseparable_3D": lambda x, y: scipy_convolve(x, y, mode=self.mode), + } + return ref_func[test_name] + + def template_test(self, test_name): + data, kernel = self.get_data_and_kernel(test_name) + conv = self.instantiate_convol(data.shape, kernel) + if self.param["input_on_device"]: + data_ref = parray.to_device(conv.queue, data) + else: + data_ref = data + if self.param["output_on_device"]: + d_res = parray.empty_like(conv.data_out) + d_res.fill(0) + res = d_res + else: + res = None + res = 
conv(data_ref, output=res) + if self.param["output_on_device"]: + res = res.get() + ref_func = self.get_reference_function(test_name) + ref = ref_func(data, kernel) + metric = self.compare(res, ref) + logger.info("%s: max error = %.2e" % (test_name, metric)) + tol = self.tol[str("%dD" % kernel.ndim)] + assert metric < tol, self.print_err(conv) + + +def convolution_data_params(): + boundary_handlings = ["reflect", "nearest", "wrap", "constant"] + use_textures = [True, False] + input_on_devices = [True, False] + output_on_devices = [True, False] + param_vals = list( + product(boundary_handlings, use_textures, input_on_devices, output_on_devices) + ) + params = [] + for boundary_handling, use_texture, input_dev, output_dev in param_vals: + param={ + "boundary_handling": boundary_handling, + "input_on_device": input_dev, + "output_on_device": output_dev, + "use_textures": use_texture, + } + params.append(param) + + return params + + +@pytest.fixture(scope="module", params=convolution_data_params()) +def convolution_data(request): + """Provide a set of convolution data + + The module scope allows to test each function during a single setup of each + convolution data + """ + cdata = None + try: + cdata = ConvolutionData(request.param) + cdata.setUpClass() + yield cdata + finally: + cdata.tearDownClass() + + +@pytest.mark.skipif(ocl is None, reason="OpenCL is missing") +@pytest.mark.skipif(scipy_convolve is None, reason="scipy is missing") +def test_1D(convolution_data): + convolution_data.template_test("test_1D") + +@pytest.mark.skipif(ocl is None, reason="OpenCL is missing") +@pytest.mark.skipif(scipy_convolve is None, reason="scipy is missing") +def test_separable_2D(convolution_data): + convolution_data.template_test("test_separable_2D") + +@pytest.mark.skipif(ocl is None, reason="OpenCL is missing") +@pytest.mark.skipif(scipy_convolve is None, reason="scipy is missing") +def test_separable_3D(convolution_data): + convolution_data.template_test("test_separable_3D") + 
+@pytest.mark.skipif(ocl is None, reason="OpenCL is missing") +@pytest.mark.skipif(scipy_convolve is None, reason="scipy is missing") +def test_nonseparable_2D(convolution_data): + convolution_data.template_test("test_nonseparable_2D") + +@pytest.mark.skipif(ocl is None, reason="OpenCL is missing") +@pytest.mark.skipif(scipy_convolve is None, reason="scipy is missing") +def test_nonseparable_3D(convolution_data): + convolution_data.template_test("test_nonseparable_3D") + +@pytest.mark.skipif(ocl is None, reason="OpenCL is missing") +@pytest.mark.skipif(scipy_convolve is None, reason="scipy is missing") +def test_batched_2D(convolution_data): + """ + Test batched (nonseparable) 2D convolution on 3D data. + In this test: batch along "z" (axis 0) + """ + data = convolution_data.data3d + kernel = convolution_data.kernel2d + conv = convolution_data.instantiate_convol(data.shape, kernel, axes=(0,)) + res = conv(data) # 3D + ref = scipy_convolve(data[0], kernel, mode=convolution_data.mode) # 2D + + std = np.std(res, axis=0) + std_max = np.max(np.abs(std)) + assert std_max < convolution_data.tol["2D"], convolution_data.print_err(conv) + metric = convolution_data.compare(res[0], ref) + logger.info("test_nonseparable_3D: max error = %.2e" % metric) + assert metric < convolution_data.tol["2D"], convolution_data.print_err(conv) diff --git a/src/silx/opencl/test/test_doubleword.py b/src/silx/opencl/test/test_doubleword.py new file mode 100644 index 0000000..a33cf5a --- /dev/null +++ b/src/silx/opencl/test/test_doubleword.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python +# coding: utf-8 +# +# Project: The silx project +# https://github.com/silx-kit/silx +# +# Copyright (C) 2021-2021 European Synchrotron Radiation Facility, Grenoble, France +# +# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software 
# without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

"test suite for OpenCL code"

__author__ = "Jérôme Kieffer"
__contact__ = "Jerome.Kieffer@ESRF.eu"
__license__ = "MIT"
__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
__date__ = "31/05/2021"

import unittest
import numpy
import logging
import platform

logger = logging.getLogger(__name__)
try:
    import pyopencl
except ImportError as error:
    logger.warning("OpenCL module (pyopencl) is not present, skip tests. %s.", error)
    pyopencl = None

from .. import ocl
if ocl is not None:
    from ..utils import read_cl_file
    from .. import pyopencl
    import pyopencl.array
    from pyopencl.elementwise import ElementwiseKernel

# Machine epsilon of single and double precision, used as error thresholds below
EPS32 = numpy.finfo("float32").eps
EPS64 = numpy.finfo("float64").eps


@unittest.skipUnless(ocl, "PyOpenCl is missing")
class TestDoubleWord(unittest.TestCase):
    """
    Test the kernels for compensated math in OpenCL

    Each double-word value is represented as a (high, low) pair of float32
    whose exact sum approximates a float64.  The tests check that the high
    part alone matches plain float32 arithmetic while high+low is (nearly)
    exact compared with a float64 reference.
    """

    @classmethod
    def setUpClass(cls):
        if pyopencl is None or ocl is None:
            raise unittest.SkipTest("OpenCL module (pyopencl) is not present or no device available")

        cls.ctx = ocl.create_context(devicetype="GPU")
        cls.queue = pyopencl.CommandQueue(cls.ctx, properties=pyopencl.command_queue_properties.PROFILING_ENABLE)

        # this is running 32 bits OpenCL with POCL
        if (platform.machine() in ("i386", "i686", "x86_64") and (tuple.__itemsize__ == 4) and
            cls.ctx.devices[0].platform.name == 'Portable Computing Language'):
            cls.args = "-DX87_VOLATILE=volatile"
        else:
            cls.args = ""
        size = 1024
        # Random operands in [1, 2): cls.ah/cls.bh are the float32 "high"
        # parts, cls.al/cls.bl the residual "low" parts (a - ah, b - bh).
        cls.a = 1.0 + numpy.random.random(size)
        cls.b = 1.0 + numpy.random.random(size)
        cls.ah = cls.a.astype(numpy.float32)
        cls.bh = cls.b.astype(numpy.float32)
        cls.al = (cls.a - cls.ah).astype(numpy.float32)
        cls.bl = (cls.b - cls.bh).astype(numpy.float32)
        # OpenCL source with the double-word arithmetic primitives under test
        cls.doubleword = read_cl_file("doubleword.cl")

    @classmethod
    def tearDownClass(cls):
        cls.queue = None
        cls.ctx = None
        cls.a = cls.al = cls.ah = None
        cls.b = cls.bl = cls.bh = None
        cls.doubleword = None

    def test_fast_sum2(self):
        """fast_fp_plus_fp (branch-free 2Sum): high+low must be exact."""
        test_kernel = ElementwiseKernel(self.ctx,
                                        "float *a, float *b, float *res_h, float *res_l",
                                        "float2 tmp = fast_fp_plus_fp(a[i], b[i]); res_h[i] = tmp.s0; res_l[i] = tmp.s1",
                                        preamble=self.doubleword)
        a_g = pyopencl.array.to_device(self.queue, self.ah)
        b_g = pyopencl.array.to_device(self.queue, self.bl)
        res_l = pyopencl.array.empty_like(a_g)
        res_h = pyopencl.array.empty_like(a_g)
        test_kernel(a_g, b_g, res_h, res_l)
        self.assertEqual(abs(self.ah + self.bl - res_h.get()).max(), 0, "Major matches")
        self.assertGreater(abs(self.ah.astype(numpy.float64) + self.bl - res_h.get()).max(), 0, "Exact mismatches")
        self.assertEqual(abs(self.ah.astype(numpy.float64) + self.bl - (res_h.get().astype(numpy.float64) + res_l.get())).max(), 0, "Exact matches")

    def test_sum2(self):
        """fp_plus_fp (general 2Sum): high+low must be exact."""
        test_kernel = ElementwiseKernel(self.ctx,
                                        "float *a, float *b, float *res_h, float *res_l",
                                        "float2 tmp = fp_plus_fp(a[i],b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
                                        preamble=self.doubleword)
        a_g = pyopencl.array.to_device(self.queue, self.ah)
        b_g = pyopencl.array.to_device(self.queue, self.bh)
        res_l = pyopencl.array.empty_like(a_g)
        res_h = pyopencl.array.empty_like(a_g)
        test_kernel(a_g, b_g, res_h, res_l)
        self.assertEqual(abs(self.ah + self.bh - res_h.get()).max(), 0, "Major matches")
        self.assertGreater(abs(self.ah.astype(numpy.float64) + self.bh - res_h.get()).max(), 0, "Exact mismatches")
        self.assertEqual(abs(self.ah.astype(numpy.float64) + self.bh - (res_h.get().astype(numpy.float64) + res_l.get())).max(), 0, "Exact matches")

    def test_prod2(self):
        """fp_times_fp (2Prod): high+low must equal the exact product."""
        test_kernel = ElementwiseKernel(self.ctx,
                                        "float *a, float *b, float *res_h, float *res_l",
                                        "float2 tmp = fp_times_fp(a[i],b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
                                        preamble=self.doubleword)
        a_g = pyopencl.array.to_device(self.queue, self.ah)
        b_g = pyopencl.array.to_device(self.queue, self.bh)
        res_l = pyopencl.array.empty_like(a_g)
        res_h = pyopencl.array.empty_like(a_g)
        test_kernel(a_g, b_g, res_h, res_l)
        res_m = res_h.get()
        res = res_h.get().astype(numpy.float64) + res_l.get()
        self.assertEqual(abs(self.ah * self.bh - res_m).max(), 0, "Major matches")
        self.assertGreater(abs(self.ah.astype(numpy.float64) * self.bh - res_m).max(), 0, "Exact mismatches")
        self.assertEqual(abs(self.ah.astype(numpy.float64) * self.bh - res).max(), 0, "Exact matches")

    def test_dw_plus_fp(self):
        """double-word + float32: error bounded by 2*EPS32^2."""
        test_kernel = ElementwiseKernel(self.ctx,
                                        "float *ah, float *al, float *b, float *res_h, float *res_l",
                                        "float2 tmp = dw_plus_fp((float2)(ah[i], al[i]),b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
                                        preamble=self.doubleword)
        ah_g = pyopencl.array.to_device(self.queue, self.ah)
        al_g = pyopencl.array.to_device(self.queue, self.al)
        b_g = pyopencl.array.to_device(self.queue, self.bh)
        res_l = pyopencl.array.empty_like(b_g)
        res_h = pyopencl.array.empty_like(b_g)
        test_kernel(ah_g, al_g, b_g, res_h, res_l)
        res_m = res_h.get()
        res = res_h.get().astype(numpy.float64) + res_l.get()
        self.assertLess(abs(self.a + self.bh - res_m).max(), EPS32, "Major matches")
        self.assertGreater(abs(self.a + self.bh - res_m).max(), EPS64, "Exact mismatches")
        self.assertLess(abs(self.ah.astype(numpy.float64) + self.al + self.bh - res).max(), 2 * EPS32 ** 2, "Exact matches")

    def test_dw_plus_dw(self):
        """double-word + double-word: error bounded by 3*EPS32^2."""
        test_kernel = ElementwiseKernel(self.ctx,
                                        "float *ah, float *al, float *bh, float *bl, float *res_h, float *res_l",
                                        "float2 tmp = dw_plus_dw((float2)(ah[i], al[i]),(float2)(bh[i], bl[i])); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
                                        preamble=self.doubleword)
        ah_g = pyopencl.array.to_device(self.queue, self.ah)
        al_g = pyopencl.array.to_device(self.queue, self.al)
        bh_g = pyopencl.array.to_device(self.queue, self.bh)
        bl_g = pyopencl.array.to_device(self.queue, self.bl)
        res_l = pyopencl.array.empty_like(bh_g)
        res_h = pyopencl.array.empty_like(bh_g)
        test_kernel(ah_g, al_g, bh_g, bl_g, res_h, res_l)
        res_m = res_h.get()
        res = res_h.get().astype(numpy.float64) + res_l.get()
        self.assertLess(abs(self.a + self.b - res_m).max(), EPS32, "Major matches")
        self.assertGreater(abs(self.a + self.b - res_m).max(), EPS64, "Exact mismatches")
        self.assertLess(abs(self.a + self.b - res).max(), 3 * EPS32 ** 2, "Exact matches")

    def test_dw_times_fp(self):
        """double-word * float32: error bounded by 2*EPS32^2."""
        test_kernel = ElementwiseKernel(self.ctx,
                                        "float *ah, float *al, float *b, float *res_h, float *res_l",
                                        "float2 tmp = dw_times_fp((float2)(ah[i], al[i]),b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
                                        preamble=self.doubleword)
        ah_g = pyopencl.array.to_device(self.queue, self.ah)
        al_g = pyopencl.array.to_device(self.queue, self.al)
        b_g = pyopencl.array.to_device(self.queue, self.bh)
        res_l = pyopencl.array.empty_like(b_g)
        res_h = pyopencl.array.empty_like(b_g)
        test_kernel(ah_g, al_g, b_g, res_h, res_l)
        res_m = res_h.get()
        res = res_h.get().astype(numpy.float64) + res_l.get()
        self.assertLess(abs(self.a * self.bh - res_m).max(), EPS32, "Major matches")
        self.assertGreater(abs(self.a * self.bh - res_m).max(), EPS64, "Exact mismatches")
        self.assertLess(abs(self.a * self.bh - res).max(), 2 * EPS32 ** 2, "Exact matches")

    def test_dw_times_dw(self):
        """double-word * double-word: error bounded by 5*EPS32^2."""
        test_kernel = ElementwiseKernel(self.ctx,
                                        "float *ah, float *al, float *bh, float *bl, float *res_h, float *res_l",
                                        "float2 tmp = dw_times_dw((float2)(ah[i], al[i]),(float2)(bh[i], bl[i])); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
                                        preamble=self.doubleword)
        ah_g = pyopencl.array.to_device(self.queue, self.ah)
        al_g = pyopencl.array.to_device(self.queue, self.al)
        bh_g = pyopencl.array.to_device(self.queue, self.bh)
        bl_g = pyopencl.array.to_device(self.queue, self.bl)
        res_l = pyopencl.array.empty_like(bh_g)
        res_h = pyopencl.array.empty_like(bh_g)
        test_kernel(ah_g, al_g, bh_g, bl_g, res_h, res_l)
        res_m = res_h.get()
        res = res_h.get().astype(numpy.float64) + res_l.get()
        self.assertLess(abs(self.a * self.b - res_m).max(), EPS32, "Major matches")
        self.assertGreater(abs(self.a * self.b - res_m).max(), EPS64, "Exact mismatches")
        self.assertLess(abs(self.a * self.b - res).max(), 5 * EPS32 ** 2, "Exact matches")

    def test_dw_div_fp(self):
        """double-word / float32: error bounded by 3*EPS32^2."""
        test_kernel = ElementwiseKernel(self.ctx,
                                        "float *ah, float *al, float *b, float *res_h, float *res_l",
                                        "float2 tmp = dw_div_fp((float2)(ah[i], al[i]),b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
                                        preamble=self.doubleword)
        ah_g = pyopencl.array.to_device(self.queue, self.ah)
        al_g = pyopencl.array.to_device(self.queue, self.al)
        b_g = pyopencl.array.to_device(self.queue, self.bh)
        res_l = pyopencl.array.empty_like(b_g)
        res_h = pyopencl.array.empty_like(b_g)
        test_kernel(ah_g, al_g, b_g, res_h, res_l)
        res_m = res_h.get()
        res = res_h.get().astype(numpy.float64) + res_l.get()
        self.assertLess(abs(self.a / self.bh - res_m).max(), EPS32, "Major matches")
        self.assertGreater(abs(self.a / self.bh - res_m).max(), EPS64, "Exact mismatches")
        self.assertLess(abs(self.a / self.bh - res).max(), 3 * EPS32 ** 2, "Exact matches")

    def test_dw_div_dw(self):
        """double-word / double-word: error bounded by 6*EPS32^2."""
        test_kernel = ElementwiseKernel(self.ctx,
                                        "float *ah, float *al, float *bh, float *bl, float *res_h, float *res_l",
                                        "float2 tmp = dw_div_dw((float2)(ah[i], al[i]),(float2)(bh[i], bl[i])); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
                                        preamble=self.doubleword)
        ah_g = pyopencl.array.to_device(self.queue, self.ah)
        al_g = pyopencl.array.to_device(self.queue, self.al)
        bh_g = pyopencl.array.to_device(self.queue, self.bh)
        bl_g = pyopencl.array.to_device(self.queue, self.bl)
        res_l = pyopencl.array.empty_like(bh_g)
        res_h = pyopencl.array.empty_like(bh_g)
        test_kernel(ah_g, al_g, bh_g, bl_g, res_h, res_l)
        res_m = res_h.get()
        res = res_h.get().astype(numpy.float64) + res_l.get()
        self.assertLess(abs(self.a / self.b - res_m).max(), EPS32, "Major matches")
        self.assertGreater(abs(self.a / self.b - res_m).max(), EPS64, "Exact mismatches")
        self.assertLess(abs(self.a / self.b - res).max(), 6 * EPS32 ** 2, "Exact matches")
diff --git a/src/silx/opencl/test/test_image.py b/src/silx/opencl/test/test_image.py new file mode 100644 index 0000000..73c771b --- /dev/null +++ b/src/silx/opencl/test/test_image.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Project: image manipulation in OpenCL +# https://github.com/silx-kit/silx +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +#
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.

"""
Simple test of image manipulation
"""

from __future__ import division, print_function

__authors__ = ["Jérôme Kieffer"]
__contact__ = "jerome.kieffer@esrf.eu"
__license__ = "MIT"
__copyright__ = "2017 European Synchrotron Radiation Facility, Grenoble, France"
__date__ = "13/02/2018"

import logging
import numpy

import unittest
from ..common import ocl, _measure_workgroup_size
if ocl:
    import pyopencl
    import pyopencl.array
from ...test.utils import utilstest
from ..image import ImageProcessing
logger = logging.getLogger(__name__)
try:
    from PIL import Image
except ImportError:
    Image = None


@unittest.skipUnless(ocl and Image, "PyOpenCl/Image is missing")
class TestImage(unittest.TestCase):
    """Tests of the ImageProcessing OpenCL helpers on a reference image."""

    @classmethod
    def setUpClass(cls):
        super(TestImage, cls).setUpClass()
        if ocl:
            cls.ctx = ocl.create_context()
            # Reference RGB test image fetched through the silx test-resource cache
            cls.lena = utilstest.getfile("lena.png")
            cls.data = numpy.asarray(Image.open(cls.lena))
            cls.ip = ImageProcessing(ctx=cls.ctx, template=cls.data, profile=True)

    @classmethod
    def tearDownClass(cls):
        super(TestImage, cls).tearDownClass()
        cls.ctx = None
        cls.lena = None
        cls.data = None
        # Dump the OpenCL profiling log when running verbosely
        if logger.level <= logging.INFO:
            logger.warning("\n".join(cls.ip.log_profile()))
        cls.ip = None

    def setUp(self):
        if ocl is None:
            return
        # Reload a fresh copy of the image for every test
        self.data = numpy.asarray(Image.open(self.lena))

    def tearDown(self):
        self.img = self.data = None

    @unittest.skipUnless(ocl, "pyopencl is missing")
    def test_cast(self):
        """
        tests the cast kernel
        """
        res = self.ip.to_float(self.data)
        self.assertEqual(res.shape, self.data.shape, "shape")
        self.assertEqual(res.dtype, numpy.float32, "dtype")
        # Integer pixel values are exactly representable in float32
        self.assertEqual(abs(res - self.data).max(), 0, "content")

    @unittest.skipUnless(ocl, "pyopencl is missing")
    def test_normalize(self):
        """
        tests that all devices are working properly ...
        """
        # NOTE(review): pyopencl.array.empty usually takes a queue (or
        # allocator) as first argument, not a context -- confirm this works
        # with the pyopencl version in use.
        tmp = pyopencl.array.empty(self.ip.ctx, self.data.shape, "float32")
        res = self.ip.to_float(self.data, out=tmp)
        res2 = self.ip.normalize(tmp, -100, 100, copy=False)
        # Reference: min->-100, max->+100 linear rescale computed with numpy
        norm = (self.data.astype(numpy.float32) - self.data.min()) / (self.data.max() - self.data.min())
        ref2 = 200 * norm - 100
        self.assertLess(abs(res2 - ref2).max(), 3e-5, "content")

    @unittest.skipUnless(ocl, "pyopencl is missing")
    def test_histogram(self):
        """
        Test on a greyscaled image ... of Lena :)
        """
        # ITU-R BT.709 luma weights to convert RGB to greyscale
        lena_bw = (0.2126 * self.data[:, :, 0] +
                   0.7152 * self.data[:, :, 1] +
                   0.0722 * self.data[:, :, 2]).astype("int32")
        ref = numpy.histogram(lena_bw, 255)
        ip = ImageProcessing(ctx=self.ctx, template=lena_bw, profile=True)
        res = ip.histogram(lena_bw, 255)
        ip.log_profile()
        delta = (ref[0] - res[0])
        deltap = (ref[1] - res[1])
        self.assertEqual(delta.sum(), 0, "errors are self-compensated")
        self.assertLessEqual(abs(delta).max(), 1, "errors are small")
        self.assertLessEqual(abs(deltap).max(), 3e-5, "errors on position are small: %s" % (abs(deltap).max()))
diff --git a/src/silx/opencl/test/test_kahan.py b/src/silx/opencl/test/test_kahan.py new file mode 100644 index 0000000..9e4a1e3 --- /dev/null +++ b/src/silx/opencl/test/test_kahan.py @@ -0,0 +1,254 @@ +#!/usr/bin/env python +# coding: utf-8 +# +# Project: OpenCL numerical library +# https://github.com/silx-kit/silx +# +# Copyright (C) 2015-2021 European Synchrotron Radiation Facility, Grenoble, France +# +# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

"test suite for OpenCL code"

__author__ = "Jérôme Kieffer"
__contact__ = "Jerome.Kieffer@ESRF.eu"
__license__ = "MIT"
__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
__date__ = "17/05/2021"


import unittest
import numpy
import logging
import platform

logger = logging.getLogger(__name__)
try:
    import pyopencl
except ImportError as error:
    logger.warning("OpenCL module (pyopencl) is not present, skip tests. %s.", error)
    pyopencl = None

from .. import ocl
if ocl is not None:
    from ..utils import read_cl_file
    from .. import pyopencl
    import pyopencl.array


class TestKahan(unittest.TestCase):
    """
    Test the kernels for compensated math in OpenCL

    The data sets are powers of two (1 << k) so that the float64 sum/dot
    differs from the naive float32 one, which the compensated (Kahan)
    kernels are expected to recover exactly.
    """

    @classmethod
    def setUpClass(cls):
        if pyopencl is None or ocl is None:
            raise unittest.SkipTest("OpenCL module (pyopencl) is not present or no device available")

        cls.ctx = ocl.create_context(devicetype="GPU")
        cls.queue = pyopencl.CommandQueue(cls.ctx, properties=pyopencl.command_queue_properties.PROFILING_ENABLE)

        # this is running 32 bits OpenCL with POCL
        if (platform.machine() in ("i386", "i686", "x86_64") and (tuple.__itemsize__ == 4) and
            cls.ctx.devices[0].platform.name == 'Portable Computing Language'):
            cls.args = "-DX87_VOLATILE=volatile"
        else:
            cls.args = ""

    @classmethod
    def tearDownClass(cls):
        cls.queue = None
        cls.ctx = None

    @staticmethod
    def dummy_sum(ary, dtype=None):
        "perform the actual sum in a dummy way "
        if dtype is None:
            dtype = ary.dtype.type
        sum_ = dtype(0)
        for i in ary:
            sum_ += i
        return sum_

    def test_kahan(self):
        """Compensated scalar summation must match the float64 reference."""
        # simple test
        N = 26
        data = (1 << (N - 1 - numpy.arange(N))).astype(numpy.float32)

        ref64 = numpy.sum(data, dtype=numpy.float64)
        ref32 = self.dummy_sum(data)
        if (ref64 == ref32):
            logger.warning("Kahan: invalid tests as float32 provides the same result as float64")
        # Dummy kernel to evaluate
        src = """
        kernel void summation(global float* data,
                                       int size,
                              global float* result)
        {
            float2 acc = (float2)(0.0f, 0.0f);
            for (int i=0; i<size; i++)
            {
                acc = kahan_sum(acc, data[i]);
            }
            result[0] = acc.s0;
            result[1] = acc.s1;
        }
        """
        prg = pyopencl.Program(self.ctx, read_cl_file("kahan.cl") + src).build(self.args)
        ones_d = pyopencl.array.to_device(self.queue, data)
        res_d = pyopencl.array.empty(self.queue, 2, numpy.float32)
        res_d.fill(0)
        evt = prg.summation(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
        evt.wait()
        # (value, compensation) pair: their float64 sum is the exact result
        res = res_d.get().sum(dtype=numpy.float64)
        self.assertEqual(ref64, res, "test_kahan")

    def test_dot16(self):
        """Compensated dot products for vector widths 16/8/4/3/2."""
        # simple test
        N = 16
        data = (1 << (N - 1 - numpy.arange(N))).astype(numpy.float32)

        ref64 = numpy.dot(data.astype(numpy.float64), data.astype(numpy.float64))
        ref32 = numpy.dot(data, data)
        if (ref64 == ref32):
            logger.warning("dot16: invalid tests as float32 provides the same result as float64")
        # Dummy kernel to evaluate
        src = """
        kernel void test_dot16(global float* data,
                                        int size,
                               global float* result)
        {
            float2 acc = (float2)(0.0f, 0.0f);
            float16 data16 = (float16) (data[0],data[1],data[2],data[3],data[4],
                                        data[5],data[6],data[7],data[8],data[9],
                                        data[10],data[11],data[12],data[13],data[14],data[15]);
            acc = comp_dot16(data16, data16);
            result[0] = acc.s0;
            result[1] = acc.s1;
        }

        kernel void test_dot8(global float* data,
                                       int size,
                              global float* result)
        {
            float2 acc = (float2)(0.0f, 0.0f);
            float8 data0 = (float8) (data[0],data[2],data[4],data[6],data[8],data[10],data[12],data[14]);
            float8 data1 = (float8) (data[1],data[3],data[5],data[7],data[9],data[11],data[13],data[15]);
            acc = comp_dot8(data0, data1);
            result[0] = acc.s0;
            result[1] = acc.s1;
        }

        kernel void test_dot4(global float* data,
                                       int size,
                              global float* result)
        {
            float2 acc = (float2)(0.0f, 0.0f);
            float4 data0 = (float4) (data[0],data[4],data[8],data[12]);
            float4 data1 = (float4) (data[3],data[7],data[11],data[15]);
            acc = comp_dot4(data0, data1);
            result[0] = acc.s0;
            result[1] = acc.s1;
        }

        kernel void test_dot3(global float* data,
                                       int size,
                              global float* result)
        {
            float2 acc = (float2)(0.0f, 0.0f);
            float3 data0 = (float3) (data[0],data[4],data[12]);
            float3 data1 = (float3) (data[3],data[11],data[15]);
            acc = comp_dot3(data0, data1);
            result[0] = acc.s0;
            result[1] = acc.s1;
        }

        kernel void test_dot2(global float* data,
                                       int size,
                              global float* result)
        {
            float2 acc = (float2)(0.0f, 0.0f);
            float2 data0 = (float2) (data[0],data[14]);
            float2 data1 = (float2) (data[1],data[15]);
            acc = comp_dot2(data0, data1);
            result[0] = acc.s0;
            result[1] = acc.s1;
        }

        """

        prg = pyopencl.Program(self.ctx, read_cl_file("kahan.cl") + src).build(self.args)
        ones_d = pyopencl.array.to_device(self.queue, data)
        res_d = pyopencl.array.empty(self.queue, 2, numpy.float32)
        res_d.fill(0)
        evt = prg.test_dot16(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
        evt.wait()
        res = res_d.get().sum(dtype="float64")
        self.assertEqual(ref64, res, "test_dot16")

        res_d.fill(0)
        # Operand pairs below mirror the element selections hard-coded in
        # the corresponding kernels above.
        data0 = data[0::2]
        data1 = data[1::2]
        ref64 = numpy.dot(data0.astype(numpy.float64), data1.astype(numpy.float64))
        ref32 = numpy.dot(data0, data1)
        if (ref64 == ref32):
            logger.warning("dot8: invalid tests as float32 provides the same result as float64")
        evt = prg.test_dot8(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
        evt.wait()
        res = res_d.get().sum(dtype="float64")
        self.assertEqual(ref64, res, "test_dot8")

        res_d.fill(0)
        data0 = data[0::4]
        data1 = data[3::4]
        ref64 = numpy.dot(data0.astype(numpy.float64), data1.astype(numpy.float64))
        ref32 = numpy.dot(data0, data1)
        if (ref64 == ref32):
            logger.warning("dot4: invalid tests as float32 provides the same result as float64")
        evt = prg.test_dot4(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
        evt.wait()
        res = res_d.get().sum(dtype="float64")
        self.assertEqual(ref64, res, "test_dot4")

        res_d.fill(0)
        data0 = numpy.array([data[0], data[4], data[12]])
        data1 = numpy.array([data[3], data[11], data[15]])
        ref64 = numpy.dot(data0.astype(numpy.float64), data1.astype(numpy.float64))
        ref32 = numpy.dot(data0, data1)
        if (ref64 == ref32):
            logger.warning("dot3: invalid tests as float32 provides the same result as float64")
        evt = prg.test_dot3(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
        evt.wait()
        res = res_d.get().sum(dtype="float64")
        self.assertEqual(ref64, res, "test_dot3")

        res_d.fill(0)
        data0 = numpy.array([data[0], data[14]])
        data1 = numpy.array([data[1], data[15]])
        ref64 = numpy.dot(data0.astype(numpy.float64), data1.astype(numpy.float64))
        ref32 = numpy.dot(data0, data1)
        if (ref64 == ref32):
            logger.warning("dot2: invalid tests as float32 provides the same result as float64")
        evt = prg.test_dot2(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
        evt.wait()
        res = res_d.get().sum(dtype="float64")
        self.assertEqual(ref64, res, "test_dot2")
diff --git a/src/silx/opencl/test/test_linalg.py b/src/silx/opencl/test/test_linalg.py new file mode 100644 index 0000000..a997a36 --- /dev/null +++ b/src/silx/opencl/test/test_linalg.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python +# coding: utf-8 +# /*########################################################################## +# +# Copyright (c) 2016 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# ###########################################################################*/
"""Test of the linalg module"""

from __future__ import division, print_function

__authors__ = ["Pierre paleo"]
__license__ = "MIT"
__copyright__ = "2013-2017 European Synchrotron Radiation Facility, Grenoble, France"
__date__ = "01/08/2019"


import time
import logging
import numpy as np
import unittest
try:
    import mako
except ImportError:
    mako = None
from ..common import ocl
if ocl:
    import pyopencl as cl
    import pyopencl.array as parray
    from .. import linalg
from silx.test.utils import utilstest

logger = logging.getLogger(__name__)
try:
    from scipy.ndimage.filters import laplace
    _has_scipy = True
except ImportError:
    _has_scipy = False


# TODO move this function in math or image ?
def gradient(img):
    '''
    Compute the gradient of an image as a numpy array
    Code from https://github.com/emmanuelle/tomo-tv/
    '''
    # Output has one leading axis per spatial dimension: gradient[d] is the
    # forward difference of img along axis d (last element of each axis left 0).
    shape = [img.ndim, ] + list(img.shape)
    gradient = np.zeros(shape, dtype=img.dtype)
    # slice_all is mutated each iteration: index 0 selects the output
    # component, the remaining slices position the (N-1)-length diff result.
    slice_all = [0, slice(None, -1), ]
    for d in range(img.ndim):
        gradient[tuple(slice_all)] = np.diff(img, axis=d)
        slice_all[0] = d + 1
        slice_all.insert(1, slice(None))
    return gradient


# TODO move this function in math or image ?
def divergence(grad):
    '''
    Compute the divergence of a gradient
    Code from https://github.com/emmanuelle/tomo-tv/
    '''
    # Adjoint of the forward-difference gradient above (backward differences
    # with boundary handling), accumulated per axis via rollaxis views.
    res = np.zeros(grad.shape[1:])
    for d in range(grad.shape[0]):
        this_grad = np.rollaxis(grad[d], d)
        this_res = np.rollaxis(res, d)
        this_res[:-1] += this_grad[:-1]
        this_res[1:-1] -= this_grad[:-2]
        this_res[-1] -= this_grad[-2]
    return res


@unittest.skipUnless(ocl and mako, "PyOpenCl is missing")
class TestLinAlg(unittest.TestCase):
    """Checks the OpenCL gradient/divergence against the numpy reference,
    for sources/destinations given as numpy arrays, pyopencl Buffers and
    pyopencl Arrays."""

    def setUp(self):
        if ocl is None:
            return
        self.getfiles()
        self.la = linalg.LinAlg(self.image.shape)
        self.allocate_arrays()

    def allocate_arrays(self):
        """
        Allocate various types of arrays for the tests
        """
        # numpy images
        # complex64 packs the 2 gradient components as (real, imag) pairs
        self.grad = np.zeros(self.image.shape, dtype=np.complex64)
        self.grad2 = np.zeros((2,) + self.image.shape, dtype=np.float32)
        self.grad_ref = gradient(self.image)
        self.div_ref = divergence(self.grad_ref)
        self.image2 = np.zeros_like(self.image)
        # Device images
        self.gradient_parray = parray.empty(self.la.queue, self.image.shape, np.complex64)
        self.gradient_parray.fill(0)
        # we should be using cl.Buffer(self.la.ctx, cl.mem_flags.READ_WRITE, size=self.image.nbytes*2),
        # but platforms not suporting openCL 1.2 have a problem with enqueue_fill_buffer,
        # so we use the parray "fill" utility
        self.gradient_buffer = self.gradient_parray.data
        # Do the same for image
        self.image_parray = parray.to_device(self.la.queue, self.image)
        self.image_buffer = self.image_parray.data
        # Refs
        tmp = np.zeros(self.image.shape, dtype=np.complex64)
        tmp.real = np.copy(self.grad_ref[0])
        tmp.imag = np.copy(self.grad_ref[1])
        self.grad_ref_parray = parray.to_device(self.la.queue, tmp)
        self.grad_ref_buffer = self.grad_ref_parray.data

    def tearDown(self):
        self.image = None
        self.image2 = None
        self.grad = None
        self.grad2 = None
        self.grad_ref = None
        self.div_ref = None
        # Explicitly release device memory before dropping the references
        self.gradient_parray.data.release()
        self.gradient_parray = None
        self.gradient_buffer = None
        self.image_parray.data.release()
        self.image_parray = None
        self.image_buffer = None
        self.grad_ref_parray.data.release()
        self.grad_ref_parray = None
        self.grad_ref_buffer = None

    def getfiles(self):
        # load 512x512 MRI phantom - TODO include Lena or ascent once a .npz is available
        self.image = np.load(utilstest.getfile("Brain512.npz"))["data"]

    def compare(self, result, reference, abstol, name):
        """Assert max absolute deviation between result and reference is below abstol."""
        errmax = np.max(np.abs(result - reference))
        logger.info("%s: Max error = %e" % (name, errmax))
        self.assertTrue(errmax < abstol, str("%s: Max error is too high" % name))

    @unittest.skipUnless(ocl and mako, "pyopencl is missing")
    def test_gradient(self):
        arrays = {
            "numpy.ndarray": self.image,
            "buffer": self.image_buffer,
            "parray": self.image_parray
        }
        for desc, image in arrays.items():
            # Test with dst on host (numpy.ndarray)
            res = self.la.gradient(image, return_to_host=True)
            self.compare(res, self.grad_ref, 1e-6, str("gradient[src=%s, dst=numpy.ndarray]" % desc))
            # Test with dst on device (pyopencl.Buffer)
            self.la.gradient(image, dst=self.gradient_buffer)
            cl.enqueue_copy(self.la.queue, self.grad, self.gradient_buffer)
            self.grad2[0] = self.grad.real
            self.grad2[1] = self.grad.imag
            self.compare(self.grad2, self.grad_ref, 1e-6, str("gradient[src=%s, dst=buffer]" % desc))
            # Test with dst on device (pyopencl.Array)
            self.la.gradient(image, dst=self.gradient_parray)
            self.grad = self.gradient_parray.get()
            self.grad2[0] = self.grad.real
            self.grad2[1] = self.grad.imag
            self.compare(self.grad2, self.grad_ref, 1e-6, str("gradient[src=%s, dst=parray]" % desc))

    @unittest.skipUnless(ocl and mako, "pyopencl is missing")
    def test_divergence(self):
        arrays = {
            "numpy.ndarray": self.grad_ref,
            "buffer": self.grad_ref_buffer,
            "parray": self.grad_ref_parray
        }
        for desc, grad in arrays.items():
            # Test with dst on host (numpy.ndarray)
            res = self.la.divergence(grad, return_to_host=True)
            self.compare(res, self.div_ref, 1e-6, str("divergence[src=%s, dst=numpy.ndarray]" % desc))
            # Test with dst on device (pyopencl.Buffer)
            self.la.divergence(grad, dst=self.image_buffer)
            cl.enqueue_copy(self.la.queue, self.image2, self.image_buffer)
            self.compare(self.image2, self.div_ref, 1e-6, str("divergence[src=%s, dst=buffer]" % desc))
            # Test with dst on device (pyopencl.Array)
            self.la.divergence(grad, dst=self.image_parray)
            self.image2 = self.image_parray.get()
            self.compare(self.image2, self.div_ref, 1e-6, str("divergence[src=%s, dst=parray]" % desc))

    @unittest.skipUnless(ocl and mako and _has_scipy, "pyopencl and/or scipy is missing")
    def test_laplacian(self):
        laplacian_ref = laplace(self.image)
        # Laplacian = div(grad)
        self.la.gradient(self.image)
        laplacian_ocl = self.la.divergence(self.la.d_gradient, return_to_host=True)
        self.compare(laplacian_ocl, laplacian_ref, 1e-6, "laplacian")
diff --git a/src/silx/opencl/test/test_medfilt.py b/src/silx/opencl/test/test_medfilt.py new file mode 100644 index 0000000..339e0f2 --- /dev/null +++ b/src/silx/opencl/test/test_medfilt.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Project: Median filter of images + OpenCL +# https://github.com/silx-kit/silx +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy,
modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +""" +Simple test of the median filter +""" + +from __future__ import division, print_function + +__authors__ = ["Jérôme Kieffer"] +__contact__ = "jerome.kieffer@esrf.eu" +__license__ = "MIT" +__copyright__ = "2013-2017 European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "05/07/2018" + + +import sys +import time +import logging +import numpy +import unittest +from collections import namedtuple +try: + import mako +except ImportError: + mako = None +from ..common import ocl +if ocl: + import pyopencl + import pyopencl.array + from .. 
import medfilt

logger = logging.getLogger(__name__)

# One benchmark/accuracy measurement: kernel size, max abs error vs the
# reference filter, and the scipy / OpenCL execution times in seconds.
Result = namedtuple("Result", ["size", "error", "sp_time", "oc_time"])

try:
    from scipy.misc import ascent
except ImportError:  # was a bare except: only guard against a missing scipy
    def ascent():
        """Dummy image from random data"""
        return numpy.random.random((512, 512))

try:
    from scipy.ndimage import filters
    median_filter = filters.median_filter
    HAS_SCIPY = True
except ImportError:  # was a bare except: only guard against a missing scipy
    HAS_SCIPY = False
    from silx.math import medfilt2d as median_filter


@unittest.skipUnless(ocl and mako, "PyOpenCl is missing")
class TestMedianFilter(unittest.TestCase):
    """Validate the OpenCL 2D median filter against a CPU reference."""

    def setUp(self):
        if ocl is None:
            return
        # 512x512 float32 test image (scipy's "ascent" or random fallback)
        self.data = ascent().astype(numpy.float32)
        self.medianfilter = medfilt.MedianFilter2D(self.data.shape, devicetype="gpu")

    def tearDown(self):
        self.data = None
        self.medianfilter = None

    def measure(self, size):
        """Common measurement of accuracy and timings.

        :param size: odd window width/height of the median filter
        :return: a Result namedtuple, or None when the OpenCL kernel
            failed at runtime (the failure is logged).
        """
        t0 = time.time()
        if HAS_SCIPY:
            ref = median_filter(self.data, size, mode="nearest")
        else:
            ref = median_filter(self.data, size)
        t1 = time.time()
        try:
            got = self.medianfilter.medfilt2d(self.data, size)
        except RuntimeError as msg:
            logger.error(msg)
            return
        t2 = time.time()
        delta = abs(got - ref).max()
        return Result(size, delta, t1 - t0, t2 - t1)

    @unittest.skipUnless(ocl and mako, "pyopencl is missing")
    def test_medfilt(self):
        """
        tests the median filter kernel
        """
        size = 11
        r = self.measure(size=size)
        if r is None:
            # BUG fixed: the original format string had a %s placeholder
            # but no argument was supplied for it.
            logger.info("test_medfilt: size: %s: skipped", size)
        else:
            # lazy %-style logging args instead of eager "%" formatting
            logger.info("test_medfilt: size: %s error %s, t_ref: %.3fs, t_ocl: %.3fs", *r)
            self.assertEqual(r.error, 0, 'Results are correct')

    def benchmark(self, limit=36):
        "Run some benchmarking"
        try:
            import PyQt5  # ensure a Qt binding is available for matplotlib
            from ...gui.matplotlib import pylab
            from ...gui.utils import update_fig
        except ImportError:  # was a bare except
            pylab = None

            def update_fig(*ag, **kwarg):
                pass

        if pylab is None:
            # BUG fixed: without this guard the original crashed below with
            # AttributeError on pylab.figure() when matplotlib/Qt is missing.
            logger.error("matplotlib with a Qt binding is required for benchmarking")
            return
        fig = pylab.figure()
        fig.suptitle("Median filter of an image 512x512")
        sp = fig.add_subplot(1, 1, 1)
sp.set_title(self.medianfilter.ctx.devices[0].name) + sp.set_xlabel("Window width & height") + sp.set_ylabel("Execution time (s)") + sp.set_xlim(2, limit + 1) + sp.set_ylim(0, 4) + data_size = [] + data_scipy = [] + data_opencl = [] + plot_sp = sp.plot(data_size, data_scipy, "-or", label="scipy")[0] + plot_opencl = sp.plot(data_size, data_opencl, "-ob", label="opencl")[0] + sp.legend(loc=2) + fig.show() + update_fig(fig) + for s in range(3, limit, 2): + r = self.measure(s) + print(r) + if r.error == 0: + data_size.append(s) + data_scipy.append(r.sp_time) + data_opencl.append(r.oc_time) + plot_sp.set_data(data_size, data_scipy) + plot_opencl.set_data(data_size, data_opencl) + update_fig(fig) + fig.show() + input() + + +def benchmark(): + testSuite = unittest.TestSuite() + testSuite.addTest(TestMedianFilter("benchmark")) + return testSuite diff --git a/src/silx/opencl/test/test_projection.py b/src/silx/opencl/test/test_projection.py new file mode 100644 index 0000000..13db5f4 --- /dev/null +++ b/src/silx/opencl/test/test_projection.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python +# coding: utf-8 +# /*########################################################################## +# +# Copyright (c) 2016 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# ###########################################################################*/ +"""Test of the forward projection module""" + +from __future__ import division, print_function + +__authors__ = ["Pierre paleo"] +__license__ = "MIT" +__copyright__ = "2013-2017 European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "19/01/2018" + + +import time +import logging +import numpy as np +import unittest +try: + import mako +except ImportError: + mako = None +from ..common import ocl +if ocl: + from .. 
import projection +from silx.test.utils import utilstest + +logger = logging.getLogger(__name__) + + +@unittest.skipUnless(ocl and mako, "PyOpenCl is missing") +class TestProj(unittest.TestCase): + + def setUp(self): + if ocl is None: + return + # ~ if sys.platform.startswith('darwin'): + # ~ self.skipTest("Projection is not implemented on CPU for OS X yet") + self.getfiles() + n_angles = self.sino.shape[0] + self.proj = projection.Projection(self.phantom.shape, n_angles) + if self.proj.compiletime_workgroup_size < 16 * 16: + self.skipTest("Current implementation of OpenCL projection is not supported on this platform yet") + + def tearDown(self): + self.phantom = None + self.sino = None + self.proj = None + + def getfiles(self): + # load 512x512 MRI phantom + self.phantom = np.load(utilstest.getfile("Brain512.npz"))["data"] + # load sinogram computed with PyHST + self.sino = np.load(utilstest.getfile("sino500_pyhst.npz"))["data"] + + def measure(self): + "Common measurement of timings" + t1 = time.time() + try: + result = self.proj.projection(self.phantom) + except RuntimeError as msg: + logger.error(msg) + return + t2 = time.time() + return t2 - t1, result + + def compare(self, res): + """ + Compare a result with the reference reconstruction. + Only the valid reconstruction zone (inscribed circle) is taken into account + """ + # Compare with the original phantom. 
+ # TODO: compare a standard projection + ref = self.sino + return np.max(np.abs(res - ref)) + + @unittest.skipUnless(ocl and mako, "pyopencl is missing") + def test_proj(self): + """ + tests Projection + """ + # Test single reconstruction + # -------------------------- + t, res = self.measure() + if t is None: + logger.info("test_proj: skipped") + else: + logger.info("test_proj: time = %.3fs" % t) + err = self.compare(res) + msg = str("Max error = %e" % err) + logger.info(msg) + # Interpolation differs at some lines, giving relative error of 10/50000 + self.assertTrue(err < 20., "Max error is too high") + # Test multiple reconstructions + # ----------------------------- + res0 = np.copy(res) + for i in range(10): + res = self.proj.projection(self.phantom) + errmax = np.max(np.abs(res - res0)) + self.assertTrue(errmax < 1.e-6, "Max error is too high") diff --git a/src/silx/opencl/test/test_sparse.py b/src/silx/opencl/test/test_sparse.py new file mode 100644 index 0000000..1d26b36 --- /dev/null +++ b/src/silx/opencl/test/test_sparse.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python +# coding: utf-8 +# /*########################################################################## +# +# Copyright (c) 2018-2019 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# ###########################################################################*/ +"""Test of the sparse module""" + +import numpy as np +import unittest +import logging +from itertools import product +from ..common import ocl +if ocl: + import pyopencl.array as parray + from silx.opencl.sparse import CSR +try: + import scipy.sparse as sp +except ImportError: + sp = None +logger = logging.getLogger(__name__) + + + +def generate_sparse_random_data( + shape=(1000,), + data_min=0, data_max=100, + density=0.1, + use_only_integers=True, + dtype="f"): + """ + Generate random sparse data where. + + Parameters + ------------ + shape: tuple + Output data shape. + data_min: int or float + Minimum value of data + data_max: int or float + Maximum value of data + density: float + Density of non-zero elements in the output data. + Low value of density mean low number of non-zero elements. + use_only_integers: bool + If set to True, the output data items will be primarily integers, + possibly casted to float if dtype is a floating-point type. + This can be used for ease of debugging. 
+ dtype: str or numpy.dtype + Output data type + """ + mask = np.random.binomial(1, density, size=shape) + if use_only_integers: + d = np.random.randint(data_min, high=data_max, size=shape) + else: + d = data_min + (data_max - data_min) * np.random.rand(*shape) + return (d * mask).astype(dtype) + + + +@unittest.skipUnless(ocl and sp, "PyOpenCl/scipy is missing") +class TestCSR(unittest.TestCase): + """Test CSR format""" + + def setUp(self): + # Test possible configurations + input_on_device = [False, True] + output_on_device = [False, True] + dtypes = [np.float32, np.int32, np.uint16] + self._test_configs = list(product(input_on_device, output_on_device, dtypes)) + + + def compute_ref_sparsification(self, array): + ref_sparse = sp.csr_matrix(array) + return ref_sparse + + + def test_sparsification(self): + for input_on_device, output_on_device, dtype in self._test_configs: + self._test_sparsification(input_on_device, output_on_device, dtype) + + + def _test_sparsification(self, input_on_device, output_on_device, dtype): + current_config = "input on device: %s, output on device: %s, dtype: %s" % ( + str(input_on_device), str(output_on_device), str(dtype) + ) + logger.debug("CSR: %s" % current_config) + # Generate data and reference CSR + array = generate_sparse_random_data(shape=(512, 511), dtype=dtype) + ref_sparse = self.compute_ref_sparsification(array) + # Sparsify on device + csr = CSR(array.shape, dtype=dtype) + if input_on_device: + # The array has to be flattened + arr = parray.to_device(csr.queue, array.ravel()) + else: + arr = array + if output_on_device: + d_data = parray.empty_like(csr.data) + d_indices = parray.empty_like(csr.indices) + d_indptr = parray.empty_like(csr.indptr) + d_data.fill(0) + d_indices.fill(0) + d_indptr.fill(0) + output = (d_data, d_indices, d_indptr) + else: + output = None + data, indices, indptr = csr.sparsify(arr, output=output) + if output_on_device: + data = data.get() + indices = indices.get() + indptr = indptr.get() + # 
Compare + nnz = ref_sparse.nnz + self.assertTrue( + np.allclose(data[:nnz], ref_sparse.data), + "something wrong with sparsified data (%s)" + % current_config + ) + self.assertTrue( + np.allclose(indices[:nnz], ref_sparse.indices), + "something wrong with sparsified indices (%s)" + % current_config + ) + self.assertTrue( + np.allclose(indptr, ref_sparse.indptr), + "something wrong with sparsified indices pointers (indptr) (%s)" + % current_config + ) + + + def test_desparsification(self): + for input_on_device, output_on_device, dtype in self._test_configs: + self._test_desparsification(input_on_device, output_on_device, dtype) + + + def _test_desparsification(self, input_on_device, output_on_device, dtype): + current_config = "input on device: %s, output on device: %s, dtype: %s" % ( + str(input_on_device), str(output_on_device), str(dtype) + ) + logger.debug("CSR: %s" % current_config) + # Generate data and reference CSR + array = generate_sparse_random_data(shape=(512, 511), dtype=dtype) + ref_sparse = self.compute_ref_sparsification(array) + # De-sparsify on device + csr = CSR(array.shape, dtype=dtype, max_nnz=ref_sparse.nnz) + if input_on_device: + data = parray.to_device(csr.queue, ref_sparse.data) + indices = parray.to_device(csr.queue, ref_sparse.indices) + indptr = parray.to_device(csr.queue, ref_sparse.indptr) + else: + data = ref_sparse.data + indices = ref_sparse.indices + indptr = ref_sparse.indptr + if output_on_device: + d_arr = parray.empty_like(csr.array) + d_arr.fill(0) + output = d_arr + else: + output = None + arr = csr.densify(data, indices, indptr, output=output) + if output_on_device: + arr = arr.get() + # Compare + self.assertTrue( + np.allclose(arr.reshape(array.shape), array), + "something wrong with densified data (%s)" + % current_config + ) diff --git a/src/silx/opencl/test/test_stats.py b/src/silx/opencl/test/test_stats.py new file mode 100644 index 0000000..859271d --- /dev/null +++ b/src/silx/opencl/test/test_stats.py @@ -0,0 +1,106 
@@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Project: Sift implementation in Python + OpenCL +# https://github.com/silx-kit/silx +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+ +""" +Simple test of an addition +""" +__authors__ = ["Henri Payno, Jérôme Kieffer"] +__contact__ = "jerome.kieffer@esrf.eu" +__license__ = "MIT" +__copyright__ = "2013 European Synchrotron Radiation Facility, Grenoble, France" +__date__ = "19/05/2021" + +import logging +import time +import numpy + +import unittest +from ..common import ocl +if ocl: + import pyopencl + import pyopencl.array + from ..statistics import StatResults, Statistics +from ..utils import get_opencl_code +logger = logging.getLogger(__name__) + + +@unittest.skipUnless(ocl, "PyOpenCl is missing") +class TestStatistics(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.size = 1 << 20 # 1 million elements + cls.data = numpy.random.randint(0, 65000, cls.size).astype("uint16") + fdata = cls.data.astype("float64") + t0 = time.perf_counter() + std = fdata.std() + cls.ref = StatResults(fdata.min(), fdata.max(), float(fdata.size), + fdata.sum(), fdata.mean(), std ** 2, + std) + t1 = time.perf_counter() + cls.ref_time = t1 - t0 + + @classmethod + def tearDownClass(cls): + cls.size = cls.ref = cls.data = cls.ref_time = None + + @classmethod + def validate(cls, res): + return ( + (res.min == cls.ref.min) and + (res.max == cls.ref.max) and + (res.cnt == cls.ref.cnt) and + abs(res.mean - cls.ref.mean) < 0.01 and + abs(res.std - cls.ref.std) < 0.1) + + def test_measurement(self): + """ + tests that all devices are working properly ... 
+ """ + logger.info("Reference results: %s", self.ref) + for pid, platform in enumerate(ocl.platforms): + for did, device in enumerate(platform.devices): + try: + s = Statistics(template=self.data, platformid=pid, deviceid=did) + except Exception as err: + failed_init = True + res = StatResults(0, 0, 0, 0, 0, 0, 0) + print(err) + else: + failed_init = False + for comp in ("single", "double", "comp"): + t0 = time.perf_counter() + res = s(self.data, comp=comp) + t1 = time.perf_counter() + logger.info("Runtime on %s/%s : %.3fms x%.1f", platform, device, 1000 * (t1 - t0), self.ref_time / (t1 - t0)) + + if failed_init or not self.validate(res): + logger.error("failed_init %s; Computation modes %s", failed_init, comp) + logger.error("Failed on platform %s device %s", platform, device) + logger.error("Reference results: %s", self.ref) + logger.error("Faulty results: %s", res) + self.assertTrue(False, f"Stat calculation failed on {platform},{device} in mode {comp}") diff --git a/src/silx/opencl/utils.py b/src/silx/opencl/utils.py new file mode 100644 index 0000000..575e018 --- /dev/null +++ b/src/silx/opencl/utils.py @@ -0,0 +1,214 @@ +# -*- coding: utf-8 -*- +# /*########################################################################## +# Copyright (C) 2017 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# ############################################################################*/
"""
Project: Sift implementation in Python + OpenCL
         https://github.com/silx-kit/silx

Small helpers shared by the OpenCL modules: workgroup-size arithmetic,
buffer-size computation and OpenCL kernel source loading.
"""
# NOTE: `from __future__ import division` was removed: this is a
# Python-3-only code base, where it is a no-op.

__authors__ = ["Jérôme Kieffer", "Pierre Paleo"]
__contact__ = "jerome.kieffer@esrf.eu"
__license__ = "MIT"
__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
__date__ = "06/09/2017"
__status__ = "Production"

import os
import numpy
from math import log, ceil

try:
    from .. import resources
except ImportError:
    # Allows this module to be imported standalone (outside the silx
    # package tree); get_cl_file() will raise a clear error instead.
    resources = None


def _round_up(size, block):
    """Round *size* up to the next multiple of *block*.

    Internal helper for :func:`calc_size`.
    """
    block = int(block)
    return (int(size) + block - 1) // block * block


def calc_size(shape, blocksize):
    """
    Calculate the optimal size for a kernel according to the workgroup size.

    Each dimension of *shape* is rounded up to the next multiple of the
    corresponding workgroup dimension.

    BUG fixed: the previous bit-twiddling formula ``(n + b - 1) & ~(b - 1)``
    is only correct when the blocksize is a power of two (e.g. shape 10 with
    block 6 yielded 10, which is not a multiple of 6). The ceil-division
    formula used here gives identical results for power-of-two blocks and
    correct results otherwise.

    :param shape: (sequence of) int, size of the data
    :param blocksize: int or sequence of int, workgroup size per dimension
    :return: tuple of int, smallest grid >= shape that is a multiple of blocksize
    """
    if "__len__" in dir(blocksize):
        return tuple(_round_up(i, j) for i, j in zip(shape, blocksize))
    else:
        return tuple(_round_up(i, blocksize) for i in shape)


def nextpower(n):
    """Calculate the next power of two >= n.

    Uses exact integer arithmetic (``int.bit_length``) instead of the
    previous floating-point ``log(n, 2)``, which could misround for
    large values of n.

    :param n: a positive integer, for example 100
    :return: another integer, 100 -> 128
    :raises ValueError: if n is not positive (the former ``log``-based
        implementation raised ValueError from the math domain as well)
    """
    n = int(n)
    if n < 1:
        raise ValueError("nextpower requires a positive integer")
    return 1 << (n - 1).bit_length()


def sizeof(shape, dtype="uint8"):
    """
    Calculate the number of bytes needed to allocate for a given structure

    :param shape: size or tuple of sizes
    :param dtype: data type (anything numpy.dtype accepts)
    :return: number of bytes (int)
    """
    itemsize = numpy.dtype(dtype).itemsize
    cnt = 1
    if "__len__" in dir(shape):
        for dim in shape:
            cnt *= dim
    else:
        cnt = int(shape)
    return cnt * itemsize


def get_cl_file(resource):
    """get the full path of a openCL resource file

    The resource name can be prefixed by the name of a resource directory. For
    example "silx:foo.png" identify the resource "foo.png" from the resource
    directory "silx".
    See also :func:`silx.resources.register_resource_directory`.

    :param str resource: Resource name. File name contained if the `opencl`
        directory of the resources.
    :return: the full path of the openCL source file
    :raises ImportError: if silx.resources could not be imported
    """
    if resources is None:
        raise ImportError("silx.resources is required to locate OpenCL files")
    if not resource.endswith(".cl"):
        resource += ".cl"
    return resources._resource_filename(resource,
                                        default_directory="opencl")


def read_cl_file(filename):
    """Read an OpenCL source file and apply a dummy preprocessor.

    :param filename: name of the OpenCL resource file (".cl" is appended
        if missing)
    :return: preprocessed source code, with ``#include`` lines stripped
    """
    with open(get_cl_file(filename), "r") as f:
        # Dummy preprocessor which removes the #include
        lines = [line for line in f if not line.startswith("#include ")]
    return "".join(lines)


# Historical alias kept for backward compatibility
get_opencl_code = read_cl_file


def concatenate_cl_kernel(filenames):
    """Concatenates all the kernel from the list of files

    :param filenames: filenames containing the kernels
    :type filenames: list of str which can be filename of kernel as a string.
    :return: a string with all kernels concatenated

    this method concatenates all the kernel from the list
    """
    return os.linesep.join(read_cl_file(fn) for fn in filenames)


class ConvolutionInfos(object):
    """Static description of the supported convolution use cases.

    ``allowed_axes`` maps a convolution mode to the axis combinations it
    accepts; ``use_cases`` maps (data_ndim, kernel_ndim) to the available
    modes with a human-readable name and the OpenCL kernels involved.
    """
    allowed_axes = {
        "1D": [None],
        "separable_2D_1D_2D": [None, (0, 1), (1, 0)],
        "batched_1D_2D": [(0,), (1,)],
        "separable_3D_1D_3D": [
            None,
            (0, 1, 2),
            (1, 2, 0),
            (2, 0, 1),
            (2, 1, 0),
            (1, 0, 2),
            (0, 2, 1)
        ],
        "batched_1D_3D": [(0,), (1,), (2,)],
        "batched_separable_2D_1D_3D": [(0,), (1,), (2,)],  # unsupported (?)
        "2D": [None],
        "batched_2D_3D": [(0,), (1,), (2,)],
        "separable_3D_2D_3D": [
            (1, 0),
            (0, 1),
            (2, 0),
            (0, 2),
            (1, 2),
            (2, 1),
        ],
        "3D": [None],
    }
    use_cases = {
        (1, 1): {
            "1D": {
                "name": "1D convolution on 1D data",
                "kernels": ["convol_1D_X"],
            },
        },
        (2, 2): {
            "2D": {
                "name": "2D convolution on 2D data",
                "kernels": ["convol_2D_XY"],
            },
        },
        (3, 3): {
            "3D": {
                "name": "3D convolution on 3D data",
                "kernels": ["convol_3D_XYZ"],
            },
        },
        (2, 1): {
            "separable_2D_1D_2D": {
                "name": "Separable (2D->1D) convolution on 2D data",
                "kernels": ["convol_1D_X", "convol_1D_Y"],
            },
            "batched_1D_2D": {
                "name": "Batched 1D convolution on 2D data",
                "kernels": ["convol_1D_X", "convol_1D_Y"],
            },
        },
        (3, 1): {
            "separable_3D_1D_3D": {
                "name": "Separable (3D->1D) convolution on 3D data",
                "kernels": ["convol_1D_X", "convol_1D_Y", "convol_1D_Z"],
            },
            "batched_1D_3D": {
                "name": "Batched 1D convolution on 3D data",
                "kernels": ["convol_1D_X", "convol_1D_Y", "convol_1D_Z"],
            },
            "batched_separable_2D_1D_3D": {
                "name": "Batched separable (2D->1D) convolution on 3D data",
                "kernels": ["convol_1D_X", "convol_1D_Y", "convol_1D_Z"],
            },
        },
        (3, 2): {
            "separable_3D_2D_3D": {
                "name": "Separable (3D->2D) convolution on 3D data",
                "kernels": ["convol_2D_XY", "convol_2D_XZ", "convol_2D_YZ"],
            },
            "batched_2D_3D": {
                "name": "Batched 2D convolution on 3D data",
                "kernels": ["convol_2D_XY", "convol_2D_XZ", "convol_2D_YZ"],
            },
        },
    }