9 files changed, 1452 insertions, 305 deletions
diff --git a/silx/opencl/backprojection.py b/silx/opencl/backprojection.py
index ca7244e..c257872 100644
--- a/silx/opencl/backprojection.py
+++ b/silx/opencl/backprojection.py
@@ -29,37 +29,30 @@ from __future__ import absolute_import, print_function, with_statement, division
 
 __authors__ = ["A. Mirone, P. Paleo"]
 __license__ = "MIT"
-__date__ = "19/01/2018"
+__date__ = "25/01/2019"
 
 import logging
-import numpy
+import numpy as np
 
 from .common import pyopencl
 from .processing import EventDescription, OpenclProcessing, BufferDescription
-from .utils import nextpower as nextpow2
+from .sinofilter import SinoFilter
+from .sinofilter import fourier_filter as fourier_filter_
+from ..utils.deprecation import deprecated
 
 if pyopencl:
     mf = pyopencl.mem_flags
     import pyopencl.array as parray
 else:
-    raise ImportError("pyopencl is not installed")
+    raise ImportError("Please install pyopencl in order to use opencl backprojection")
 logger = logging.getLogger(__name__)
 
-# put in .common ?
-try:
-    from pyfft.cl import Plan as pyfft_Plan
-    _has_pyfft = True
-except ImportError:
-    _has_pyfft = False
-# For silx v0.6 we disable the use FFT on GPU.
-_has_pyfft = False
-
 
 def _sizeof(Type):
     """
     return the size (in bytes) of a scalar type, like the C behavior
     """
-    return numpy.dtype(Type).itemsize
+    return np.dtype(Type).itemsize
 
 
 def _idivup(a, b):
@@ -69,53 +62,14 @@ def _idivup(a, b):
     return (a + (b - 1)) // b
 
 
-def fourier_filter(sino, filter_=None, fft_size=None):
-    """Simple numpy based implementation of fourier space filter
-    
-    :param sino: of shape shape = (num_projs, num_bins)
-    :param filter: filter function to apply in fourier space
-    :fft_size: size on which perform the fft. May be larger than the sino array 
-    :return: filtered sinogram
-    """
-    assert sino.ndim == 2
-    num_projs, num_bins = sino.shape
-    if fft_size is None:
-        fft_size = nextpow2(num_bins * 2 - 1)
-    else:
-        assert fft_size >= num_bins
-    if fft_size == num_bins:
-        sino_zeropadded = sino.astype(numpy.float32)
-    else:
-        sino_zeropadded = numpy.zeros((num_projs, fft_size),
-                                      dtype=numpy.complex64)
-        sino_zeropadded[:, :num_bins] = sino.astype(numpy.float32)
-
-    if filter_ is None:
-        h = numpy.zeros(fft_size, dtype=numpy.float32)
-        L2 = fft_size // 2 + 1
-        h[0] = 1 / 4.
-        j = numpy.linspace(1, L2, L2 // 2, False)
-        h[1:L2:2] = -1. / (numpy.pi ** 2 * j ** 2)
-        h[L2:] = numpy.copy(h[1:L2 - 1][::-1])
-        filter_ = numpy.fft.fft(h).astype(numpy.complex64)
-
-    # Linear convolution
-    sino_f = numpy.fft.fft(sino, fft_size)
-    sino_f = sino_f * filter_
-    sino_filtered = numpy.fft.ifft(sino_f)[:, :num_bins].real
-    # Send the filtered sinogram to device
-    return numpy.ascontiguousarray(sino_filtered.real, dtype=numpy.float32)
-
-
 class Backprojection(OpenclProcessing):
     """A class for performing the backprojection using OpenCL"""
     kernel_files = ["backproj.cl", "array_utils.cl"]
-    if _has_pyfft:
-        kernel_files.append("backproj_helper.cl")
 
     def __init__(self, sino_shape, slice_shape=None, axis_position=None,
                  angles=None, filter_name=None, ctx=None, devicetype="all",
-                 platformid=None, deviceid=None, profile=False):
+                 platformid=None, deviceid=None, profile=False,
+                 extra_options=None):
         """Constructor of the OpenCL (filtered) backprojection
 
         :param sino_shape: shape of the sinogram. The sinogram is in the format
@@ -139,6 +93,8 @@ class Backprojection(OpenclProcessing):
         :param profile: switch on profiling to be able to profile at the kernel
                         level, store profiling elements (makes code slightly
                         slower)
+        :param extra_options: Advanced extra options in the form of a dict.
+            Current options are: cutoff, use_numpy_fft
         """
         # OS X enforces a workgroup size of 1 when the kernel has
         # synchronization barriers if sys.platform.startswith('darwin'):
@@ -148,10 +104,32 @@ class Backprojection(OpenclProcessing):
         OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
                                   platformid=platformid, deviceid=deviceid,
                                   profile=profile)
-        self.shape = sino_shape
 
-        self.num_bins = numpy.int32(sino_shape[1])
-        self.num_projs = numpy.int32(sino_shape[0])
+        self._init_geometry(sino_shape, slice_shape, angles, axis_position,
+                           extra_options)
+        self._allocate_memory()
+        self._compute_angles()
+        self._init_kernels()
+        self._init_filter(filter_name)
+
+    def _init_geometry(self, sino_shape, slice_shape, angles, axis_position,
+                      extra_options):
+        """Geometry Initialization
+
+        :param sino_shape: shape of the sinogram. The sinogram is in the format
+                           (n_b, n_a) where n_b is the number of detector bins
+                           and n_a is the number of angles.
+        :param slice_shape: shape of the reconstructed slice. By
+                            default, it is a square slice where the dimension
+                            is the "x dimension" of the sinogram (number of
+                            bins).
+        :param angles: list of projection angles in radian.
+        :param axis_position: axis position
+        :param dict extra_options: Advanced extra options
+        """
+        self.shape = sino_shape
+        self.num_bins = np.int32(sino_shape[1])
+        self.num_projs = np.int32(sino_shape[0])
         self.angles = angles
         if slice_shape is None:
             self.slice_shape = (self.num_bins, self.num_bins)
@@ -161,152 +139,161 @@ class Backprojection(OpenclProcessing):
             _idivup(self.slice_shape[0], 32) * 32,
             _idivup(self.slice_shape[1], 32) * 32
         )
-        self.slice = numpy.zeros(self.dimrec_shape, dtype=numpy.float32)
-        self.filter_name = filter_name if filter_name else "Ram-Lak"
         if axis_position:
-            self.axis_pos = numpy.float32(axis_position)
+            self.axis_pos = np.float32(axis_position)
         else:
-            self.axis_pos = numpy.float32((sino_shape[1] - 1.) / 2)
+            self.axis_pos = np.float32((sino_shape[1] - 1.) / 2)
         self.axis_array = None  # TODO: add axis correction front-end
+        self._init_extra_options(extra_options)
 
+    def _init_extra_options(self, extra_options):
+        """Backprojection extra option initialization
+
+        :param dict extra_options: Advanced extra options
+        """
+        self.extra_options = {
+            "cutoff": 1.,
+            "use_numpy_fft": False,
+        }
+        if extra_options is not None:
+            self.extra_options.update(extra_options)
+
+    def _allocate_memory(self):
+        # Host memory
+        self.slice = np.zeros(self.dimrec_shape, dtype=np.float32)
         self.is_cpu = False
         if self.device.type == "CPU":
             self.is_cpu = True
 
-        self.compute_fft_plans()
+        # Device memory
         self.buffers = [
-                       BufferDescription("_d_slice", numpy.prod(self.dimrec_shape), numpy.float32, mf.READ_WRITE),
-                       BufferDescription("d_sino", self.num_projs * self.num_bins, numpy.float32, mf.READ_WRITE),  # before transferring to texture (if available)
-                       BufferDescription("d_cos", self.num_projs, numpy.float32, mf.READ_ONLY),
-                       BufferDescription("d_sin", self.num_projs, numpy.float32, mf.READ_ONLY),
-                       BufferDescription("d_axes", self.num_projs, numpy.float32, mf.READ_ONLY),
-                      ]
-        self.allocate_buffers()
-        if not(self.is_cpu):
-            self.allocate_textures()
-        self.compute_filter()
-        if self.pyfft_plan:
-            self.add_to_cl_mem({
-                "d_filter": self.d_filter,
-                "d_sino_z": self.d_sino_z
-            })
+            BufferDescription("_d_slice", self.dimrec_shape, np.float32, mf.READ_WRITE),
+            BufferDescription("d_sino", self.shape, np.float32, mf.READ_WRITE),  # before transferring to texture (if available)
+            BufferDescription("d_cos", (self.num_projs,), np.float32, mf.READ_ONLY),
+            BufferDescription("d_sin", (self.num_projs,), np.float32, mf.READ_ONLY),
+            BufferDescription("d_axes", (self.num_projs,), np.float32, mf.READ_ONLY),
+        ]
+        self.allocate_buffers(use_array=True)
         self.d_sino = self.cl_mem["d_sino"]  # shorthand
-        self.compute_angles()
 
-        self.local_mem = 256 * 3 * _sizeof(numpy.float32)  # constant for all image sizes
+        # Texture memory (if relevant)
+        if not(self.is_cpu):
+            self._allocate_textures()
+
+        # Local memory
+        self.local_mem = 256 * 3 * _sizeof(np.float32)  # constant for all image sizes
+
+    def _compute_angles(self):
+        if self.angles is None:
+            self.angles = np.linspace(0, np.pi, self.num_projs, False)
+        h_cos = np.cos(self.angles).astype(np.float32)
+        h_sin = np.sin(self.angles).astype(np.float32)
+        self.cl_mem["d_cos"][:] = h_cos[:]
+        self.cl_mem["d_sin"][:] = h_sin[:]
+        if self.axis_array:
+            self.cl_mem["d_axes"][:] = self.axis_array.astype(np.float32)[:]
+        else:
+            self.cl_mem["d_axes"][:] = np.ones(self.num_projs, dtype="f") * self.axis_pos
+
+    def _init_kernels(self):
         OpenclProcessing.compile_kernels(self, self.kernel_files)
         # check that workgroup can actually be (16, 16)
         self.compiletime_workgroup_size = self.kernels.max_workgroup_size("backproj_cpu_kernel")
         # Workgroup and ndrange sizes are always the same
         self.wg = (16, 16)
         self.ndrange = (
-            _idivup(int(self.dimrec_shape[1]), 32) * self.wg[0],  # int(): pyopencl <= 2015.1
-            _idivup(int(self.dimrec_shape[0]), 32) * self.wg[1]  # int(): pyopencl <= 2015.1
+            _idivup(int(self.dimrec_shape[1]), 32) * self.wg[0],
+            _idivup(int(self.dimrec_shape[0]), 32) * self.wg[1]
         )
-
-    def compute_angles(self):
-        if self.angles is None:
-            self.angles = numpy.linspace(0, numpy.pi, self.num_projs, False)
-        h_cos = numpy.cos(self.angles).astype(numpy.float32)
-        h_sin = numpy.sin(self.angles).astype(numpy.float32)
-        pyopencl.enqueue_copy(self.queue, self.cl_mem["d_cos"], h_cos)
-        pyopencl.enqueue_copy(self.queue, self.cl_mem["d_sin"], h_sin)
-        if self.axis_array:
-            pyopencl.enqueue_copy(self.queue,
-                                  self.cl_mem["d_axes"],
-                                  self.axis_array.astype(numpy.float32))
+        # Prepare arguments for the kernel call
+        if self.is_cpu:
+            d_sino_ref = self.d_sino.data
         else:
-            pyopencl.enqueue_copy(self.queue,
-                                  self.cl_mem["d_axes"],
-                                  numpy.ones(self.num_projs, dtype=numpy.float32) * self.axis_pos)
+            d_sino_ref = self.d_sino_tex
+        self._backproj_kernel_args = (
+            # num of projections (int32)
+            self.num_projs,
+            # num of bins (int32)
+            self.num_bins,
+            # axis position (float32)
+            self.axis_pos,
+            # d_slice (__global float32*)
+            self.cl_mem["_d_slice"].data,
+            # d_sino (__read_only image2d_t or float*)
+            d_sino_ref,
+            # gpu_offset_x (float32) # TODO custom ?
+            -np.float32((self.num_bins - 1) / 2. - self.axis_pos),
+            # gpu_offset_y (float32) # TODO custom ?
+            -np.float32((self.num_bins - 1) / 2. - self.axis_pos),
+            # d_cos (__global float32*)
+            self.cl_mem["d_cos"].data,
+            # d_sin (__global float32*)
+            self.cl_mem["d_sin"].data,
+            # d_axis  (__global float32*)
+            self.cl_mem["d_axes"].data,
+            # shared mem (__local float32*)
+            self._get_local_mem()
+        )
 
-    def allocate_textures(self):
+    def _allocate_textures(self):
         """
         Allocate the texture for the sinogram.
         """
         self.d_sino_tex = pyopencl.Image(
-                                        self.ctx,
-                                        mf.READ_ONLY | mf.USE_HOST_PTR,
-                                        pyopencl.ImageFormat(
-                                                             pyopencl.channel_order.INTENSITY,
-                                                             pyopencl.channel_type.FLOAT
-                                                            ),
-                                        hostbuf=numpy.zeros(self.shape[::-1], dtype=numpy.float32)
-                                        )
-
-    def compute_fft_plans(self):
-        """
-        If pyfft is installed, prepare a batched 1D FFT plan for the filtering
-        of FBP
+            self.ctx,
+            mf.READ_ONLY | mf.USE_HOST_PTR,
+            pyopencl.ImageFormat(
+                pyopencl.channel_order.INTENSITY,
+                pyopencl.channel_type.FLOAT
+            ),
+            hostbuf=np.zeros(self.shape[::-1], dtype=np.float32)
+        )
 
-        """
-        self.fft_size = nextpow2(self.num_bins * 2 - 1)
-        if _has_pyfft:
-            logger.debug("pyfft is available. Computing FFT plans...")
-            # batched 1D transform
-            self.pyfft_plan = pyfft_Plan(self.fft_size, queue=self.queue,
-                                         wait_for_finish=True)
-            self.d_sino_z = parray.zeros(self.queue,
-                                         (self.num_projs, self.fft_size),
-                                         dtype=numpy.complex64)
-            logger.debug("... done")
-        else:
-            logger.debug("pyfft not available, using numpy.fft")
-            self.pyfft_plan = None
-            # TODO: fall-back to fftw if present ?
+    def _init_filter(self, filter_name):
+        """Filter initialization
 
-    def compute_filter(self):
-        """
-        Compute the filter for FBP
+        :param str filter_name: filter name
         """
-        if self.filter_name == "Ram-Lak":
-            L = self.fft_size
-            h = numpy.zeros(L, dtype=numpy.float32)
-            L2 = L // 2 + 1
-            h[0] = 1 / 4.
-            j = numpy.linspace(1, L2, L2 // 2, False)
-            h[1:L2:2] = -1. / (numpy.pi ** 2 * j ** 2)
-            h[L2:] = numpy.copy(h[1:L2 - 1][::-1])
-        else:
-            # TODO: other filters
-            raise ValueError("Filter %s is not available" % self.filter_name)
-        self.filter = h
-        if self.pyfft_plan:
-            self.d_filter = parray.to_device(self.queue, h.astype(numpy.complex64))
-            self.pyfft_plan.execute(self.d_filter.data)
-        else:
-            self.filter = numpy.fft.fft(h).astype(numpy.complex64)
-            self.d_filter = None
+        self.filter_name = filter_name or "ram-lak"
+        self.sino_filter = SinoFilter(
+            self.shape,
+            ctx=self.ctx,
+            filter_name=self.filter_name,
+            extra_options=self.extra_options,
+        )
 
     def _get_local_mem(self):
         return pyopencl.LocalMemory(self.local_mem)  # constant for all image sizes
 
-    def cpy2d_to_slice(self, dst):
-        ndrange = (int(self.slice_shape[1]), int(self.slice_shape[0]))  # pyopencl < 2015.2
-        slice_shape_ocl = numpy.int32(ndrange)
+    def _cpy2d_to_slice(self, dst):
+        ndrange = (int(self.slice_shape[1]), int(self.slice_shape[0]))
+        slice_shape_ocl = np.int32(ndrange)
         wg = None
         kernel_args = (
             dst.data,
-            self.cl_mem["_d_slice"],
-            numpy.int32(self.slice_shape[1]),
-            numpy.int32(self.dimrec_shape[1]),
-            numpy.int32((0, 0)),
-            numpy.int32((0, 0)),
+            self.cl_mem["_d_slice"].data,
+            np.int32(self.slice_shape[1]),
+            np.int32(self.dimrec_shape[1]),
+            np.int32((0, 0)),
+            np.int32((0, 0)),
             slice_shape_ocl
         )
         return self.kernels.cpy2d(self.queue, ndrange, wg, *kernel_args)
 
-    def transfer_to_texture(self, sino):
+    def _transfer_to_texture(self, sino):
+        if isinstance(sino, parray.Array):
+            return self._transfer_device_to_texture(sino)
         sino2 = sino
-        if not(sino.flags["C_CONTIGUOUS"] and sino.dtype == numpy.float32):
-            sino2 = numpy.ascontiguousarray(sino, dtype=numpy.float32)
+        if not(sino.flags["C_CONTIGUOUS"] and sino.dtype == np.float32):
+            sino2 = np.ascontiguousarray(sino, dtype=np.float32)
         if self.is_cpu:
             ev = pyopencl.enqueue_copy(
                                         self.queue,
-                                        self.d_sino,
+                                        self.d_sino.data,
                                         sino2
                                         )
             what = "transfer filtered sino H->D buffer"
+            ev.wait()
         else:
             ev = pyopencl.enqueue_copy(
                                        self.queue,
@@ -318,21 +305,22 @@ class Backprojection(OpenclProcessing):
             what = "transfer filtered sino H->D texture"
         return EventDescription(what, ev)
 
-    def transfer_device_to_texture(self, d_sino):
+    def _transfer_device_to_texture(self, d_sino):
         if self.is_cpu:
             if id(self.d_sino) == id(d_sino):
                 return
             ev = pyopencl.enqueue_copy(
                                        self.queue,
-                                       self.d_sino,
+                                       self.d_sino.data,
                                        d_sino
                                        )
             what = "transfer filtered sino D->D buffer"
+            ev.wait()
         else:
             ev = pyopencl.enqueue_copy(
                                        self.queue,
                                        self.d_sino_tex,
-                                       d_sino,
+                                       d_sino.data,
                                        offset=0,
                                        origin=(0, 0),
                                        region=self.shape[::-1]
@@ -340,65 +328,35 @@ class Backprojection(OpenclProcessing):
             what = "transfer filtered sino D->D texture"
         return EventDescription(what, ev)
 
-    def backprojection(self, sino=None, dst=None):
+    def backprojection(self, sino, output=None):
         """Perform the backprojection on an input sinogram
 
-        :param sino: sinogram. If provided, it returns the plain backprojection.
-        :param dst: destination (pyopencl.Array). If provided, the result will be written in this array.
+        :param sino: sinogram.
+        :param output: optional, output slice.
+            If provided, the result will be written in this array.
         :return: backprojection of sinogram
         """
         events = []
         with self.sem:
-
-            if sino is not None:  # assuming numpy.ndarray
-                events.append(self.transfer_to_texture(sino))
-            # Prepare arguments for the kernel call
-            if self.is_cpu:
-                d_sino_ref = self.d_sino
-            else:
-                d_sino_ref = self.d_sino_tex
-            kernel_args = (
-                self.num_projs,  # num of projections (int32)
-                self.num_bins,  # num of bins (int32)
-                self.axis_pos,  # axis position (float32)
-                self.cl_mem["_d_slice"],  # d_slice (__global float32*)
-                d_sino_ref,  # d_sino (__read_only image2d_t or float*)
-                numpy.float32(0),  # gpu_offset_x (float32)
-                numpy.float32(0),  # gpu_offset_y (float32)
-                self.cl_mem["d_cos"],  # d_cos (__global float32*)
-                self.cl_mem["d_sin"],  # d_sin (__global float32*)
-                self.cl_mem["d_axes"],  # d_axis  (__global float32*)
-                self._get_local_mem()  # shared mem (__local float32*)
-            )
-            # Call the kernel
+            events.append(self._transfer_to_texture(sino))
+            # Call the backprojection kernel
             if self.is_cpu:
                 kernel_to_call = self.kernels.backproj_cpu_kernel
             else:
                 kernel_to_call = self.kernels.backproj_kernel
-            event_bpj = kernel_to_call(
+            kernel_to_call(
                 self.queue,
                 self.ndrange,
                 self.wg,
-                *kernel_args
+                *self._backproj_kernel_args
             )
-            if dst is None:
-                self.slice[:] = 0
-                events.append(EventDescription("backprojection", event_bpj))
-                ev = pyopencl.enqueue_copy(self.queue, self.slice,
-                                           self.cl_mem["_d_slice"])
-                events.append(EventDescription("copy D->H result", ev))
-                ev.wait()
-                res = numpy.copy(self.slice)
-                if self.dimrec_shape[0] > self.slice_shape[0] or self.dimrec_shape[1] > self.slice_shape[1]:
-                    res = res[:self.slice_shape[0], :self.slice_shape[1]]
-                # if the slice is backprojected onto a bigger grid
-                if self.slice_shape[1] > self.num_bins:
-                    res = res[:self.slice_shape[0], :self.slice_shape[1]]
+            # Return
+            if output is None:
+                res = self.cl_mem["_d_slice"].get()
+                res = res[:self.slice_shape[0], :self.slice_shape[1]]
             else:
-                ev = self.cpy2d_to_slice(dst)
-                events.append(EventDescription("copy D->D result", ev))
-                ev.wait()
-                res = dst
+                res = output
+                self._cpy2d_to_slice(output)
 
         # /with self.sem
         if self.profile:
@@ -406,83 +364,34 @@ class Backprojection(OpenclProcessing):
 
         return res
 
-    def filter_projections(self, sino, rescale=True):
-        """
-        Performs the FBP on a given sinogram.
-
-        :param sinogram: sinogram to (filter-)backproject
-        :param rescale: if True (default), the sinogram is multiplied with
-                        (pi/n_projs)
-        """
-        if sino.shape[0] != self.num_projs or sino.shape[1] != self.num_bins:
-            raise ValueError("Expected sinogram with (projs, bins) = (%d, %d)" % (self.num_projs, self.num_bins))
-        if rescale:
-            sino = sino * numpy.pi / self.num_projs
-        events = []
-        # if pyfft is available, all can be done on the device
-        if self.d_filter is not None:
-
-            # Zero-pad the sinogram.
-            # TODO: this can be done on GPU with a "Memcpy2D":
-            #  cl.enqueue_copy(queue, dst, src, host_origin=(0,0), buffer_origin=(0,0), region=shape, host_pitches=(sino.shape[1],), buffer_pitches=(self.fft_size,))
-            # However it does not work properly, and raises an error for pyopencl < 2017.1
-            sino_zeropadded = numpy.zeros((sino.shape[0], self.fft_size), dtype=numpy.complex64)
-            sino_zeropadded[:, :self.num_bins] = sino.astype(numpy.float32)
-            sino_zeropadded = numpy.ascontiguousarray(sino_zeropadded, dtype=numpy.complex64)
-            with self.sem:
-                # send to GPU
-                ev = pyopencl.enqueue_copy(self.queue, self.d_sino_z.data, sino_zeropadded)
-                events.append(EventDescription("Send sino H->D", ev))
-
-                # FFT (in-place)
-                self.pyfft_plan.execute(self.d_sino_z.data, batch=self.num_projs)
-
-                # Multiply (complex-wise) with the the filter
-                ev = self.kernels.mult(self.queue,
-                                       tuple(int(i) for i in self.d_sino_z.shape[::-1]),
-                                       None,
-                                       self.d_sino_z.data,
-                                       self.d_filter.data,
-                                       numpy.int32(self.fft_size),
-                                       self.num_projs
-                                       )
-                events.append(EventDescription("complex 2D-1D multiplication", ev))
-                # Inverse FFT (in-place)
-                self.pyfft_plan.execute(self.d_sino_z.data, batch=self.num_projs, inverse=True)
-                # Copy the real part of d_sino_z[:, :self.num_bins] (complex64) to d_sino (float32)
-                ev = self.kernels.cpy2d_c2r(self.queue, self.shape[::-1], None,
-                                            self.d_sino,
-                                            self.d_sino_z.data,
-                                            self.num_bins,
-                                            self.num_projs,
-                                            numpy.int32(self.fft_size)
-                                            )
-                events.append(EventDescription("conversion from complex padded sinogram to sinogram", ev))
-                # debug
-#                 ev.wait()
-#                 h_sino = numpy.zeros(sino.shape, dtype=numpy.float32)
-#                 ev = pyopencl.enqueue_copy(self.queue, h_sino, self.d_sino)
-#                 ev.wait()
-#                 numpy.save("/tmp/filtered_sinogram_%s.npy" % self.ctx.devices[0].platform.name.split()[0], h_sino)
-                events.append(self.transfer_device_to_texture(self.d_sino))
-            # ------
-        else:  # no pyfft
-            sino_filtered = fourier_filter(sino, filter_=self.filter, fft_size=self.fft_size)
-            with self.sem:
-                events.append(self.transfer_to_texture(sino_filtered))
-        if self.profile:
-            self.events += events
-
-    def filtered_backprojection(self, sino):
+    def filtered_backprojection(self, sino, output=None):
         """
         Compute the filtered backprojection (FBP) on a sinogram.
 
-        :param sino: sinogram (`numpy.ndarray`) in the format (projections,
-                     bins)
+        :param sino: sinogram (`np.ndarray` or `pyopencl.array.Array`)
+            with the shape (n_projections, n_bins)
+        :param output: output (`np.ndarray` or `pyopencl.array.Array`).
+            If nothing is provided, a new numpy array is returned.
         """
-
-        self.filter_projections(sino)
-        res = self.backprojection()
+        # Filter
+        self.sino_filter(sino, output=self.d_sino)
+        # Backproject
+        res = self.backprojection(self.d_sino, output=output)
         return res
 
     __call__ = filtered_backprojection
+
+
+    # -------------------
+    # - Compatibility  -
+    # -------------------
+
+    @deprecated(replacement="Backprojection.sino_filter", since_version="0.10")
+    def filter_projections(self, sino, rescale=True):
+        self.sino_filter(sino, output=self.d_sino)
+
+
+
+def fourier_filter(sino, filter_=None, fft_size=None):
+    return fourier_filter_(sino, filter_=filter_, fft_size=fft_size)
+
diff --git a/silx/opencl/codec/test/test_byte_offset.py b/silx/opencl/codec/test/test_byte_offset.py
index 2bfa1d3..9ce1cfc 100644
--- a/silx/opencl/codec/test/test_byte_offset.py
+++ b/silx/opencl/codec/test/test_byte_offset.py
@@ -45,16 +45,13 @@ import logging
 import numpy
 from silx.opencl.common import ocl, pyopencl
 from silx.opencl.codec import byte_offset
-try:
-    import fabio
-except ImportError:
-    fabio = None
+import fabio
 import unittest
 logger = logging.getLogger(__name__)
 
 
-@unittest.skipUnless(ocl and fabio and pyopencl,
-                     "PyOpenCl or fabio is missing")
+@unittest.skipUnless(ocl and pyopencl,
+                     "PyOpenCl is missing")
 class TestByteOffset(unittest.TestCase):
 
     @staticmethod
diff --git a/silx/opencl/processing.py b/silx/opencl/processing.py
index 250582d..045a9d3 100644
--- a/silx/opencl/processing.py
+++ b/silx/opencl/processing.py
@@ -41,7 +41,7 @@ __author__ = "Jerome Kieffer"
 __contact__ = "Jerome.Kieffer@ESRF.eu"
 __license__ = "MIT"
 __copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
-__date__ = "27/02/2018"
+__date__ = "11/01/2019"
 __status__ = "stable"
 
 
@@ -53,6 +53,7 @@ import numpy
 import threading
 from .common import ocl, pyopencl, release_cl_buffers, kernel_workgroup_size
 from .utils import concatenate_cl_kernel
+import platform
 
 
 BufferDescription = namedtuple("BufferDescription", ["name", "size", "dtype", "flags"])
@@ -124,6 +125,7 @@ class OpenclProcessing(object):
                          level, store profiling elements (makes code slightly slower)
         """
         self.sem = threading.Semaphore()
+        self._X87_VOLATILE = None
         self.profile = None
         self.events = []  # List with of EventDescription, kept for profiling
         self.cl_mem = {}  # dict with all buffer allocated
@@ -176,7 +178,6 @@ class OpenclProcessing(object):
         have a built-in way to check the actual free memory on a
         device, only the total memory.
         """
-
         if buffers is None:
             buffers = self.buffers
 
@@ -255,7 +256,7 @@ class OpenclProcessing(object):
         kernel_files = kernel_files or self.kernel_files
         kernel_src = concatenate_cl_kernel(kernel_files)
 
-        compile_options = compile_options or ""
+        compile_options = compile_options or self.get_compiler_options()
         logger.info("Compiling file %s with options %s", kernel_files, compile_options)
         try:
             self.program = pyopencl.Program(self.ctx, kernel_src).build(options=compile_options)
@@ -313,6 +314,29 @@ class OpenclProcessing(object):
         with self.sem:
             self.events = []
 
+    @property
+    def x87_volatile_option(self):
+        # this is running 32 bits OpenCL woth POCL
+        if self._X87_VOLATILE is None:
+            if (platform.machine() in ("i386", "i686", "x86_64", "AMD64") and
+                    (tuple.__itemsize__ == 4) and
+                    self.ctx.devices[0].platform.name == 'Portable Computing Language'):
+                self._X87_VOLATILE = "-DX87_VOLATILE=volatile"
+            else:
+                self._X87_VOLATILE = ""
+        return self._X87_VOLATILE
+
+    def get_compiler_options(self, x87_volatile=False):
+        """Provide the default OpenCL compiler options
+        
+        :param x87_volatile: needed for Kahan summation
+        :return: string with compiler option
+        """
+        option_list = []
+        if x87_volatile:
+            option_list.append(self.x87_volatile_option)
+        return " ".join(i for i in option_list if i)
+
 # This should be implemented by concrete class
 #     def __copy__(self):
 #         """Shallow copy of the object
diff --git a/silx/opencl/sinofilter.py b/silx/opencl/sinofilter.py
new file mode 100644
index 0000000..3e4d69c
--- /dev/null
+++ b/silx/opencl/sinofilter.py
@@ -0,0 +1,559 @@
+#!/usr/bin/env python
+# coding: utf-8
+# /*##########################################################################
+#
+# Copyright (c) 2016 European Synchrotron Radiation Facility
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+# ###########################################################################*/
+"""Module for sinogram filtering on CPU/GPU."""
+
+from __future__ import absolute_import, print_function, with_statement, division
+
+__authors__ = ["P. Paleo"]
+__license__ = "MIT"
+__date__ = "01/02/2019"
+
+import numpy as np
+from math import pi
+from itertools import product
+from bisect import bisect
+
+from .common import pyopencl as cl
+import pyopencl.array as parray
+from .processing import OpenclProcessing
+from ..math.fft import FFT
+from ..math.fft.clfft import __have_clfft__
+from ..utils.deprecation import deprecated
+
+
+def nextpow2(N):
+    p = 1
+    while p < N:
+        p *= 2
+    return p
+
+def compute_ramlak_filter(dwidth_padded, dtype=np.float32):
+    """
+    Compute the Ramachandran-Lakshminarayanan (Ram-Lak) filter, used in
+    filtered backprojection.
+
+    :param dwidth_padded: width of the 2D sinogram after padding
+    :param dtype: data type
+    """
+    #~ dwidth_padded = dwidth * 2
+    L = dwidth_padded
+    h = np.zeros(L, dtype=dtype)
+    L2 = L//2+1
+    h[0] = 1/4.
+    j = np.linspace(1, L2, L2//2, False).astype(dtype) # np < 1.9.0
+    # h[2::2] = 0
+    h[1:L2:2] = -1./(pi**2 * j**2)
+    # h[-1:L2-1:-2] = -1./(pi**2 * j**2)
+    h[L2:] = np.copy(h[1:L2-1][::-1])
+    return h
+
+
+def tukey(N, alpha=0.5):
+    """
+    Compute the Tukey apodization window.
+
+    :param int N: Number of points.
+    :param float alpha:
+    """
+    apod = np.zeros(N)
+    x = np.arange(N)/(N-1)
+    r = alpha
+    M1 = (0 <= x) * (x < r/2)
+    M2 = (r/2 <= x) * (x <= 1 - r/2)
+    M3 = (1 - r/2 < x) * (x <= 1)
+    apod[M1] = (1 + np.cos(2*pi/r * (x[M1] - r/2)))/2.
+    apod[M2] = 1.
+    apod[M3] = (1 + np.cos(2*pi/r * (x[M3] - 1 + r/2)))/2.
+    return apod
+
+
+def lanczos(N):
+    """
+    Compute the Lanczos window (truncated sinc) of width N.
+
+    :param int N: window width
+    """
+    x = np.arange(N)/(N-1)
+    return np.sin(pi*(2*x-1))/(pi*(2*x-1))
+
+
+def compute_fourier_filter(dwidth_padded, filter_name, cutoff=1.):
+    """
+    Compute the filter used for FBP.
+
+    :param dwidth_padded: padded detector width. As the filtering is done by the
+        Fourier convolution theorem, dwidth_padded should be at least 2*dwidth.
+    :param filter_name: Name of the filter. Available filters are:
+        Ram-Lak, Shepp-Logan, Cosine, Hamming, Hann, Tukey, Lanczos.
+    :param cutoff: Cut-off frequency, if relevant.
+    """
+    Nf = dwidth_padded
+    #~ filt_f = np.abs(np.fft.fftfreq(Nf))
+    rl = compute_ramlak_filter(Nf, dtype=np.float64)
+    filt_f = np.fft.fft(rl)
+
+    filter_name = filter_name.lower()
+    if filter_name in ["ram-lak", "ramlak"]:
+        return filt_f
+
+    w = 2 * pi * np.fft.fftfreq(dwidth_padded)
+    d = cutoff
+    apodization = {
+        # ~OK
+        "shepp-logan": np.sin(w[1:Nf]/(2*d))/(w[1:Nf]/(2*d)),
+        # ~OK
+        "cosine": np.cos(w[1:Nf]/(2*d)),
+        # OK
+        "hamming": 0.54*np.ones_like(filt_f)[1:Nf] + .46 * np.cos(w[1:Nf]/d),
+        # OK
+        "hann": (np.ones_like(filt_f)[1:Nf] + np.cos(w[1:Nf]/d))/2.,
+        # These one is not compatible with Astra - TODO investigate why
+        "tukey": np.fft.fftshift(tukey(dwidth_padded, alpha=d/2.))[1:Nf],
+        "lanczos": np.fft.fftshift(lanczos(dwidth_padded))[1:Nf],
+    }
+    if filter_name not in apodization:
+        raise ValueError("Unknown filter %s. Available filters are %s" %
+                         (filter_name, str(apodization.keys())))
+    filt_f[1:Nf] *= apodization[filter_name]
+    return filt_f
+
+
+def generate_powers():
+    """
+    Generate a list of powers of [2, 3, 5, 7],
+    up to (2**15)*(3**9)*(5**6)*(7**5).
+    """
+    primes = [2, 3, 5, 7]
+    maxpow = {2: 15, 3: 9, 5: 6, 7: 5}
+    valuations = []
+    for prime in primes:
+        valuations.append(range(maxpow[prime]+1))
+    powers = product(*valuations)
+    res = []
+    for pw in powers:
+        res.append(np.prod(list(map(lambda x : x[0]**x[1], zip(primes, pw)))))
+    return np.unique(res)
+
+
+
+class SinoFilter(OpenclProcessing):
+    """
+    A class for performing sinogram filtering on GPU using OpenCL.
+    This is a convolution in the Fourier space, along one dimension:
+      - In 2D: (n_a, d_x): n_a filterings (1D FFT of size d_x)
+      - In 3D: (n_z, n_a, d_x): n_z*n_a filterings (1D FFT of size d_x)
+    """
+    kernel_files = ["array_utils.cl"]
+    powers = generate_powers()
+
+    def __init__(self, sino_shape, filter_name=None, ctx=None,
+                 devicetype="all", platformid=None, deviceid=None,
+                 profile=False, extra_options=None):
+        """Constructor of OpenCL FFT-Convolve.
+
+        :param sino_shape: shape of the sinogram.
+        :param filter_name: Name of the filter. Defaut is "ram-lak".
+        :param ctx: actual working context, left to None for automatic
+                    initialization from device type or platformid/deviceid
+        :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL"
+        :param platformid: integer with the platform_identifier, as given by
+                           clinfo
+        :param deviceid: Integer with the device identifier, as given by clinfo
+        :param profile: switch on profiling to be able to profile at the kernel
+                        level, store profiling elements (makes code slightly
+                        slower)
+        :param dict extra_options: Advanced extra options.
+            Current options are: cutoff, use_numpy_fft
+        """
+        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
+                                  platformid=platformid, deviceid=deviceid,
+                                  profile=profile)
+
+        self._init_extra_options(extra_options)
+        self._calculate_shapes(sino_shape)
+        self._init_fft()
+        self._allocate_memory()
+        self._compute_filter(filter_name)
+        self._init_kernels()
+
+    def _calculate_shapes(self, sino_shape):
+        """
+
+        :param sino_shape: shape of the sinogram.
+        """
+        self.ndim = len(sino_shape)
+        if self.ndim == 2:
+            n_angles, dwidth = sino_shape
+        else:
+            raise ValueError("Invalid sinogram number of dimensions: "
+                             "expected 2 dimensions")
+        self.sino_shape = sino_shape
+        self.n_angles = n_angles
+        self.dwidth = dwidth
+        self.dwidth_padded = self.get_next_power(2*self.dwidth)
+        self.sino_padded_shape = (n_angles, self.dwidth_padded)
+        sino_f_shape = list(self.sino_padded_shape)
+        sino_f_shape[-1] = sino_f_shape[-1]//2+1
+        self.sino_f_shape = tuple(sino_f_shape)
+
+    def _init_extra_options(self, extra_options):
+        """
+
+        :param dict extra_options: Advanced extra options.
+            Current options are: cutoff,
+        """
+        self.extra_options = {
+            "cutoff": 1.,
+            "use_numpy_fft": False,
+        }
+        if extra_options is not None:
+            self.extra_options.update(extra_options)
+
+    def _init_fft(self):
+        if __have_clfft__ and not(self.extra_options["use_numpy_fft"]):
+            self.fft_backend = "opencl"
+            self.fft = FFT(
+                self.sino_padded_shape,
+                dtype=np.float32,
+                axes=(-1,),
+                backend="opencl",
+                ctx=self.ctx,
+            )
+        else:
+            self.fft_backend = "numpy"
+            print("The gpyfft module was not found. The Fourier transforms "
+                  "will be done on CPU. For more performances, it is advised "
+                  "to install gpyfft.""")
+            self.fft = FFT(
+                template=np.zeros(self.sino_padded_shape, "f"),
+                axes=(-1,),
+                backend="numpy",
+            )
+
+    def _allocate_memory(self):
+        self.d_filter_f = parray.zeros(self.queue, (self.sino_f_shape[-1],), np.complex64)
+        self.is_cpu = (self.device.type == "CPU")
+        # These are already allocated by FFT() if using the opencl backend
+        if self.fft_backend == "opencl":
+            self.d_sino_padded = self.fft.data_in
+            self.d_sino_f = self.fft.data_out
+        else:
+            # When using the numpy backend, arrays are not pre-allocated
+            self.d_sino_padded = np.zeros(self.sino_padded_shape, "f")
+            self.d_sino_f = np.zeros(self.sino_f_shape, np.complex64)
+        # These are needed for rectangular memcpy in certain cases (see below).
+        self.tmp_sino_device = parray.zeros(self.queue, self.sino_shape, "f")
+        self.tmp_sino_host = np.zeros(self.sino_shape, "f")
+
+    def get_next_power(self, n):
+        """
+        Given a number, get the closest (upper) number p such that
+        p is a power of 2, 3, 5 and 7.
+        """
+        idx = bisect(self.powers, n)
+        if self.powers[idx-1] == n:
+            return n
+        return self.powers[idx]
+
+    def _compute_filter(self, filter_name):
+        """
+
+        :param str filter_name: filter name
+        """
+        self.filter_name = filter_name or "ram-lak"
+        filter_f = compute_fourier_filter(
+            self.dwidth_padded,
+            filter_name,
+            cutoff=self.extra_options["cutoff"],
+        )[:self.dwidth_padded//2+1]  # R2C
+        self.set_filter(filter_f, normalize=True)
+
+    def set_filter(self, h_filt, normalize=True):
+        """
+        Set a filter for sinogram filtering.
+
+        :param h_filt: Filter. Each line of the sinogram will be filtered with
+            this filter. It has to be the Real-to-Complex Fourier Transform
+            of some real filter, padded to 2*sinogram_width.
+        :param normalize: Whether to normalize the filter with pi/num_angles.
+        """
+        if h_filt.size != self.sino_f_shape[-1]:
+            raise ValueError(
+                """
+                Invalid filter size: expected %d, got %d.
+                Please check that the filter is the Fourier R2C transform of
+                some real 1D filter.
+                """
+                % (self.sino_f_shape[-1], h_filt.size)
+            )
+        if not(np.iscomplexobj(h_filt)):
+            print("Warning: expected a complex Fourier filter")
+        self.filter_f = h_filt
+        if normalize:
+            self.filter_f *= pi/self.n_angles
+        self.filter_f = self.filter_f.astype(np.complex64)
+        self.d_filter_f[:] = self.filter_f[:]
+
+    def _init_kernels(self):
+        OpenclProcessing.compile_kernels(self, self.kernel_files)
+        h, w = self.d_sino_f.shape
+        self.mult_kern_args = (
+            self.queue,
+            np.int32(self.d_sino_f.shape[::-1]),
+            None,
+            self.d_sino_f.data,
+            self.d_filter_f.data,
+            np.int32(w),
+            np.int32(h)
+        )
+
+    def check_array(self, arr):
+        if arr.dtype != np.float32:
+            raise ValueError("Expected data type = numpy.float32")
+        if arr.shape != self.sino_shape:
+            raise ValueError("Expected sinogram shape %s, got %s" %
+                             (self.sino_shape, arr.shape))
+        if not(isinstance(arr, np.ndarray) or isinstance(arr, parray.Array)):
+            raise ValueError("Expected either numpy.ndarray or "
+                             "pyopencl.array.Array")
+
+    def copy2d(self, dst, src, transfer_shape, dst_offset=(0, 0),
+               src_offset=(0, 0)):
+        """
+
+        :param dst:
+        :param src:
+        :param transfer_shape:
+        :param dst_offset:
+        :param src_offset:
+        """
+        self.kernels.cpy2d(
+            self.queue,
+            np.int32(transfer_shape[::-1]),
+            None,
+            dst.data,
+            src.data,
+            np.int32(dst.shape[1]),
+            np.int32(src.shape[1]),
+            np.int32(dst_offset),
+            np.int32(src_offset),
+            np.int32(transfer_shape[::-1])
+        )
+
+    def copy2d_host(self, dst, src, transfer_shape, dst_offset=(0, 0),
+                    src_offset=(0, 0)):
+        """
+
+        :param dst:
+        :param src:
+        :param transfer_shape:
+        :param dst_offset:
+        :param src_offset:
+        """
+        s = transfer_shape
+        do = dst_offset
+        so = src_offset
+        dst[do[0]:do[0]+s[0], do[1]:do[1]+s[1]] = src[so[0]:so[0]+s[0], so[1]:so[1]+s[1]]
+
+    def _prepare_input_sino(self, sino):
+        """
+
+        :param sino: sinogram
+        """
+        self.check_array(sino)
+        self.d_sino_padded.fill(0)
+        if self.fft_backend == "opencl":
+            # OpenCL backend: FFT/mult/IFFT are done on device.
+            if isinstance(sino, np.ndarray):
+                # OpenCL backend + numpy input: copy H->D.
+                # As pyopencl does not support rectangular copies, we have to
+                # do a copy H->D in a temporary device buffer, and then call a
+                # kernel doing the rectangular D-D copy.
+                self.tmp_sino_device[:] = sino[:]
+                if self.is_cpu:
+                    self.tmp_sino_device.finish()
+                d_sino_ref = self.tmp_sino_device
+            else:
+                d_sino_ref = sino
+            # Rectangular copy D->D
+            self.copy2d(self.d_sino_padded, d_sino_ref, self.sino_shape)
+            if self.is_cpu:
+                self.d_sino_padded.finish()
+        else:
+            # Numpy backend: FFT/mult/IFFT are done on host.
+            if not(isinstance(sino, np.ndarray)):
+                # Numpy backend + pyopencl input: need to copy D->H
+                self.tmp_sino_host[:] = sino[:]
+                h_sino_ref = self.tmp_sino_host
+            else:
+                h_sino_ref = sino
+            # Rectangular copy H->H
+            self.copy2d_host(self.d_sino_padded, h_sino_ref, self.sino_shape)
+
+    def _get_output_sino(self, output):
+        """
+
+        :param Union[numpy.dtype,None] output: sinogram output.
+        :return: sinogram
+        """
+        if output is None:
+            res = np.zeros(self.sino_shape, dtype=np.float32)
+        else:
+            res = output
+        if self.fft_backend == "opencl":
+            if isinstance(res, np.ndarray):
+                # OpenCL backend + numpy output: copy D->H
+                # As pyopencl does not support rectangular copies, we first have
+                # to call a kernel doing rectangular copy D->D, then do a copy
+                # D->H.
+                self.copy2d(dst=self.tmp_sino_device,
+                            src=self.d_sino_padded,
+                            transfer_shape=self.sino_shape)
+                if self.is_cpu:
+                    self.tmp_sino_device.finish()
+                res[:] = self.tmp_sino_device[:]
+            else:
+                if self.is_cpu:
+                    self.d_sino_padded.finish()
+                self.copy2d(res, self.d_sino_padded, self.sino_shape)
+                if self.is_cpu:
+                    res.finish()
+        else:
+            if not(isinstance(res, np.ndarray)):
+                # Numpy backend + pyopencl output: rect copy H->H + copy H->D
+                self.copy2d_host(dst=self.tmp_sino_host,
+                                 src=self.d_sino_padded,
+                                 transfer_shape=self.sino_shape)
+                res[:] = self.tmp_sino_host[:]
+            else:
+                # Numpy backend + numpy output: rect copy H->H
+                self.copy2d_host(res, self.d_sino_padded, self.sino_shape)
+        return res
+
+    def _do_fft(self):
+        if self.fft_backend == "opencl":
+            self.fft.fft(self.d_sino_padded, output=self.d_sino_f)
+            if self.is_cpu:
+                self.d_sino_f.finish()
+        else:
+            # numpy backend does not support "output=" argument,
+            # and rfft always return a complex128 result.
+            res = self.fft.fft(self.d_sino_padded).astype(np.complex64)
+            self.d_sino_f[:] = res[:]
+
+    def _multiply_fourier(self):
+        if self.fft_backend == "opencl":
+            # Everything is on device. Call the multiplication kernel.
+            self.kernels.inplace_complex_mul_2Dby1D(
+                *self.mult_kern_args
+            )
+            if self.is_cpu:
+                self.d_sino_f.finish()
+        else:
+            # Everything is on host.
+            self.d_sino_f *= self.filter_f
+
+    def _do_ifft(self):
+        if self.fft_backend == "opencl":
+            if self.is_cpu:
+                self.d_sino_padded.fill(0)
+                self.d_sino_padded.finish()
+            self.fft.ifft(self.d_sino_f, output=self.d_sino_padded)
+            if self.is_cpu:
+                self.d_sino_padded.finish()
+        else:
+            # numpy backend does not support "output=" argument,
+            # and irfft always return a float64 result.
+            res = self.fft.ifft(self.d_sino_f).astype("f")
+            self.d_sino_padded[:] = res[:]
+
+    def filter_sino(self, sino, output=None):
+        """
+
+        :param sino: sinogram
+        :param output:
+        :return: filtered sinogram
+        """
+        # Handle input sinogram
+        self._prepare_input_sino(sino)
+        # FFT
+        self._do_fft()
+        # multiply with filter in the Fourier domain
+        self._multiply_fourier()
+        # iFFT
+        self._do_ifft()
+        # return
+        res = self._get_output_sino(output)
+        return res
+        #~ return output
+
+    __call__ = filter_sino
+
+
+
+
+# -------------------
+# - Compatibility  -
+# -------------------
+
+@deprecated(replacement="Backprojection.sino_filter", since_version="0.10")
+def fourier_filter(sino, filter_=None, fft_size=None):
+    """Simple np based implementation of fourier space filter.
+    This function is deprecated, please use silx.opencl.sinofilter.SinoFilter.
+
+    :param sino: of shape shape = (num_projs, num_bins)
+    :param filter: filter function to apply in fourier space
+    :fft_size: size on which perform the fft. May be larger than the sino array
+    :return: filtered sinogram
+    """
+    assert sino.ndim == 2
+    num_projs, num_bins = sino.shape
+    if fft_size is None:
+        fft_size = nextpow2(num_bins * 2 - 1)
+    else:
+        assert fft_size >= num_bins
+    if fft_size == num_bins:
+        sino_zeropadded = sino.astype(np.float32)
+    else:
+        sino_zeropadded = np.zeros((num_projs, fft_size),
+                                      dtype=np.complex64)
+        sino_zeropadded[:, :num_bins] = sino.astype(np.float32)
+
+    if filter_ is None:
+        h = np.zeros(fft_size, dtype=np.float32)
+        L2 = fft_size // 2 + 1
+        h[0] = 1 / 4.
+        j = np.linspace(1, L2, L2 // 2, False)
+        h[1:L2:2] = -1. / (np.pi ** 2 * j ** 2)
+        h[L2:] = np.copy(h[1:L2 - 1][::-1])
+        filter_ = np.fft.fft(h).astype(np.complex64)
+
+    # Linear convolution
+    sino_f = np.fft.fft(sino, fft_size)
+    sino_f = sino_f * filter_
+    sino_filtered = np.fft.ifft(sino_f)[:, :num_bins].real
+
+    return np.ascontiguousarray(sino_filtered.real, dtype=np.float32)
diff --git a/silx/opencl/statistics.py b/silx/opencl/statistics.py
new file mode 100644
index 0000000..bd8e7b7
--- /dev/null
+++ b/silx/opencl/statistics.py
@@ -0,0 +1,224 @@
+# -*- coding: utf-8 -*-
+#
+#    Project: SILX
+#             https://github.com/silx-kit/silx
+#
+#    Copyright (C) 2012-2019 European Synchrotron Radiation Facility, Grenoble, France
+#
+#    Principal author:       Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
+#
+#  Permission is hereby granted, free of charge, to any person obtaining a copy
+#  of this software and associated documentation files (the "Software"), to deal
+#  in the Software without restriction, including without limitation the rights
+#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#  copies of the Software, and to permit persons to whom the Software is
+#  furnished to do so, subject to the following conditions:
+#  .
+#  The above copyright notice and this permission notice shall be included in
+#  all copies or substantial portions of the Software.
+#  .
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#  THE SOFTWARE.
+
+"""A module for performing basic statistical analysis (min, max, mean, std) on
+large data where numpy is not very efficient.
+"""
+
+from __future__ import absolute_import, print_function, with_statement, division
+
+
+__author__ = "Jerome Kieffer"
+__license__ = "MIT"
+__date__ = "11/01/2019"
+__copyright__ = "2012-2017, ESRF, Grenoble"
+__contact__ = "jerome.kieffer@esrf.fr"
+
+import logging
+import numpy
+from collections import OrderedDict, namedtuple
+from math import sqrt
+
+from .common import pyopencl
+from .processing import EventDescription, OpenclProcessing, BufferDescription
+from .utils import concatenate_cl_kernel
+
+if pyopencl:
+    mf = pyopencl.mem_flags
+    from pyopencl.reduction import ReductionKernel
+    try:
+        from pyopencl import cltypes
+    except ImportError:
+        v = pyopencl.array.vec()
+        float8 = v.float8
+    else:
+        float8 = cltypes.float8
+
+else:
+    raise ImportError("pyopencl is not installed")
+logger = logging.getLogger(__name__)
+
+StatResults = namedtuple("StatResults", ["min", "max", "cnt", "sum", "mean",
+                                         "var", "std"])
+zero8 = "(float8)(FLT_MAX, -FLT_MAX, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f)"
+#                    min      max    cnt  cnt_e  sum   sum_e  var  var_e
+
+
+class Statistics(OpenclProcessing):
+    """A class for doing statistical analysis using OpenCL
+
+    :param List[int] size: Shape of input data to treat
+    :param numpy.dtype dtype: Input data type
+    :param numpy.ndarray template: Data template to extract size & dtype
+    :param ctx: Actual working context, left to None for automatic
+                initialization from device type or platformid/deviceid
+    :param str devicetype: Type of device, can be "CPU", "GPU", "ACC" or "ALL"
+    :param int platformid: Platform identifier as given by clinfo
+    :param int deviceid: Device identifier as given by clinfo
+    :param int block_size:
+        Preferred workgroup size, may vary depending on the outcome of the compilation
+    :param bool profile:
+        Switch on profiling to be able to profile at the kernel level,
+        store profiling elements (makes code slightly slower)
+    """
+    buffers = [
+        BufferDescription("raw", 1, numpy.float32, mf.READ_ONLY),
+        BufferDescription("converted", 1, numpy.float32, mf.READ_WRITE),
+    ]
+    kernel_files = ["preprocess.cl"]
+    mapping = {numpy.int8: "s8_to_float",
+               numpy.uint8: "u8_to_float",
+               numpy.int16: "s16_to_float",
+               numpy.uint16: "u16_to_float",
+               numpy.uint32: "u32_to_float",
+               numpy.int32: "s32_to_float"}
+
+    def __init__(self, size=None, dtype=None, template=None,
+                 ctx=None, devicetype="all", platformid=None, deviceid=None,
+                 block_size=None, profile=False
+                 ):
+        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
+                                  platformid=platformid, deviceid=deviceid,
+                                  block_size=block_size, profile=profile)
+        self.size = size
+        self.dtype = dtype
+        if template is not None:
+            self.size = template.size
+            self.dtype = template.dtype
+
+        self.buffers = [BufferDescription(i.name, i.size * self.size, i.dtype, i.flags)
+                        for i in self.__class__.buffers]
+
+        self.allocate_buffers(use_array=True)
+        self.compile_kernels()
+        self.set_kernel_arguments()
+
+    def set_kernel_arguments(self):
+        """Parametrize all kernel arguments"""
+        for val in self.mapping.values():
+            self.cl_kernel_args[val] = OrderedDict(((i, self.cl_mem[i]) for i in ("raw", "converted")))
+
+    def compile_kernels(self):
+        """Compile the kernel"""
+        OpenclProcessing.compile_kernels(self,
+                                         self.kernel_files,
+                                         "-D NIMAGE=%i" % self.size)
+        compiler_options = self.get_compiler_options(x87_volatile=True)
+        src = concatenate_cl_kernel(("kahan.cl", "statistics.cl"))
+        self.reduction_comp = ReductionKernel(self.ctx,
+                                              dtype_out=float8,
+                                              neutral=zero8,
+                                              map_expr="map_statistics(data, i)",
+                                              reduce_expr="reduce_statistics(a,b)",
+                                              arguments="__global float *data",
+                                              preamble=src,
+                                              options=compiler_options)
+        self.reduction_simple = ReductionKernel(self.ctx,
+                                                dtype_out=float8,
+                                                neutral=zero8,
+                                                map_expr="map_statistics(data, i)",
+                                                reduce_expr="reduce_statistics_simple(a,b)",
+                                                arguments="__global float *data",
+                                                preamble=src,
+                                                options=compiler_options)
+
+    def send_buffer(self, data, dest):
+        """
+        Send a numpy array to the device, including the cast on the device if
+        possible
+
+        :param numpy.ndarray data: numpy array with data
+        :param dest: name of the buffer as registered in the class
+        """
+
+        dest_type = numpy.dtype([i.dtype for i in self.buffers if i.name == dest][0])
+        events = []
+        if (data.dtype == dest_type) or (data.dtype.itemsize > dest_type.itemsize):
+            copy_image = pyopencl.enqueue_copy(self.queue,
+                                               self.cl_mem[dest].data,
+                                               numpy.ascontiguousarray(data, dest_type))
+            events.append(EventDescription("copy H->D %s" % dest, copy_image))
+        else:
+            copy_image = pyopencl.enqueue_copy(self.queue,
+                                               self.cl_mem["raw"].data,
+                                               numpy.ascontiguousarray(data))
+            kernel = getattr(self.program, self.mapping[data.dtype.type])
+            cast_to_float = kernel(self.queue,
+                                   (self.size,),
+                                   None,
+                                   self.cl_mem["raw"].data,
+                                   self.cl_mem[dest].data)
+            events += [
+                EventDescription("copy H->D %s" % dest, copy_image),
+                EventDescription("cast to float", cast_to_float)
+            ]
+        if self.profile:
+            self.events += events
+        return events
+
+    def process(self, data, comp=True):
+        """Actually calculate the statics on the data
+
+        :param numpy.ndarray data: numpy array with the image
+        :param comp: use Kahan compensated arithmetics for the calculation 
+        :return: Statistics named tuple
+        :rtype: StatResults
+        """
+        if data.ndim != 1:
+            data = data.ravel()
+        size = data.size
+        assert size <= self.size, "size is OK"
+        events = []
+        with self.sem:
+            self.send_buffer(data, "converted")
+            if comp:
+                reduction = self.reduction_comp
+            else:
+                reduction = self.reduction_simple
+            res_d, evt = reduction(self.cl_mem["converted"][:self.size],
+                                   queue=self.queue,
+                                   return_event=True)
+            events.append(EventDescription("statistical reduction %s" % ("comp"if comp else "simple"), evt))
+            if self.profile:
+                self.events += events
+            res_h = res_d.get()
+        min_ = 1.0 * res_h["s0"]
+        max_ = 1.0 * res_h["s1"]
+        count = 1.0 * res_h["s2"] + res_h["s3"]
+        sum_ = 1.0 * res_h["s4"] + res_h["s5"]
+        m2 = 1.0 * res_h["s6"] + res_h["s7"]
+        var = m2 / (count - 1.0)
+        res = StatResults(min_,
+                          max_,
+                          count,
+                          sum_,
+                          sum_ / count,
+                          var,
+                          sqrt(var))
+        return res
+
+    __call__ = process
diff --git a/silx/opencl/test/__init__.py b/silx/opencl/test/__init__.py
index 2fe88ea..f3121d5 100644
--- a/silx/opencl/test/__init__.py
+++ b/silx/opencl/test/__init__.py
@@ -24,7 +24,7 @@
 
 __authors__ = ["J. Kieffer"]
 __license__ = "MIT"
-__date__ = "17/10/2017"
+__date__ = "11/01/2019"
 
 import os
 import unittest
@@ -36,6 +36,9 @@ from . import test_linalg
 from . import test_array_utils
 from ..codec import test as test_codec
 from . import test_image
+from . import test_kahan
+from . import test_stats
+
 
 def suite():
     test_suite = unittest.TestSuite()
@@ -47,6 +50,8 @@ def suite():
     test_suite.addTests(test_array_utils.suite())
     test_suite.addTests(test_codec.suite())
     test_suite.addTests(test_image.suite())
+    test_suite.addTests(test_kahan.suite())
+    test_suite.addTests(test_stats.suite())
     # Allow to remove sift from the project
     test_base_dir = os.path.dirname(__file__)
     sift_dir = os.path.join(test_base_dir, "..", "sift")
diff --git a/silx/opencl/test/test_backprojection.py b/silx/opencl/test/test_backprojection.py
index 70ce2ae..7ab416d 100644
--- a/silx/opencl/test/test_backprojection.py
+++ b/silx/opencl/test/test_backprojection.py
@@ -35,8 +35,9 @@ __date__ = "19/01/2018"
 
 import time
 import logging
-import numpy
+import numpy as np
 import unittest
+from math import pi
 try:
     import mako
 except ImportError:
@@ -44,6 +45,7 @@ except ImportError:
 from ..common import ocl
 if ocl:
     from .. import backprojection
+    from ..sinofilter import compute_fourier_filter
 from silx.test.utils import utilstest
 
 logger = logging.getLogger(__name__)
@@ -55,7 +57,7 @@ def generate_coords(img_shp, center=None):
     The zero is at the center of the image.
     """
     l_r, l_c = float(img_shp[0]), float(img_shp[1])
-    R, C = numpy.mgrid[:l_r, :l_c]
+    R, C = np.mgrid[:l_r, :l_c]
     if center is None:
         center0, center1 = l_r / 2., l_c / 2.
     else:
@@ -71,7 +73,7 @@ def clip_circle(img, center=None, radius=None):
     """
     R, C = generate_coords(img.shape, center)
     M = R * R + C * C
-    res = numpy.zeros_like(img)
+    res = np.zeros_like(img)
     if radius is None:
         radius = img.shape[0] / 2. - 1
     mask = M < radius * radius
@@ -85,23 +87,29 @@ class TestFBP(unittest.TestCase):
     def setUp(self):
         if ocl is None:
             return
-        # ~ if sys.platform.startswith('darwin'):
-            # ~ self.skipTest("Backprojection is not implemented on CPU for OS X yet")
         self.getfiles()
         self.fbp = backprojection.Backprojection(self.sino.shape, profile=True)
         if self.fbp.compiletime_workgroup_size < 16 * 16:
-            self.skipTest("Current implementation of OpenCL backprojection is not supported on this platform yet")
+            self.skipTest("Current implementation of OpenCL backprojection is "
+                          "not supported on this platform yet")
+        # Astra does not use the same backprojector implementation.
+        # Therefore, we cannot expect results to be the "same" (up to float32
+        # numerical error)
+        self.tol = 5e-2
+        if self.fbp.is_cpu:
+            # Precision is less when using CPU
+            self.tol *= 2
 
     def tearDown(self):
         self.sino = None
-#         self.fbp.log_profile()
+        # self.fbp.log_profile()
         self.fbp = None
 
     def getfiles(self):
         # load sinogram of 512x512 MRI phantom
-        self.sino = numpy.load(utilstest.getfile("sino500.npz"))["data"]
+        self.sino = np.load(utilstest.getfile("sino500.npz"))["data"]
         # load reconstruction made with ASTRA FBP (with filter designed in spatial domain)
-        self.reference_rec = numpy.load(utilstest.getfile("rec_astra_500.npz"))["data"]
+        self.reference_rec = np.load(utilstest.getfile("rec_astra_500.npz"))["data"]
 
     def measure(self):
         "Common measurement of timings"
@@ -124,8 +132,8 @@ class TestFBP(unittest.TestCase):
         ref_clipped = clip_circle(self.reference_rec)
         delta = abs(res_clipped - ref_clipped)
         bad = delta > 1
-#         numpy.save("/tmp/bad.npy", bad.astype(int))
-        logger.debug("Absolute difference: %s with %s outlier pixels out of %s", delta.max(), bad.sum(), numpy.prod(bad.shape))
+        logger.debug("Absolute difference: %s with %s outlier pixels out of %s"
+                     "", delta.max(), bad.sum(), np.prod(bad.shape))
         return delta.max()
 
     @unittest.skipUnless(ocl and mako, "pyopencl is missing")
@@ -143,21 +151,59 @@ class TestFBP(unittest.TestCase):
             err = self.compare(res)
             msg = str("Max error = %e" % err)
             logger.info(msg)
-            # TODO: cannot do better than 1e0 ?
-            # The plain backprojection was much better, so it must be an issue in the filtering process
-            self.assertTrue(err < 1., "Max error is too high")
+            self.assertTrue(err < self.tol, "Max error is too high")
+
         # Test multiple reconstructions
         # -----------------------------
-        res0 = numpy.copy(res)
+        res0 = np.copy(res)
         for i in range(10):
             res = self.fbp.filtered_backprojection(self.sino)
-            errmax = numpy.max(numpy.abs(res - res0))
+            errmax = np.max(np.abs(res - res0))
             self.assertTrue(errmax < 1.e-6, "Max error is too high")
 
+    @unittest.skipUnless(ocl and mako, "pyopencl is missing")
+    def test_fbp_filters(self):
+        """
+        Test the different available filters of silx FBP.
+        """
+        avail_filters = [
+            "ramlak", "shepp-logan", "cosine", "hamming",
+            "hann"
+        ]
+        # Create a Dirac delta function at a single angle view.
+        # As the filters are radially invarant:
+        #   - backprojection yields an image where each line is a Dirac.
+        #   - FBP yields an image where each line is the spatial filter
+        # One can simply filter "dirac" without backprojecting it, but this
+        # test will also ensure that backprojection behaves well.
+        dirac = np.zeros_like(self.sino)
+        na, dw = dirac.shape
+        dirac[0, dw//2] = na / pi * 2
+
+        for filter_name in avail_filters:
+            B = backprojection.Backprojection(dirac.shape, filter_name=filter_name)
+            r = B(dirac)
+            # Check that radial invariance is kept
+            std0 = np.max(np.abs(np.std(r, axis=0)))
+            self.assertTrue(
+                std0 < 5.e-6,
+                "Something wrong with FBP(filter=%s)" % filter_name
+            )
+            # Check that the filter is retrieved
+            r_f = np.fft.fft(np.fft.fftshift(r[0])).real / 2.  # filter factor
+            ref_filter_f = compute_fourier_filter(dw, filter_name)
+            errmax = np.max(np.abs(r_f - ref_filter_f))
+            logger.info("FBP filter %s: max error=%e" % (filter_name, errmax))
+            self.assertTrue(
+                errmax < 1.e-3,
+                "Something wrong with FBP(filter=%s)" % filter_name
+            )
+
 
 def suite():
     testSuite = unittest.TestSuite()
     testSuite.addTest(TestFBP("test_fbp"))
+    testSuite.addTest(TestFBP("test_fbp_filters"))
     return testSuite
 
 
diff --git a/silx/opencl/test/test_kahan.py b/silx/opencl/test/test_kahan.py
new file mode 100644
index 0000000..bb9ea3f
--- /dev/null
+++ b/silx/opencl/test/test_kahan.py
@@ -0,0 +1,269 @@
+#!/usr/bin/env python
+# coding: utf-8
+#
+#    Project: Azimuthal integration
+#             https://github.com/silx-kit/pyFAI
+#
+#    Copyright (C) 2015-2019 European Synchrotron Radiation Facility, Grenoble, France
+#
+#    Principal author:       Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+"test suite for OpenCL code"
+
+from __future__ import absolute_import, division, print_function
+
+__author__ = "Jérôme Kieffer"
+__contact__ = "Jerome.Kieffer@ESRF.eu"
+__license__ = "MIT"
+__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
+__date__ = "30/01/2019"
+
+
+import unittest
+import numpy
+import logging
+import platform
+
+logger = logging.getLogger(__name__)
+try:
+    import pyopencl
+except ImportError as error:
+    logger.warning("OpenCL module (pyopencl) is not present, skip tests. %s.", error)
+    pyopencl = None
+
+from .. import ocl
+if ocl is not None:
+    from ..utils import read_cl_file
+    from .. import pyopencl
+    import pyopencl.array
+from ...test.utils import test_options
+
+
+class TestKahan(unittest.TestCase):
+    """
+    Test the kernels for compensated math in OpenCL
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        if not test_options.WITH_OPENCL_TEST:
+            raise unittest.SkipTest("User request to skip OpenCL tests")
+        if pyopencl is None or ocl is None:
+            raise unittest.SkipTest("OpenCL module (pyopencl) is not present or no device available")
+
+        cls.ctx = ocl.create_context(devicetype="GPU")
+        cls.queue = pyopencl.CommandQueue(cls.ctx, properties=pyopencl.command_queue_properties.PROFILING_ENABLE)
+
+        # this is running 32 bits OpenCL woth POCL
+        if (platform.machine() in ("i386", "i686", "x86_64") and (tuple.__itemsize__ == 4) and
+                cls.ctx.devices[0].platform.name == 'Portable Computing Language'):
+            cls.args = "-DX87_VOLATILE=volatile"
+        else:
+            cls.args = ""
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.queue = None
+        cls.ctx = None
+
+    @staticmethod
+    def dummy_sum(ary, dtype=None):
+        "perform the actual sum in a dummy way "
+        if dtype is None:
+            dtype = ary.dtype.type
+        sum_ = dtype(0)
+        for i in ary:
+            sum_ += i
+        return sum_
+
+    def test_kahan(self):
+        # simple test
+        N = 26
+        data = (1 << (N - 1 - numpy.arange(N))).astype(numpy.float32)
+
+        ref64 = numpy.sum(data, dtype=numpy.float64)
+        ref32 = self.dummy_sum(data)
+        if (ref64 == ref32):
+            logger.warning("Kahan: invalid tests as float32 provides the same result as float64")
+        # Dummy kernel to evaluate
+        src = """
+        kernel void summation(global float* data,
+                                           int size,
+                                    global float* result)
+        {
+            float2 acc = (float2)(0.0f, 0.0f);
+            for (int i=0; i<size; i++)
+            {
+                acc = kahan_sum(acc, data[i]);
+            }
+            result[0] = acc.s0;
+            result[1] = acc.s1;
+        }
+        """
+        prg = pyopencl.Program(self.ctx, read_cl_file("kahan.cl") + src).build(self.args)
+        ones_d = pyopencl.array.to_device(self.queue, data)
+        res_d = pyopencl.array.zeros(self.queue, 2, numpy.float32)
+        evt = prg.summation(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
+        evt.wait()
+        res = res_d.get().sum(dtype=numpy.float64)
+        self.assertEqual(ref64, res, "test_kahan")
+
+    def test_dot16(self):
+        # simple test
+        N = 16
+        data = (1 << (N - 1 - numpy.arange(N))).astype(numpy.float32)
+
+        ref64 = numpy.dot(data.astype(numpy.float64), data.astype(numpy.float64))
+        ref32 = numpy.dot(data, data)
+        if (ref64 == ref32):
+            logger.warning("dot16: invalid tests as float32 provides the same result as float64")
+        # Dummy kernel to evaluate
+        src = """
+        kernel void test_dot16(global float* data,
+                                           int size,
+                               global float* result)
+        {
+            float2 acc = (float2)(0.0f, 0.0f);
+            float16 data16 = (float16) (data[0],data[1],data[2],data[3],data[4],
+                                        data[5],data[6],data[7],data[8],data[9],
+                         data[10],data[11],data[12],data[13],data[14],data[15]);
+            acc = comp_dot16(data16, data16);
+            result[0] = acc.s0;
+            result[1] = acc.s1;
+        }
+
+        kernel void test_dot8(global float* data,
+                                           int size,
+                               global float* result)
+        {
+            float2 acc = (float2)(0.0f, 0.0f);
+            float8 data0 = (float8) (data[0],data[2],data[4],data[6],data[8],data[10],data[12],data[14]);
+            float8 data1 = (float8) (data[1],data[3],data[5],data[7],data[9],data[11],data[13],data[15]);
+            acc = comp_dot8(data0, data1);
+            result[0] = acc.s0;
+            result[1] = acc.s1;
+        }
+
+        kernel void test_dot4(global float* data,
+                                           int size,
+                               global float* result)
+        {
+            float2 acc = (float2)(0.0f, 0.0f);
+            float4 data0 = (float4) (data[0],data[4],data[8],data[12]);
+            float4 data1 = (float4) (data[3],data[7],data[11],data[15]);
+            acc = comp_dot4(data0, data1);
+            result[0] = acc.s0;
+            result[1] = acc.s1;
+        }
+
+        kernel void test_dot3(global float* data,
+                                           int size,
+                               global float* result)
+        {
+            float2 acc = (float2)(0.0f, 0.0f);
+            float3 data0 = (float3) (data[0],data[4],data[12]);
+            float3 data1 = (float3) (data[3],data[11],data[15]);
+            acc = comp_dot3(data0, data1);
+            result[0] = acc.s0;
+            result[1] = acc.s1;
+        }
+
+        kernel void test_dot2(global float* data,
+                                           int size,
+                               global float* result)
+        {
+            float2 acc = (float2)(0.0f, 0.0f);
+            float2 data0 = (float2) (data[0],data[14]);
+            float2 data1 = (float2) (data[1],data[15]);
+            acc = comp_dot2(data0, data1);
+            result[0] = acc.s0;
+            result[1] = acc.s1;
+        }
+
+        """
+
+        prg = pyopencl.Program(self.ctx, read_cl_file("kahan.cl") + src).build(self.args)
+        ones_d = pyopencl.array.to_device(self.queue, data)
+        res_d = pyopencl.array.zeros(self.queue, 2, numpy.float32)
+        evt = prg.test_dot16(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
+        evt.wait()
+        res = res_d.get().sum(dtype="float64")
+        self.assertEqual(ref64, res, "test_dot16")
+
+        res_d.fill(0)
+        data0 = data[0::2]
+        data1 = data[1::2]
+        ref64 = numpy.dot(data0.astype(numpy.float64), data1.astype(numpy.float64))
+        ref32 = numpy.dot(data0, data1)
+        if (ref64 == ref32):
+            logger.warning("dot8: invalid tests as float32 provides the same result as float64")
+        evt = prg.test_dot8(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
+        evt.wait()
+        res = res_d.get().sum(dtype="float64")
+        self.assertEqual(ref64, res, "test_dot8")
+
+        res_d.fill(0)
+        data0 = data[0::4]
+        data1 = data[3::4]
+        ref64 = numpy.dot(data0.astype(numpy.float64), data1.astype(numpy.float64))
+        ref32 = numpy.dot(data0, data1)
+        if (ref64 == ref32):
+            logger.warning("dot4: invalid tests as float32 provides the same result as float64")
+        evt = prg.test_dot4(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
+        evt.wait()
+        res = res_d.get().sum(dtype="float64")
+        self.assertEqual(ref64, res, "test_dot4")
+
+        res_d.fill(0)
+        data0 = numpy.array([data[0], data[4], data[12]])
+        data1 = numpy.array([data[3], data[11], data[15]])
+        ref64 = numpy.dot(data0.astype(numpy.float64), data1.astype(numpy.float64))
+        ref32 = numpy.dot(data0, data1)
+        if (ref64 == ref32):
+            logger.warning("dot3: invalid tests as float32 provides the same result as float64")
+        evt = prg.test_dot3(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
+        evt.wait()
+        res = res_d.get().sum(dtype="float64")
+        self.assertEqual(ref64, res, "test_dot3")
+
+        res_d.fill(0)
+        data0 = numpy.array([data[0], data[14]])
+        data1 = numpy.array([data[1], data[15]])
+        ref64 = numpy.dot(data0.astype(numpy.float64), data1.astype(numpy.float64))
+        ref32 = numpy.dot(data0, data1)
+        if (ref64 == ref32):
+            logger.warning("dot2: invalid tests as float32 provides the same result as float64")
+        evt = prg.test_dot2(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
+        evt.wait()
+        res = res_d.get().sum(dtype="float64")
+        self.assertEqual(ref64, res, "test_dot2")
+
+
+def suite():
+    testsuite = unittest.TestSuite()
+    loader = unittest.defaultTestLoader.loadTestsFromTestCase
+    testsuite.addTest(loader(TestKahan))
+    return testsuite
+
+
+if __name__ == '__main__':
+    runner = unittest.TextTestRunner()
+    runner.run(suite())
diff --git a/silx/opencl/test/test_stats.py b/silx/opencl/test/test_stats.py
new file mode 100644
index 0000000..b5127c8
--- /dev/null
+++ b/silx/opencl/test/test_stats.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+#    Project: Sift implementation in Python + OpenCL
+#             https://github.com/silx-kit/silx
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+"""
+Simple test of an addition
+"""
+
+from __future__ import division, print_function
+
+__authors__ = ["Henri Payno, Jérôme Kieffer"]
+__contact__ = "jerome.kieffer@esrf.eu"
+__license__ = "MIT"
+__copyright__ = "2013 European Synchrotron Radiation Facility, Grenoble, France"
+__date__ = "13/12/2018"
+
+import logging
+import time
+import numpy
+
+import unittest
+from ..common import ocl
+if ocl:
+    import pyopencl
+    import pyopencl.array
+    from ..statistics import StatResults, Statistics
+from ..utils import get_opencl_code
+logger = logging.getLogger(__name__)
+
+
+@unittest.skipUnless(ocl, "PyOpenCl is missing")
+class TestStatistics(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.size = 1 << 20  # 1 million elements
+        cls.data = numpy.random.randint(0, 65000, cls.size).astype("uint16")
+        t0 = time.time()
+        cls.ref = StatResults(cls.data.min(), cls.data.max(), cls.data.size,
+                              cls.data.sum(), cls.data.mean(), cls.data.std() ** 2,
+                              cls.data.std())
+        t1 = time.time()
+        cls.ref_time = t1 - t0
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.size = cls.ref = cls.data = cls.ref_time = None
+
+    @classmethod
+    def validate(cls, res):
+        return (
+            (res.min == cls.ref.min) and
+            (res.max == cls.ref.max) and
+            (res.cnt == cls.ref.cnt) and
+            abs(res.mean - cls.ref.mean) < 0.01 and
+            abs(res.std - cls.ref.std) < 0.1)
+
+    def test_measurement(self):
+        """
+        tests that all devices are working properly ...
+        """
+        logger.info("Reference results: %s", self.ref)
+        for pid, platform in enumerate(ocl.platforms):
+            for did, device in enumerate(platform.devices):
+                try:
+                    s = Statistics(template=self.data, platformid=pid, deviceid=did)
+                except Exception as err:
+                    failed_init = True
+                    res = StatResults(0,0,0,0,0,0,0)
+                else:
+                    failed_init = False
+                    t0 = time.time()
+                    res = s(self.data)
+                    t1 = time.time()
+                logger.warning("failed_init %s", failed_init)
+                if failed_init or not self.validate(res):
+                    logger.error("Failed on platform %s device %s", platform, device)
+                    logger.error("Reference results: %s", self.ref)
+                    logger.error("Faulty results: %s", res)
+                    self.assertTrue(False, "Stat calculation failed on %s %s" % (platform, device))
+                logger.info("Runtime on %s/%s : %.3fms x%.1f", platform, device, 1000 * (t1 - t0), self.ref_time / (t1 - t0))
+
+
+def suite():
+    testSuite = unittest.TestSuite()
+    testSuite.addTest(TestStatistics("test_measurement"))
+    return testSuite
+
+
+if __name__ == '__main__':
+    unittest.main(defaultTest="suite")