36 files changed, 2550 insertions, 1462 deletions
diff --git a/src/silx/opencl/__init__.py b/src/silx/opencl/__init__.py
index fbd1f88..466ffaf 100644
--- a/src/silx/opencl/__init__.py
+++ b/src/silx/opencl/__init__.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 #
 #    Project: S I L X project
 #             https://github.com/silx-kit/silx
diff --git a/src/silx/opencl/atomic.py b/src/silx/opencl/atomic.py
new file mode 100644
index 0000000..16d3eff
--- /dev/null
+++ b/src/silx/opencl/atomic.py
@@ -0,0 +1,93 @@
+#
+#    Project: S I L X project
+#             https://github.com/silx-kit/silx
+#
+#    Copyright (C) 2012-2023 European Synchrotron Radiation Facility, Grenoble, France
+#
+#    Principal author:       Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
+#
+#  Permission is hereby granted, free of charge, to any person obtaining a copy
+#  of this software and associated documentation files (the "Software"), to deal
+#  in the Software without restriction, including without limitation the rights
+#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#  copies of the Software, and to permit persons to whom the Software is
+#  furnished to do so, subject to the following conditions:
+#  .
+#  The above copyright notice and this permission notice shall be included in
+#  all copies or substantial portions of the Software.
+#  .
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#  THE SOFTWARE.
+
+"""
+Utilities around atomic operation in OpenCL
+"""
+
+__author__ = "Jerome Kieffer"
+__license__ = "MIT"
+__date__ = "14/06/2023"
+__copyright__ = "2023-2023, ESRF, Grenoble"
+__contact__ = "jerome.kieffer@esrf.fr"
+
+import numpy
+import pyopencl
+import pyopencl.array as cla
+
+
+def check_atomic32(device):
+    try:
+        ctx = pyopencl.Context(devices=[device])
+    except:
+        return False, f"Unable to create context on {device}"
+    else:
+        queue = pyopencl.CommandQueue(ctx)
+    src = """
+kernel void check_atomic32(global int* ary){
+int res = atom_inc(ary);
+}
+"""
+    try:
+        prg = pyopencl.Program(ctx, src).build()
+    except Exception as err:
+        return False, f"{type(err)}: {err}"
+    a = numpy.zeros(1, numpy.int32)
+    d = cla.to_device(queue, a)
+    prg.check_atomic32(queue, (1024,), (32,), d.data).wait()
+    value = d.get()[0]
+    return value == 1024, f"Got the proper value 1024=={value}"
+
+
+def check_atomic64(device):
+    try:
+        ctx = pyopencl.Context(devices=[device])
+    except:
+        return False, f"Unable to create context on {device}"
+    else:
+        queue = pyopencl.CommandQueue(ctx)
+    if (
+        device.platform.name == "Portable Computing Language"
+        and "GPU" in pyopencl.device_type.to_string(device.type).upper()
+    ):
+        # this configuration is known to seg-fault
+        return False, "PoCL + GPU do not support atomic64"
+    src = """
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+kernel void check_atomic64(global long* ary){
+long res = atom_inc(ary);
+}
+"""
+    try:
+        prg = pyopencl.Program(ctx, src).build()
+    except Exception as err:
+        return False, f"{type(err)}: {err}"
+    a = numpy.zeros(1, numpy.int64)
+    d = cla.to_device(queue, a)
+    prg.check_atomic64(queue, (1024,), (32,), d.data).wait()
+    value = d.get()[0]
+    return value == 1024, f"Got the proper value 1024=={value}"
diff --git a/src/silx/opencl/backprojection.py b/src/silx/opencl/backprojection.py
index 65a9836..5af2bc5 100644
--- a/src/silx/opencl/backprojection.py
+++ b/src/silx/opencl/backprojection.py
@@ -1,8 +1,7 @@
 #!/usr/bin/env python
-# coding: utf-8
 # /*##########################################################################
 #
-# Copyright (c) 2016 European Synchrotron Radiation Facility
+# Copyright (c) 2016-2023 European Synchrotron Radiation Facility
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -25,8 +24,6 @@
 # ###########################################################################*/
 """Module for (filtered) backprojection on the GPU"""
 
-from __future__ import absolute_import, print_function, with_statement, division
-
 __authors__ = ["A. Mirone, P. Paleo"]
 __license__ = "MIT"
 __date__ = "25/01/2019"
@@ -37,8 +34,6 @@ import numpy as np
 from .common import pyopencl
 from .processing import EventDescription, OpenclProcessing, BufferDescription
 from .sinofilter import SinoFilter
-from .sinofilter import fourier_filter as fourier_filter_
-from ..utils.deprecation import deprecated
 
 if pyopencl:
     mf = pyopencl.mem_flags
@@ -64,12 +59,23 @@ def _idivup(a, b):
 
 class Backprojection(OpenclProcessing):
     """A class for performing the backprojection using OpenCL"""
+
     kernel_files = ["backproj.cl", "array_utils.cl"]
 
-    def __init__(self, sino_shape, slice_shape=None, axis_position=None,
-                 angles=None, filter_name=None, ctx=None, devicetype="all",
-                 platformid=None, deviceid=None, profile=False,
-                 extra_options=None):
+    def __init__(
+        self,
+        sino_shape,
+        slice_shape=None,
+        axis_position=None,
+        angles=None,
+        filter_name=None,
+        ctx=None,
+        devicetype="all",
+        platformid=None,
+        deviceid=None,
+        profile=False,
+        extra_options=None,
+    ):
         """Constructor of the OpenCL (filtered) backprojection
 
         :param sino_shape: shape of the sinogram. The sinogram is in the format
@@ -101,19 +107,26 @@ class Backprojection(OpenclProcessing):
         #  assuming no discrete GPU
         #    raise NotImplementedError("Backprojection is not implemented on CPU for OS X yet")
 
-        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
-                                  platformid=platformid, deviceid=deviceid,
-                                  profile=profile)
+        OpenclProcessing.__init__(
+            self,
+            ctx=ctx,
+            devicetype=devicetype,
+            platformid=platformid,
+            deviceid=deviceid,
+            profile=profile,
+        )
 
-        self._init_geometry(sino_shape, slice_shape, angles, axis_position,
-                           extra_options)
+        self._init_geometry(
+            sino_shape, slice_shape, angles, axis_position, extra_options
+        )
         self._allocate_memory()
         self._compute_angles()
         self._init_kernels()
         self._init_filter(filter_name)
 
-    def _init_geometry(self, sino_shape, slice_shape, angles, axis_position,
-                      extra_options):
+    def _init_geometry(
+        self, sino_shape, slice_shape, angles, axis_position, extra_options
+    ):
         """Geometry Initialization
 
         :param sino_shape: shape of the sinogram. The sinogram is in the format
@@ -137,12 +150,12 @@ class Backprojection(OpenclProcessing):
             self.slice_shape = slice_shape
         self.dimrec_shape = (
             _idivup(self.slice_shape[0], 32) * 32,
-            _idivup(self.slice_shape[1], 32) * 32
+            _idivup(self.slice_shape[1], 32) * 32,
         )
         if axis_position:
             self.axis_pos = np.float32(axis_position)
         else:
-            self.axis_pos = np.float32((sino_shape[1] - 1.) / 2)
+            self.axis_pos = np.float32((sino_shape[1] - 1.0) / 2)
         self.axis_array = None  # TODO: add axis correction front-end
         self._init_extra_options(extra_options)
 
@@ -152,11 +165,11 @@ class Backprojection(OpenclProcessing):
         :param dict extra_options: Advanced extra options
         """
         self.extra_options = {
-            "cutoff": 1.,
+            "cutoff": 1.0,
             "use_numpy_fft": False,
             # It is  axis_pos - (num_bins-1)/2  in PyHST
-            "gpu_offset_x": 0., #self.axis_pos - (self.num_bins - 1) / 2.,
-            "gpu_offset_y": 0., #self.axis_pos - (self.num_bins - 1) / 2.
+            "gpu_offset_x": 0.0,  # self.axis_pos - (self.num_bins - 1) / 2.,
+            "gpu_offset_y": 0.0,  # self.axis_pos - (self.num_bins - 1) / 2.
         }
         if extra_options is not None:
             self.extra_options.update(extra_options)
@@ -169,7 +182,9 @@ class Backprojection(OpenclProcessing):
         # Device memory
         self.buffers = [
             BufferDescription("_d_slice", self.dimrec_shape, np.float32, mf.READ_WRITE),
-            BufferDescription("d_sino", self.shape, np.float32, mf.READ_WRITE),  # before transferring to texture (if available)
+            BufferDescription(
+                "d_sino", self.shape, np.float32, mf.READ_WRITE
+            ),  # before transferring to texture (if available)
             BufferDescription("d_cos", (self.num_projs,), np.float32, mf.READ_ONLY),
             BufferDescription("d_sin", (self.num_projs,), np.float32, mf.READ_ONLY),
             BufferDescription("d_axes", (self.num_projs,), np.float32, mf.READ_ONLY),
@@ -194,27 +209,29 @@ class Backprojection(OpenclProcessing):
         if self.axis_array:
             self.cl_mem["d_axes"][:] = self.axis_array.astype(np.float32)[:]
         else:
-            self.cl_mem["d_axes"][:] = np.ones(self.num_projs, dtype="f") * self.axis_pos
+            self.cl_mem["d_axes"][:] = (
+                np.ones(self.num_projs, dtype="f") * self.axis_pos
+            )
 
     def _init_kernels(self):
         compile_options = None
-        if not(self._use_textures):
+        if not (self._use_textures):
             compile_options = "-DDONT_USE_TEXTURES"
         OpenclProcessing.compile_kernels(
-            self,
-            self.kernel_files,
-            compile_options=compile_options
+            self, self.kernel_files, compile_options=compile_options
         )
         # check that workgroup can actually be (16, 16)
-        self.compiletime_workgroup_size = self.kernels.max_workgroup_size("backproj_cpu_kernel")
+        self.compiletime_workgroup_size = self.kernels.max_workgroup_size(
+            "backproj_cpu_kernel"
+        )
         # Workgroup and ndrange sizes are always the same
         self.wg = (16, 16)
         self.ndrange = (
             _idivup(int(self.dimrec_shape[1]), 32) * self.wg[0],
-            _idivup(int(self.dimrec_shape[0]), 32) * self.wg[1]
+            _idivup(int(self.dimrec_shape[0]), 32) * self.wg[1],
         )
         # Prepare arguments for the kernel call
-        if not(self._use_textures):
+        if not (self._use_textures):
             d_sino_ref = self.d_sino.data
         else:
             d_sino_ref = self.d_sino_tex
@@ -229,7 +246,7 @@ class Backprojection(OpenclProcessing):
             self.cl_mem["_d_slice"].data,
             # d_sino (__read_only image2d_t or float*)
             d_sino_ref,
-            # gpu_offset_x (float32) 
+            # gpu_offset_x (float32)
             np.float32(self.extra_options["gpu_offset_x"]),
             # gpu_offset_y (float32)
             np.float32(self.extra_options["gpu_offset_y"]),
@@ -240,7 +257,7 @@ class Backprojection(OpenclProcessing):
             # d_axis  (__global float32*)
             self.cl_mem["d_axes"].data,
             # shared mem (__local float32*)
-            self._get_local_mem()
+            self._get_local_mem(),
         )
 
     def _allocate_textures(self):
@@ -276,7 +293,7 @@ class Backprojection(OpenclProcessing):
             np.int32(self.dimrec_shape[1]),
             np.int32((0, 0)),
             np.int32((0, 0)),
-            slice_shape_ocl
+            slice_shape_ocl,
         )
         return self.kernels.cpy2d(self.queue, ndrange, wg, *kernel_args)
 
@@ -284,47 +301,39 @@ class Backprojection(OpenclProcessing):
         if isinstance(sino, parray.Array):
             return self._transfer_device_to_texture(sino)
         sino2 = sino
-        if not(sino.flags["C_CONTIGUOUS"] and sino.dtype == np.float32):
+        if not (sino.flags["C_CONTIGUOUS"] and sino.dtype == np.float32):
             sino2 = np.ascontiguousarray(sino, dtype=np.float32)
-        if not(self._use_textures):
-            ev = pyopencl.enqueue_copy(
-                                        self.queue,
-                                        self.d_sino.data,
-                                        sino2
-                                        )
+        if not (self._use_textures):
+            ev = pyopencl.enqueue_copy(self.queue, self.d_sino.data, sino2)
             what = "transfer filtered sino H->D buffer"
             ev.wait()
         else:
             ev = pyopencl.enqueue_copy(
-                                       self.queue,
-                                       self.d_sino_tex,
-                                       sino2,
-                                       origin=(0, 0),
-                                       region=self.shape[::-1]
-                                       )
+                self.queue,
+                self.d_sino_tex,
+                sino2,
+                origin=(0, 0),
+                region=self.shape[::-1],
+            )
             what = "transfer filtered sino H->D texture"
         return EventDescription(what, ev)
 
     def _transfer_device_to_texture(self, d_sino):
-        if not(self._use_textures):
+        if not (self._use_textures):
             if id(self.d_sino) == id(d_sino):
                 return
-            ev = pyopencl.enqueue_copy(
-                                       self.queue,
-                                       self.d_sino.data,
-                                       d_sino
-                                       )
+            ev = pyopencl.enqueue_copy(self.queue, self.d_sino.data, d_sino)
             what = "transfer filtered sino D->D buffer"
             ev.wait()
         else:
             ev = pyopencl.enqueue_copy(
-                                       self.queue,
-                                       self.d_sino_tex,
-                                       d_sino.data,
-                                       offset=0,
-                                       origin=(0, 0),
-                                       region=self.shape[::-1]
-                                       )
+                self.queue,
+                self.d_sino_tex,
+                d_sino.data,
+                offset=0,
+                origin=(0, 0),
+                region=self.shape[::-1],
+            )
             what = "transfer filtered sino D->D texture"
         return EventDescription(what, ev)
 
@@ -340,20 +349,17 @@ class Backprojection(OpenclProcessing):
         with self.sem:
             events.append(self._transfer_to_texture(sino))
             # Call the backprojection kernel
-            if not(self._use_textures):
+            if not (self._use_textures):
                 kernel_to_call = self.kernels.backproj_cpu_kernel
             else:
                 kernel_to_call = self.kernels.backproj_kernel
             kernel_to_call(
-                self.queue,
-                self.ndrange,
-                self.wg,
-                *self._backproj_kernel_args
+                self.queue, self.ndrange, self.wg, *self._backproj_kernel_args
             )
             # Return
             if output is None:
                 res = self.cl_mem["_d_slice"].get()
-                res = res[:self.slice_shape[0], :self.slice_shape[1]]
+                res = res[: self.slice_shape[0], : self.slice_shape[1]]
             else:
                 res = output
                 self._cpy2d_to_slice(output)
@@ -380,18 +386,3 @@ class Backprojection(OpenclProcessing):
         return res
 
     __call__ = filtered_backprojection
-
-
-    # -------------------
-    # - Compatibility  -
-    # -------------------
-
-    @deprecated(replacement="Backprojection.sino_filter", since_version="0.10")
-    def filter_projections(self, sino, rescale=True):
-        self.sino_filter(sino, output=self.d_sino)
-
-
-
-def fourier_filter(sino, filter_=None, fft_size=None):
-    return fourier_filter_(sino, filter_=filter_, fft_size=fft_size)
-
diff --git a/src/silx/opencl/codec/bitshuffle_lz4.py b/src/silx/opencl/codec/bitshuffle_lz4.py
new file mode 100644
index 0000000..b0992b9
--- /dev/null
+++ b/src/silx/opencl/codec/bitshuffle_lz4.py
@@ -0,0 +1,214 @@
+#!/usr/bin/env python
+#
+#    Project: Sift implementation in Python + OpenCL
+#             https://github.com/silx-kit/silx
+#
+#    Copyright (C) 2022-2023  European Synchrotron Radiation Facility, Grenoble, France
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+"""
+This module provides a class for CBF byte offset compression/decompression.
+"""
+
+__authors__ = ["Jérôme Kieffer"]
+__contact__ = "jerome.kieffer@esrf.eu"
+__license__ = "MIT"
+__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
+__date__ = "09/11/2022"
+__status__ = "production"
+
+
+import os
+import struct
+import numpy
+from ..common import ocl, pyopencl, kernel_workgroup_size
+from ..processing import BufferDescription, EventDescription, OpenclProcessing
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class BitshuffleLz4(OpenclProcessing):
+    """Perform the bitshuffle-lz4 decompression on the GPU
+    See :class:`OpenclProcessing` for optional arguments description.
+    :param int cmp_size:
+        Size of the raw stream for decompression.
+        It can be (slightly) larger than the array.
+    :param int dec_size:
+        Size of the decompression output array
+        (mandatory for decompression)
+    :param dtype: dtype of decompressed data
+    """
+
+    LZ4_BLOCK_SIZE = 8192
+
+    def __init__(
+        self,
+        cmp_size,
+        dec_size,
+        dtype,
+        ctx=None,
+        devicetype="all",
+        platformid=None,
+        deviceid=None,
+        block_size=None,
+        profile=False,
+    ):
+        """Constructor of the class:
+
+        :param cmp_size: size of the compressed data buffer (in bytes)
+        :param dec_size: size of the compressed data buffer (in words)
+        :param dtype: data type of one work in decompressed array
+
+        For the other, see the doc of OpenclProcessing
+        """
+        OpenclProcessing.__init__(
+            self,
+            ctx=ctx,
+            devicetype=devicetype,
+            platformid=platformid,
+            deviceid=deviceid,
+            block_size=block_size,
+            profile=profile,
+        )
+        if self.block_size is None:
+            try:
+                self.block_size = self.ctx.devices[0].preferred_work_group_size_multiple
+            except:
+                self.block_size = self.device.max_work_group_size
+
+        self.cmp_size = numpy.uint64(cmp_size)
+        self.dec_size = numpy.uint64(dec_size)
+        self.dec_dtype = numpy.dtype(dtype)
+        self.num_blocks = numpy.uint32(
+            (self.dec_dtype.itemsize * self.dec_size + self.LZ4_BLOCK_SIZE - 1)
+            // self.LZ4_BLOCK_SIZE
+        )
+
+        buffers = [
+            BufferDescription("nb_blocks", 1, numpy.uint32, None),
+            BufferDescription("block_position", self.num_blocks, numpy.uint64, None),
+            BufferDescription("cmp", self.cmp_size, numpy.uint8, None),
+            BufferDescription("dec", self.dec_size, self.dec_dtype, None),
+        ]
+
+        self.allocate_buffers(buffers, use_array=True)
+
+        self.compile_kernels([os.path.join("codec", "bitshuffle_lz4")])
+        self.block_size = min(
+            self.block_size,
+            kernel_workgroup_size(self.program, "bslz4_decompress_block"),
+        )
+
+    def decompress(self, raw, out=None, wg=None, nbytes=None):
+        """This function actually performs the decompression by calling the kernels
+        :param numpy.ndarray raw: The compressed data as a 1D numpy array of char or string
+        :param pyopencl.array out: pyopencl array in which to place the result.
+        :param wg: tuneable parameter with the workgroup size.
+        :param int nbytes: (Optional) Number of bytes occupied by the chunk in raw.
+        :return: The decompressed image as an pyopencl array.
+        :rtype: pyopencl.array
+        """
+
+        events = []
+        with self.sem:
+            if nbytes is not None:
+                assert nbytes <= raw.size
+                len_raw = numpy.uint64(nbytes)
+            elif isinstance(raw, pyopencl.Buffer):
+                len_raw = numpy.uint64(raw.size)
+            else:
+                len_raw = numpy.uint64(len(raw))
+
+            if isinstance(raw, pyopencl.array.Array):
+                cmp_buffer = raw.data
+                num_blocks = self.num_blocks
+            elif isinstance(raw, pyopencl.Buffer):
+                cmp_buffer = raw
+                num_blocks = self.num_blocks
+            else:
+                if len_raw > self.cmp_size:
+                    self.cmp_size = len_raw
+                    logger.info("increase cmp buffer size to %s", self.cmp_size)
+                    self.cl_mem["cmp"] = pyopencl.array.empty(
+                        self.queue, self.cmp_size, dtype=numpy.uint8
+                    )
+                evt = pyopencl.enqueue_copy(
+                    self.queue, self.cl_mem["cmp"].data, raw, is_blocking=False
+                )
+                events.append(EventDescription("copy raw H -> D", evt))
+                cmp_buffer = self.cl_mem["cmp"].data
+
+                dest_size = struct.unpack(">Q", raw[:8])
+                self_dest_nbyte = self.dec_size * self.dec_dtype.itemsize
+                if dest_size < self_dest_nbyte:
+                    num_blocks = numpy.uint32(
+                        (dest_size + self.LZ4_BLOCK_SIZE - 1) // self.LZ4_BLOCK_SIZE
+                    )
+                elif dest_size > self_dest_nbyte:
+                    num_blocks = numpy.uint32(
+                        (dest_size + self.LZ4_BLOCK_SIZE - 1) // self.LZ4_BLOCK_SIZE
+                    )
+                    self.cl_mem["dec"] = pyopencl.array.empty(
+                        self.queue, dest_size, self.dec_dtype
+                    )
+                    self.dec_size = dest_size // self.dec_dtype.itemsize
+                else:
+                    num_blocks = self.num_blocks
+
+            wg = int(wg or self.block_size)
+
+            evt = self.program.lz4_unblock(
+                self.queue,
+                (1,),
+                (1,),
+                cmp_buffer,
+                len_raw,
+                self.cl_mem["block_position"].data,
+                num_blocks,
+                self.cl_mem["nb_blocks"].data,
+            )
+            events.append(EventDescription("LZ4 unblock", evt))
+
+            if out is None:
+                out = self.cl_mem["dec"]
+            else:
+                assert out.dtype == self.dec_dtype
+                assert out.size == self.dec_size
+
+            evt = self.program.bslz4_decompress_block(
+                self.queue,
+                (self.num_blocks * wg,),
+                (wg,),
+                cmp_buffer,
+                out.data,
+                self.cl_mem["block_position"].data,
+                self.cl_mem["nb_blocks"].data,
+                numpy.uint8(self.dec_dtype.itemsize),
+            )
+            events.append(EventDescription("LZ4 decompress", evt))
+        self.profile_multi(events)
+        return out
+
+    __call__ = decompress
diff --git a/src/silx/opencl/codec/byte_offset.py b/src/silx/opencl/codec/byte_offset.py
index 9a52427..e3df9b2 100644
--- a/src/silx/opencl/codec/byte_offset.py
+++ b/src/silx/opencl/codec/byte_offset.py
@@ -1,10 +1,9 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 #
 #    Project: Sift implementation in Python + OpenCL
 #             https://github.com/silx-kit/silx
 #
-#    Copyright (C) 2013-2020  European Synchrotron Radiation Facility, Grenoble, France
+#    Copyright (C) 2013-2023  European Synchrotron Radiation Facility, Grenoble, France
 #
 # Permission is hereby granted, free of charge, to any person
 # obtaining a copy of this software and associated documentation
@@ -31,8 +30,6 @@
 This module provides a class for CBF byte offset compression/decompression.
 """
 
-from __future__ import division, print_function, with_statement
-
 __authors__ = ["Jérôme Kieffer"]
 __contact__ = "jerome.kieffer@esrf.eu"
 __license__ = "MIT"
@@ -48,10 +45,12 @@ from ..common import ocl, pyopencl
 from ..processing import BufferDescription, EventDescription, OpenclProcessing
 
 import logging
+
 logger = logging.getLogger(__name__)
 
 if pyopencl:
     import pyopencl.version
+
     if pyopencl.version.VERSION < (2016, 0):
         from pyopencl.scan import GenericScanKernel, GenericDebugScanKernel
     else:
@@ -64,23 +63,36 @@ else:
 class ByteOffset(OpenclProcessing):
     """Perform the byte offset compression/decompression on the GPU
 
-        See :class:`OpenclProcessing` for optional arguments description.
-
-        :param int raw_size:
-            Size of the raw stream for decompression.
-            It can be (slightly) larger than the array.
-        :param int dec_size:
-            Size of the decompression output array
-            (mandatory for decompression)
-        """
-
-    def __init__(self, raw_size=None, dec_size=None,
-                 ctx=None, devicetype="all",
-                 platformid=None, deviceid=None,
-                 block_size=None, profile=False):
-        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
-                                  platformid=platformid, deviceid=deviceid,
-                                  block_size=block_size, profile=profile)
+    See :class:`OpenclProcessing` for optional arguments description.
+
+    :param int raw_size:
+        Size of the raw stream for decompression.
+        It can be (slightly) larger than the array.
+    :param int dec_size:
+        Size of the decompression output array
+        (mandatory for decompression)
+    """
+
+    def __init__(
+        self,
+        raw_size=None,
+        dec_size=None,
+        ctx=None,
+        devicetype="all",
+        platformid=None,
+        deviceid=None,
+        block_size=None,
+        profile=False,
+    ):
+        OpenclProcessing.__init__(
+            self,
+            ctx=ctx,
+            devicetype=devicetype,
+            platformid=platformid,
+            deviceid=deviceid,
+            block_size=block_size,
+            profile=profile,
+        )
         if self.block_size is None:
             self.block_size = self.device.max_work_group_size
         wg = self.block_size
@@ -97,7 +109,9 @@ class ByteOffset(OpenclProcessing):
                 BufferDescription("raw", self.padded_raw_size, numpy.int8, None),
                 BufferDescription("mask", self.padded_raw_size, numpy.int32, None),
                 BufferDescription("values", self.padded_raw_size, numpy.int32, None),
-                BufferDescription("exceptions", self.padded_raw_size, numpy.int32, None)
+                BufferDescription(
+                    "exceptions", self.padded_raw_size, numpy.int32, None
+                ),
             ]
 
         if dec_size is None:
@@ -106,18 +120,17 @@ class ByteOffset(OpenclProcessing):
             self.dec_size = numpy.int32(dec_size)
             buffers += [
                 BufferDescription("data_float", self.dec_size, numpy.float32, None),
-                BufferDescription("data_int", self.dec_size, numpy.int32, None)
+                BufferDescription("data_int", self.dec_size, numpy.int32, None),
             ]
 
         self.allocate_buffers(buffers, use_array=True)
 
         self.compile_kernels([os.path.join("codec", "byte_offset")])
         self.kernels.__setattr__("scan", self._init_double_scan())
-        self.kernels.__setattr__("compression_scan",
-                                 self._init_compression_scan())
+        self.kernels.__setattr__("compression_scan", self._init_compression_scan())
 
     def _init_double_scan(self):
-        """"generates a double scan on indexes and values in one operation"""
+        """generates a double scan on indexes and values in one operation"""
         arguments = "__global int *value", "__global int *index"
         int2 = pyopencl.tools.get_or_register_dtype("int2")
         input_expr = "index[i]>0 ? (int2)(0, 0) : (int2)(value[i], 1)"
@@ -126,21 +139,25 @@ class ByteOffset(OpenclProcessing):
         output_statement = "value[i] = item.s0; index[i+1] = item.s1;"
 
         if self.block_size > 256:
-            knl = GenericScanKernel(self.ctx,
-                                    dtype=int2,
-                                    arguments=arguments,
-                                    input_expr=input_expr,
-                                    scan_expr=scan_expr,
-                                    neutral=neutral,
-                                    output_statement=output_statement)
+            knl = GenericScanKernel(
+                self.ctx,
+                dtype=int2,
+                arguments=arguments,
+                input_expr=input_expr,
+                scan_expr=scan_expr,
+                neutral=neutral,
+                output_statement=output_statement,
+            )
         else:  # MacOS on CPU
-            knl = GenericDebugScanKernel(self.ctx,
-                                         dtype=int2,
-                                         arguments=arguments,
-                                         input_expr=input_expr,
-                                         scan_expr=scan_expr,
-                                         neutral=neutral,
-                                         output_statement=output_statement)
+            knl = GenericDebugScanKernel(
+                self.ctx,
+                dtype=int2,
+                arguments=arguments,
+                input_expr=input_expr,
+                scan_expr=scan_expr,
+                neutral=neutral,
+                output_statement=output_statement,
+            )
         return knl
 
     def decode(self, raw, as_float=False, out=None):
@@ -153,8 +170,9 @@ class ByteOffset(OpenclProcessing):
         :return: The decompressed image as an pyopencl array.
         :rtype: pyopencl.array
         """
-        assert self.dec_size is not None, \
-            "dec_size is a mandatory ByteOffset init argument for decompression"
+        assert (
+            self.dec_size is not None
+        ), "dec_size is a mandatory ByteOffset init argument for decompression"
 
         events = []
         with self.sem:
@@ -165,67 +183,96 @@ class ByteOffset(OpenclProcessing):
                 self.padded_raw_size = (self.raw_size + wg - 1) & ~(wg - 1)
                 logger.info("increase raw buffer size to %s", self.padded_raw_size)
                 buffers = {
-                           "raw": pyopencl.array.empty(self.queue, self.padded_raw_size, dtype=numpy.int8),
-                           "mask": pyopencl.array.empty(self.queue, self.padded_raw_size, dtype=numpy.int32),
-                           "exceptions": pyopencl.array.empty(self.queue, self.padded_raw_size, dtype=numpy.int32),
-                           "values": pyopencl.array.empty(self.queue, self.padded_raw_size, dtype=numpy.int32),
-                          }
+                    "raw": pyopencl.array.empty(
+                        self.queue, self.padded_raw_size, dtype=numpy.int8
+                    ),
+                    "mask": pyopencl.array.empty(
+                        self.queue, self.padded_raw_size, dtype=numpy.int32
+                    ),
+                    "exceptions": pyopencl.array.empty(
+                        self.queue, self.padded_raw_size, dtype=numpy.int32
+                    ),
+                    "values": pyopencl.array.empty(
+                        self.queue, self.padded_raw_size, dtype=numpy.int32
+                    ),
+                }
                 self.cl_mem.update(buffers)
             else:
                 wg = self.block_size
 
-            evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["raw"].data,
-                                        raw,
-                                        is_blocking=False)
+            evt = pyopencl.enqueue_copy(
+                self.queue, self.cl_mem["raw"].data, raw, is_blocking=False
+            )
             events.append(EventDescription("copy raw H -> D", evt))
-            evt = self.kernels.fill_int_mem(self.queue, (self.padded_raw_size,), (wg,),
-                                            self.cl_mem["mask"].data,
-                                            numpy.int32(self.padded_raw_size),
-                                            numpy.int32(0),
-                                            numpy.int32(0))
+            evt = self.kernels.fill_int_mem(
+                self.queue,
+                (self.padded_raw_size,),
+                (wg,),
+                self.cl_mem["mask"].data,
+                numpy.int32(self.padded_raw_size),
+                numpy.int32(0),
+                numpy.int32(0),
+            )
             events.append(EventDescription("memset mask", evt))
-            evt = self.kernels.fill_int_mem(self.queue, (1,), (1,),
-                                            self.cl_mem["counter"].data,
-                                            numpy.int32(1),
-                                            numpy.int32(0),
-                                            numpy.int32(0))
+            evt = self.kernels.fill_int_mem(
+                self.queue,
+                (1,),
+                (1,),
+                self.cl_mem["counter"].data,
+                numpy.int32(1),
+                numpy.int32(0),
+                numpy.int32(0),
+            )
             events.append(EventDescription("memset counter", evt))
-            evt = self.kernels.mark_exceptions(self.queue, (self.padded_raw_size,), (wg,),
-                                               self.cl_mem["raw"].data,
-                                               len_raw,
-                                               numpy.int32(self.raw_size),
-                                               self.cl_mem["mask"].data,
-                                               self.cl_mem["values"].data,
-                                               self.cl_mem["counter"].data,
-                                               self.cl_mem["exceptions"].data)
+            evt = self.kernels.mark_exceptions(
+                self.queue,
+                (self.padded_raw_size,),
+                (wg,),
+                self.cl_mem["raw"].data,
+                len_raw,
+                numpy.int32(self.raw_size),
+                self.cl_mem["mask"].data,
+                self.cl_mem["values"].data,
+                self.cl_mem["counter"].data,
+                self.cl_mem["exceptions"].data,
+            )
             events.append(EventDescription("mark exceptions", evt))
             nb_exceptions = numpy.empty(1, dtype=numpy.int32)
-            evt = pyopencl.enqueue_copy(self.queue, nb_exceptions, self.cl_mem["counter"].data,
-                                        is_blocking=False)
+            evt = pyopencl.enqueue_copy(
+                self.queue,
+                nb_exceptions,
+                self.cl_mem["counter"].data,
+                is_blocking=False,
+            )
             events.append(EventDescription("copy counter D -> H", evt))
             evt.wait()
             nbexc = int(nb_exceptions[0])
             if nbexc == 0:
                 logger.info("nbexc %i", nbexc)
             else:
-                evt = self.kernels.treat_exceptions(self.queue, (nbexc,), (1,),
-                                                    self.cl_mem["raw"].data,
-                                                    len_raw,
-                                                    self.cl_mem["mask"].data,
-                                                    self.cl_mem["exceptions"].data,
-                                                    self.cl_mem["values"].data
-                                                    )
+                evt = self.kernels.treat_exceptions(
+                    self.queue,
+                    (nbexc,),
+                    (1,),
+                    self.cl_mem["raw"].data,
+                    len_raw,
+                    self.cl_mem["mask"].data,
+                    self.cl_mem["exceptions"].data,
+                    self.cl_mem["values"].data,
+                )
                 events.append(EventDescription("treat_exceptions", evt))
 
-            #self.cl_mem["copy_values"] = self.cl_mem["values"].copy()
-            #self.cl_mem["copy_mask"] = self.cl_mem["mask"].copy()
-            evt = self.kernels.scan(self.cl_mem["values"],
-                                    self.cl_mem["mask"],
-                                    queue=self.queue,
-                                    size=int(len_raw),
-                                    wait_for=(evt,))
+            # self.cl_mem["copy_values"] = self.cl_mem["values"].copy()
+            # self.cl_mem["copy_mask"] = self.cl_mem["mask"].copy()
+            evt = self.kernels.scan(
+                self.cl_mem["values"],
+                self.cl_mem["mask"],
+                queue=self.queue,
+                size=int(len_raw),
+                wait_for=(evt,),
+            )
             events.append(EventDescription("double scan", evt))
-            #evt.wait()
+            # evt.wait()
             if out is not None:
                 if out.dtype == numpy.float32:
                     copy_results = self.kernels.copy_result_float
@@ -238,15 +285,18 @@ class ByteOffset(OpenclProcessing):
                 else:
                     out = self.cl_mem["data_int"]
                     copy_results = self.kernels.copy_result_int
-            evt = copy_results(self.queue, (self.padded_raw_size,), (wg,),
-                               self.cl_mem["values"].data,
-                               self.cl_mem["mask"].data,
-                               len_raw,
-                               self.dec_size,
-                               out.data
-                               )
+            evt = copy_results(
+                self.queue,
+                (self.padded_raw_size,),
+                (wg,),
+                self.cl_mem["values"].data,
+                self.cl_mem["mask"].data,
+                len_raw,
+                self.dec_size,
+                out.data,
+            )
             events.append(EventDescription("copy_results", evt))
-            #evt.wait()
+            # evt.wait()
             if self.profile:
                 self.events += events
         return out
@@ -294,7 +344,9 @@ class ByteOffset(OpenclProcessing):
             }
         }
         """
-        arguments = "__global const int *data, __global char *compressed, __global int *size"
+        arguments = (
+            "__global const int *data, __global char *compressed, __global int *size"
+        )
         input_expr = "compressed_size((i == 0) ? data[0] : (data[i] - data[i - 1]))"
         scan_expr = "a+b"
         neutral = "0"
@@ -306,23 +358,27 @@ class ByteOffset(OpenclProcessing):
         """
 
         if self.block_size >= 64:
-            knl = GenericScanKernel(self.ctx,
-                                    dtype=numpy.int32,
-                                    preamble=preamble,
-                                    arguments=arguments,
-                                    input_expr=input_expr,
-                                    scan_expr=scan_expr,
-                                    neutral=neutral,
-                                    output_statement=output_statement)
+            knl = GenericScanKernel(
+                self.ctx,
+                dtype=numpy.int32,
+                preamble=preamble,
+                arguments=arguments,
+                input_expr=input_expr,
+                scan_expr=scan_expr,
+                neutral=neutral,
+                output_statement=output_statement,
+            )
         else:  # MacOS on CPU
-            knl = GenericDebugScanKernel(self.ctx,
-                                         dtype=numpy.int32,
-                                         preamble=preamble,
-                                         arguments=arguments,
-                                         input_expr=input_expr,
-                                         scan_expr=scan_expr,
-                                         neutral=neutral,
-                                         output_statement=output_statement)
+            knl = GenericDebugScanKernel(
+                self.ctx,
+                dtype=numpy.int32,
+                preamble=preamble,
+                arguments=arguments,
+                input_expr=input_expr,
+                scan_expr=scan_expr,
+                neutral=neutral,
+                output_statement=output_statement,
+            )
         return knl
 
     def encode(self, data, out=None):
@@ -351,28 +407,39 @@ class ByteOffset(OpenclProcessing):
                 data = numpy.ascontiguousarray(data, dtype=numpy.int32).ravel()
 
                 # Make sure data array exists and is large enough
-                if ("data_input" not in self.cl_mem or
-                        self.cl_mem["data_input"].size < data.size):
+                if (
+                    "data_input" not in self.cl_mem
+                    or self.cl_mem["data_input"].size < data.size
+                ):
                     logger.info("increase data input buffer size to %s", data.size)
-                    self.cl_mem.update({
-                        "data_input": pyopencl.array.empty(self.queue,
-                                                           data.size,
-                                                           dtype=numpy.int32)})
+                    self.cl_mem.update(
+                        {
+                            "data_input": pyopencl.array.empty(
+                                self.queue, data.size, dtype=numpy.int32
+                            )
+                        }
+                    )
                 d_data = self.cl_mem["data_input"]
 
                 evt = pyopencl.enqueue_copy(
-                    self.queue, d_data.data, data, is_blocking=False)
+                    self.queue, d_data.data, data, is_blocking=False
+                )
                 events.append(EventDescription("copy data H -> D", evt))
 
             # Make sure compressed array exists and is large enough
             compressed_size = d_data.size * 7
-            if ("compressed" not in self.cl_mem or
-                    self.cl_mem["compressed"].size < compressed_size):
+            if (
+                "compressed" not in self.cl_mem
+                or self.cl_mem["compressed"].size < compressed_size
+            ):
                 logger.info("increase compressed buffer size to %s", compressed_size)
-                self.cl_mem.update({
-                    "compressed": pyopencl.array.empty(self.queue,
-                                                       compressed_size,
-                                                       dtype=numpy.int8)})
+                self.cl_mem.update(
+                    {
+                        "compressed": pyopencl.array.empty(
+                            self.queue, compressed_size, dtype=numpy.int8
+                        )
+                    }
+                )
             d_compressed = self.cl_mem["compressed"]
             d_size = self.cl_mem["counter"]  # Shared with decompression
 
@@ -387,13 +454,15 @@ class ByteOffset(OpenclProcessing):
                     shape=(byte_count,),
                     dtype=numpy.int8,
                     allocator=functools.partial(
-                        d_compressed.base_data.get_sub_region,
-                        d_compressed.offset))
+                        d_compressed.base_data.get_sub_region, d_compressed.offset
+                    ),
+                )
 
             elif out.size < byte_count:
                 raise ValueError(
                     "Provided output buffer is not large enough: "
-                    "requires %d bytes, got %d" % (byte_count, out.size))
+                    "requires %d bytes, got %d" % (byte_count, out.size)
+                )
 
             else:  # out.size >= byte_count
                 # Create an array with a sub-region of out and this class queue
@@ -401,13 +470,15 @@ class ByteOffset(OpenclProcessing):
                     self.queue,
                     shape=(byte_count,),
                     dtype=numpy.int8,
-                    allocator=functools.partial(out.base_data.get_sub_region,
-                                                out.offset))
+                    allocator=functools.partial(
+                        out.base_data.get_sub_region, out.offset
+                    ),
+                )
 
-                evt = pyopencl.enqueue_copy(self.queue, out.data, d_compressed.data,
-                                            byte_count=byte_count)
-                events.append(
-                    EventDescription("copy D -> D: internal -> out", evt))
+                evt = pyopencl.enqueue_copy(
+                    self.queue, out.data, d_compressed.data, byte_count=byte_count
+                )
+                events.append(EventDescription("copy D -> D: internal -> out", evt))
 
             if self.profile:
                 self.events += events
diff --git a/src/silx/opencl/codec/setup.py b/src/silx/opencl/codec/setup.py
deleted file mode 100644
index 4a5c1e5..0000000
--- a/src/silx/opencl/codec/setup.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# coding: utf-8
-#
-# Copyright (C) 2016-2017 European Synchrotron Radiation Facility
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-#
-
-from __future__ import division
-
-__contact__ = "jerome.kieffer@esrf.eu"
-__license__ = "MIT"
-__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
-__authors__ = ["J. Kieffer"]
-__date__ = "13/10/2017"
-
-from numpy.distutils.misc_util import Configuration
-
-
-def configuration(parent_package='', top_path=None):
-    config = Configuration('codec', parent_package, top_path)
-    config.add_subpackage('test')
-    return config
-
-
-if __name__ == "__main__":
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/src/silx/opencl/codec/test/__init__.py b/src/silx/opencl/codec/test/__init__.py
index 325c2c7..45065f8 100644
--- a/src/silx/opencl/codec/test/__init__.py
+++ b/src/silx/opencl/codec/test/__init__.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 #
 #    Project: silx
 #             https://github.com/silx-kit/silx
diff --git a/src/silx/opencl/codec/test/test_bitshuffle_lz4.py b/src/silx/opencl/codec/test/test_bitshuffle_lz4.py
new file mode 100644
index 0000000..6c5891e
--- /dev/null
+++ b/src/silx/opencl/codec/test/test_bitshuffle_lz4.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+#
+#    Project: Bitshuffle-LZ4 decompression in OpenCL
+#             https://github.com/silx-kit/silx
+#
+#    Copyright (C) 2022-2023  European Synchrotron Radiation Facility,
+#                             Grenoble, France
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+"""
+Test suite for byte-offset decompression 
+"""
+
+__authors__ = ["Jérôme Kieffer"]
+__contact__ = "jerome.kieffer@esrf.eu"
+__license__ = "MIT"
+__copyright__ = "2022 European Synchrotron Radiation Facility, Grenoble, France"
+__date__ = "07/11/2022"
+
+import struct
+import numpy
+import pytest
+
+try:
+    import bitshuffle
+except ImportError:
+    bitshuffle = None
+from silx.opencl.common import ocl, pyopencl
+from silx.opencl.codec.bitshuffle_lz4 import BitshuffleLz4
+
+
+TESTCASES = (  # dtype, shape
+    ("uint64", (103, 503)),
+    ("int64", (101, 509)),
+    ("uint32", (229, 659)),
+    ("int32", (233, 653)),
+    ("uint16", (743, 647)),
+    ("int16", (751, 643)),
+    ("uint8", (157, 1373)),
+    ("int8", (163, 1367)),
+)
+
+
+@pytest.mark.skipif(
+    not ocl or not pyopencl or bitshuffle is None,
+    reason="PyOpenCl or bitshuffle is missing",
+)
+class TestBitshuffleLz4:
+    """Test pyopencl bishuffle+LZ4 decompression"""
+
+    @staticmethod
+    def _create_test_data(shape, lam=100, dtype="uint32"):
+        """Create test (image, compressed stream) pair.
+
+        :param shape: Shape of test image
+        :param lam: Expectation of interval argument for numpy.random.poisson
+        :return: (reference image array, compressed stream)
+        """
+        ref = numpy.random.poisson(lam, size=shape).astype(dtype)
+        raw = (
+            struct.pack(">Q", ref.nbytes)
+            + b"\x00" * 4
+            + bitshuffle.compress_lz4(ref).tobytes()
+        )
+        return ref, raw
+
+    @pytest.mark.parametrize("dtype,shape", TESTCASES)
+    def test_decompress(self, dtype, shape):
+        """
+        Tests the byte offset decompression on GPU with various configuration
+        """
+        ref, raw = self._create_test_data(shape=shape, dtype=dtype)
+        bs = BitshuffleLz4(len(raw), numpy.prod(shape), dtype=dtype)
+        res = bs.decompress(raw).get()
+        assert numpy.array_equal(res, ref.ravel()), "Checks decompression works"
+
+    @pytest.mark.parametrize("dtype,shape", TESTCASES)
+    def test_decompress_from_buffer(self, dtype, shape):
+        """Test reading compressed data from pyopencl Buffer"""
+        ref, raw = self._create_test_data(shape=shape, dtype=dtype)
+
+        bs = BitshuffleLz4(0, numpy.prod(shape), dtype=dtype)
+
+        buffer = pyopencl.Buffer(
+            bs.ctx,
+            flags=pyopencl.mem_flags.COPY_HOST_PTR | pyopencl.mem_flags.READ_ONLY,
+            hostbuf=raw,
+        )
+
+        res = bs.decompress(buffer).get()
+        assert numpy.array_equal(res, ref.ravel()), "Checks decompression works"
+
+    @pytest.mark.parametrize("dtype,shape", TESTCASES)
+    def test_decompress_from_array(self, dtype, shape):
+        """Test reading compressed data from pyopencl Array"""
+        ref, raw = self._create_test_data(shape=shape, dtype=dtype)
+
+        bs = BitshuffleLz4(0, numpy.prod(shape), dtype=dtype)
+
+        array = pyopencl.array.to_device(
+            bs.queue,
+            numpy.frombuffer(raw, dtype=numpy.uint8),
+            array_queue=bs.queue,
+        )
+
+        res = bs.decompress(array).get()
+        assert numpy.array_equal(res, ref.ravel()), "Checks decompression works"
diff --git a/src/silx/opencl/codec/test/test_byte_offset.py b/src/silx/opencl/codec/test/test_byte_offset.py
index 4b2d5a3..0e58076 100644
--- a/src/silx/opencl/codec/test/test_byte_offset.py
+++ b/src/silx/opencl/codec/test/test_byte_offset.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 #
 #    Project: Byte-offset decompression in OpenCL
 #             https://github.com/silx-kit/silx
@@ -31,8 +30,6 @@
 Test suite for byte-offset decompression 
 """
 
-from __future__ import division, print_function
-
 __authors__ = ["Jérôme Kieffer"]
 __contact__ = "jerome.kieffer@esrf.eu"
 __license__ = "MIT"
@@ -47,13 +44,12 @@ from silx.opencl.common import ocl, pyopencl
 from silx.opencl.codec import byte_offset
 import fabio
 import unittest
+
 logger = logging.getLogger(__name__)
 
 
-@unittest.skipUnless(ocl and pyopencl,
-                     "PyOpenCl is missing")
+@unittest.skipUnless(ocl and pyopencl, "PyOpenCl is missing")
 class TestByteOffset(unittest.TestCase):
-
     @staticmethod
     def _create_test_data(shape, nexcept, lam=200):
         """Create test (image, compressed stream) pair.
@@ -87,7 +83,9 @@ class TestByteOffset(unittest.TestCase):
         except (RuntimeError, pyopencl.RuntimeError) as err:
             logger.warning(err)
             if sys.platform == "darwin":
-                raise unittest.SkipTest("Byte-offset decompression is known to be buggy on MacOS-CPU")
+                raise unittest.SkipTest(
+                    "Byte-offset decompression is known to be buggy on MacOS-CPU"
+                )
             else:
                 raise err
         print(bo.block_size)
@@ -100,9 +98,11 @@ class TestByteOffset(unittest.TestCase):
         delta_cy = abs(ref.ravel() - res_cy).max()
         delta_cl = abs(ref.ravel() - res_cl.get()).max()
 
-        logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.",
-                     1000.0 * (t1 - t0),
-                     1000.0 * (t2 - t1))
+        logger.debug(
+            "Global execution time: fabio %.3fms, OpenCL: %.3fms.",
+            1000.0 * (t1 - t0),
+            1000.0 * (t2 - t1),
+        )
         bo.log_profile()
         # print(ref)
         # print(res_cl.get())
@@ -111,8 +111,8 @@ class TestByteOffset(unittest.TestCase):
 
     def test_many_decompress(self, ntest=10):
         """
-        tests the byte offset decompression on GPU, many images to ensure there 
-        is not leaking in memory 
+        tests the byte offset decompression on GPU, many images to ensure there
+        is not leaking in memory
         """
         shape = (991, 997)
         size = numpy.prod(shape)
@@ -123,7 +123,9 @@ class TestByteOffset(unittest.TestCase):
         except (RuntimeError, pyopencl.RuntimeError) as err:
             logger.warning(err)
             if sys.platform == "darwin":
-                raise unittest.SkipTest("Byte-offset decompression is known to be buggy on MacOS-CPU")
+                raise unittest.SkipTest(
+                    "Byte-offset decompression is known to be buggy on MacOS-CPU"
+                )
             else:
                 raise err
         t0 = time.time()
@@ -135,9 +137,11 @@ class TestByteOffset(unittest.TestCase):
         delta_cl = abs(ref.ravel() - res_cl.get()).max()
         self.assertEqual(delta_cy, 0, "Checks fabio works")
         self.assertEqual(delta_cl, 0, "Checks opencl works")
-        logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.",
-                     1000.0 * (t1 - t0),
-                     1000.0 * (t2 - t1))
+        logger.debug(
+            "Global execution time: fabio %.3fms, OpenCL: %.3fms.",
+            1000.0 * (t1 - t0),
+            1000.0 * (t2 - t1),
+        )
 
         for i in range(ntest):
             ref, raw = self._create_test_data(shape=shape, nexcept=2729, lam=200)
@@ -152,9 +156,11 @@ class TestByteOffset(unittest.TestCase):
             self.assertEqual(delta_cy, 0, "Checks fabio works #%i" % i)
             self.assertEqual(delta_cl, 0, "Checks opencl works #%i" % i)
 
-            logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.",
-                         1000.0 * (t1 - t0),
-                         1000.0 * (t2 - t1))
+            logger.debug(
+                "Global execution time: fabio %.3fms, OpenCL: %.3fms.",
+                1000.0 * (t1 - t0),
+                1000.0 * (t2 - t1),
+            )
         bo.log_profile(stats=True)
 
     def test_encode(self):
@@ -174,8 +180,7 @@ class TestByteOffset(unittest.TestCase):
         compressed_stream = compressed_array.get().tobytes()
         self.assertEqual(raw, compressed_stream)
 
-        logger.debug("Global execution time: OpenCL: %.3fms.",
-                     1000.0 * (t1 - t0))
+        logger.debug("Global execution time: OpenCL: %.3fms.", 1000.0 * (t1 - t0))
         bo.log_profile()
 
     def test_encode_to_array(self):
@@ -225,14 +230,15 @@ class TestByteOffset(unittest.TestCase):
 
         self.assertEqual(raw, compressed_stream)
 
-        logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.",
-                     1000.0 * (t1 - t0),
-                     1000.0 * (t2 - t1))
+        logger.debug(
+            "Global execution time: fabio %.3fms, OpenCL: %.3fms.",
+            1000.0 * (t1 - t0),
+            1000.0 * (t2 - t1),
+        )
         bo.log_profile()
 
     def test_encode_to_bytes_from_array(self):
-        """Test byte offset compression to bytes from a pyopencl array.
-        """
+        """Test byte offset compression to bytes from a pyopencl array."""
         ref, raw = self._create_test_data(shape=(2713, 2719), nexcept=2729)
 
         try:
@@ -241,8 +247,7 @@ class TestByteOffset(unittest.TestCase):
             logger.warning(err)
             raise err
 
-        d_ref = pyopencl.array.to_device(
-            bo.queue, ref.astype(numpy.int32).ravel())
+        d_ref = pyopencl.array.to_device(bo.queue, ref.astype(numpy.int32).ravel())
 
         t0 = time.time()
         res_fabio = fabio.compression.compByteOffset(ref)
@@ -252,9 +257,11 @@ class TestByteOffset(unittest.TestCase):
 
         self.assertEqual(raw, compressed_stream)
 
-        logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.",
-                     1000.0 * (t1 - t0),
-                     1000.0 * (t2 - t1))
+        logger.debug(
+            "Global execution time: fabio %.3fms, OpenCL: %.3fms.",
+            1000.0 * (t1 - t0),
+            1000.0 * (t2 - t1),
+        )
         bo.log_profile()
 
     def test_many_encode(self, ntest=10):
@@ -278,9 +285,11 @@ class TestByteOffset(unittest.TestCase):
         bo_durations.append(1000.0 * (t2 - t1))
 
         self.assertEqual(raw, compressed_stream)
-        logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.",
-                     1000.0 * (t1 - t0),
-                     1000.0 * (t2 - t1))
+        logger.debug(
+            "Global execution time: fabio %.3fms, OpenCL: %.3fms.",
+            1000.0 * (t1 - t0),
+            1000.0 * (t2 - t1),
+        )
 
         for i in range(ntest):
             ref, raw = self._create_test_data(shape=shape, nexcept=2729, lam=200)
@@ -293,11 +302,15 @@ class TestByteOffset(unittest.TestCase):
             bo_durations.append(1000.0 * (t2 - t1))
 
             self.assertEqual(raw, compressed_stream)
-            logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.",
-                         1000.0 * (t1 - t0),
-                         1000.0 * (t2 - t1))
-
-        logger.debug("OpenCL execution time: Mean: %fms, Min: %fms, Max: %fms",
-                     numpy.mean(bo_durations),
-                     numpy.min(bo_durations),
-                     numpy.max(bo_durations))
+            logger.debug(
+                "Global execution time: fabio %.3fms, OpenCL: %.3fms.",
+                1000.0 * (t1 - t0),
+                1000.0 * (t2 - t1),
+            )
+
+        logger.debug(
+            "OpenCL execution time: Mean: %fms, Min: %fms, Max: %fms",
+            numpy.mean(bo_durations),
+            numpy.min(bo_durations),
+            numpy.max(bo_durations),
+        )
diff --git a/src/silx/opencl/common.py b/src/silx/opencl/common.py
index 888b1da..30c9ef7 100644
--- a/src/silx/opencl/common.py
+++ b/src/silx/opencl/common.py
@@ -1,10 +1,9 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 #
 #    Project: S I L X project
 #             https://github.com/silx-kit/silx
 #
-#    Copyright (C) 2012-2021 European Synchrotron Radiation Facility, Grenoble, France
+#    Copyright (C) 2012-2023 European Synchrotron Radiation Facility, Grenoble, France
 #
 #    Principal author:       Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
 #
@@ -34,10 +33,17 @@ __author__ = "Jerome Kieffer"
 __contact__ = "Jerome.Kieffer@ESRF.eu"
 __license__ = "MIT"
 __copyright__ = "2012-2017 European Synchrotron Radiation Facility, Grenoble, France"
-__date__ = "29/09/2021"
+__date__ = "09/09/2023"
 __status__ = "stable"
-__all__ = ["ocl", "pyopencl", "mf", "release_cl_buffers", "allocate_cl_buffers",
-           "measure_workgroup_size", "kernel_workgroup_size"]
+__all__ = [
+    "ocl",
+    "pyopencl",
+    "mf",
+    "release_cl_buffers",
+    "allocate_cl_buffers",
+    "measure_workgroup_size",
+    "kernel_workgroup_size",
+]
 
 import os
 import logging
@@ -49,60 +55,69 @@ from .utils import get_opencl_code
 logger = logging.getLogger(__name__)
 
 if os.environ.get("SILX_OPENCL") in ["0", "False"]:
-    logger.info("Use of OpenCL has been disabled from environment variable: SILX_OPENCL=0")
+    logger.info(
+        "Use of OpenCL has been disabled from environment variable: SILX_OPENCL=0"
+    )
     pyopencl = None
 else:
     try:
         import pyopencl
     except ImportError:
-        logger.warning("Unable to import pyOpenCl. Please install it from: https://pypi.org/project/pyopencl")
+        logger.warning(
+            "Unable to import pyOpenCl. Please install it from: https://pypi.org/project/pyopencl"
+        )
         pyopencl = None
     else:
         try:
             pyopencl.get_platforms()
         except pyopencl.LogicError:
-            logger.warning("The module pyOpenCL has been imported but can't be used here")
+            logger.warning(
+                "The module pyOpenCL has been imported but can't be used here"
+            )
             pyopencl = None
-        else:
-            import pyopencl.array as array
-            mf = pyopencl.mem_flags
 
-if pyopencl is None:
+if pyopencl is not None:
+    import pyopencl.array as array
 
+    mf = pyopencl.mem_flags
+    from .atomic import check_atomic32, check_atomic64
+else:
     # Define default mem flags
     class mf(object):
         WRITE_ONLY = 1
         READ_ONLY = 1
         READ_WRITE = 1
-else:
-    mf = pyopencl.mem_flags
 
-FLOP_PER_CORE = {"GPU": 64,  # GPU, Fermi at least perform 64 flops per cycle/multicore, G80 were at 24 or 48 ...
-                 "CPU": 4,  # CPU, at least intel's have 4 operation per cycle
-                 "ACC": 8}  # ACC: the Xeon-phi (MIC) appears to be able to process 8 Flops per hyperthreaded-core
+
+FLOP_PER_CORE = {
+    "GPU": 64,  # GPU, Fermi at least perform 64 flops per cycle/multicore, G80 were at 24 or 48 ...
+    "CPU": 4,  # CPU, at least intel's have 4 operation per cycle
+    "ACC": 8,
+}  # ACC: the Xeon-phi (MIC) appears to be able to process 8 Flops per hyperthreaded-core
 
 # Sources : https://en.wikipedia.org/wiki/CUDA
-NVIDIA_FLOP_PER_CORE = {(1, 0): 24,   # Guessed !
-                        (1, 1): 24,   # Measured on G98 [Quadro NVS 295]
-                        (1, 2): 24,   # Guessed !
-                        (1, 3): 24,   # measured on a GT285 (GT200)
-                        (2, 0): 64,   # Measured on a 580 (GF110)
-                        (2, 1): 96,   # Measured on Quadro2000 GF106GL
-                        (3, 0): 384,  # Guessed!
-                        (3, 5): 384,  # Measured on K20
-                        (3, 7): 384,  # K80: Guessed!
-                        (5, 0): 256,  # Maxwell 4 warps/SM 2 flops/ CU
-                        (5, 2): 256,  # Titan-X
-                        (5, 3): 256,  # TX1
-                        (6, 0): 128,  # GP100
-                        (6, 1): 128,  # GP104
-                        (6, 2): 128,  # ?
-                        (7, 0): 128,  # Volta  # measured on Telsa V100
-                        (7, 2): 128,  # Volta  ? 
-                        (7, 5): 128,  # Turing # measured on RTX 6000
-                        (8, 0): 128,  # Ampere # measured on Tesla A100
-                        (8, 6): 256,  # Ampere # measured on RTX A5000
-                        }
+NVIDIA_FLOP_PER_CORE = {
+    (1, 0): 24,  # Guessed !
+    (1, 1): 24,  # Measured on G98 [Quadro NVS 295]
+    (1, 2): 24,  # Guessed !
+    (1, 3): 24,  # measured on a GT285 (GT200)
+    (2, 0): 64,  # Measured on a 580 (GF110)
+    (2, 1): 96,  # Measured on Quadro2000 GF106GL
+    (3, 0): 384,  # Guessed!
+    (3, 5): 384,  # Measured on K20
+    (3, 7): 384,  # K80: Guessed!
+    (5, 0): 256,  # Maxwell 4 warps/SM 2 flops/ CU
+    (5, 2): 256,  # Titan-X
+    (5, 3): 256,  # TX1
+    (6, 0): 128,  # GP100
+    (6, 1): 128,  # GP104
+    (6, 2): 128,  # ?
+    (7, 0): 128,  # Volta  # measured on Telsa V100
+    (7, 2): 128,  # Volta  ?
+    (7, 5): 128,  # Turing # measured on RTX 6000
+    (8, 0): 128,  # Ampere # measured on Tesla A100
+    (8, 6): 256,  # Ampere # measured on RTX A5000
+}
 
 AMD_FLOP_PER_CORE = 160  # Measured on a M7820 10 core, 700MHz 1120GFlops
 
@@ -112,9 +127,24 @@ class Device(object):
     Simple class that contains the structure of an OpenCL device
     """
 
-    def __init__(self, name="None", dtype=None, version=None, driver_version=None,
-                 extensions="", memory=None, available=None,
-                 cores=None, frequency=None, flop_core=None, idx=0, workgroup=1):
+    def __init__(
+        self,
+        name="None",
+        dtype=None,
+        version=None,
+        driver_version=None,
+        extensions="",
+        memory=None,
+        available=None,
+        cores=None,
+        frequency=None,
+        flop_core=None,
+        idx=0,
+        workgroup=1,
+        atomic32=None,
+        atomic64=None,
+        platform=None,
+    ):
         """
         Simple container with some important data for the OpenCL device description.
 
@@ -130,6 +160,7 @@ class Device(object):
         :param flop_core: Flopating Point operation per core per cycle
         :param idx: index of the device within the platform
         :param workgroup: max workgroup size
+        :param platform: the platform to which this device is attached
         """
         self.name = name.strip()
         self.type = dtype
@@ -142,12 +173,15 @@ class Device(object):
         self.frequency = frequency
         self.id = idx
         self.max_work_group_size = workgroup
+        self.atomic32 = atomic32
+        self.atomic64 = atomic64
         if not flop_core:
             flop_core = FLOP_PER_CORE.get(dtype, 1)
         if cores and frequency:
             self.flops = cores * frequency * flop_core
         else:
             self.flops = flop_core
+        self.platform = platform
 
     def __repr__(self):
         return "%s" % self.name
@@ -158,19 +192,20 @@ class Device(object):
 
         :return: string
         """
-        lst = ["Name\t\t:\t%s" % self.name,
-               "Type\t\t:\t%s" % self.type,
-               "Memory\t\t:\t%.3f MB" % (self.memory / 2.0 ** 20),
-               "Cores\t\t:\t%s CU" % self.cores,
-               "Frequency\t:\t%s MHz" % self.frequency,
-               "Speed\t\t:\t%.3f GFLOPS" % (self.flops / 1000.),
-               "Version\t\t:\t%s" % self.version,
-               "Available\t:\t%s" % self.available]
+        lst = [
+            "Name\t\t:\t%s" % self.name,
+            "Type\t\t:\t%s" % self.type,
+            "Memory\t\t:\t%.3f MB" % (self.memory / 2.0**20),
+            "Cores\t\t:\t%s CU" % self.cores,
+            "Frequency\t:\t%s MHz" % self.frequency,
+            "Speed\t\t:\t%.3f GFLOPS" % (self.flops / 1000.0),
+            "Version\t\t:\t%s" % self.version,
+            "Available\t:\t%s" % self.available,
+        ]
         return os.linesep.join(lst)
 
     def set_unavailable(self):
-        """Use this method to flag a faulty device
-        """
+        """Use this method to flag a faulty device"""
         self.available = False
 
 
@@ -179,7 +214,9 @@ class Platform(object):
     Simple class that contains the structure of an OpenCL platform
     """
 
-    def __init__(self, name="None", vendor="None", version=None, extensions=None, idx=0):
+    def __init__(
+        self, name="None", vendor="None", version=None, extensions=None, idx=0
+    ):
         """
         Class containing all descriptions of a platform and all devices description within that platform.
 
@@ -205,6 +242,7 @@ class Platform(object):
 
         :param device: Device instance
         """
+        device.platform = self
         self.devices.append(device)
 
     def get_device(self, key):
@@ -245,19 +283,25 @@ def _measure_workgroup_size(device_or_context, fast=False):
             platformid = pyopencl.get_platforms().index(platform)
             deviceid = platform.get_devices().index(device_or_context)
             ocl.platforms[platformid].devices[deviceid].set_unavailable()
-            raise RuntimeError("Unable to create context on %s/%s: %s" % (platform, device_or_context, error))
+            raise RuntimeError(
+                "Unable to create context on %s/%s: %s"
+                % (platform, device_or_context, error)
+            )
         else:
             device = device_or_context
     elif isinstance(device_or_context, pyopencl.Context):
         ctx = device_or_context
         device = device_or_context.devices[0]
     elif isinstance(device_or_context, (tuple, list)) and len(device_or_context) == 2:
-        ctx = ocl.create_context(platformid=device_or_context[0],
-                                 deviceid=device_or_context[1])
+        ctx = ocl.create_context(
+            platformid=device_or_context[0], deviceid=device_or_context[1]
+        )
         device = ctx.devices[0]
     else:
-        raise RuntimeError("""given parameter device_or_context is not an
-            instanciation of a device or a context""")
+        raise RuntimeError(
+            """given parameter device_or_context is not an
+            instanciation of a device or a context"""
+        )
     shape = device.max_work_group_size
     # get the context
 
@@ -272,7 +316,9 @@ def _measure_workgroup_size(device_or_context, fast=False):
 
     program = pyopencl.Program(ctx, get_opencl_code("addition")).build()
     if fast:
-        max_valid_wg = program.addition.get_work_group_info(pyopencl.kernel_work_group_info.WORK_GROUP_SIZE, device)
+        max_valid_wg = program.addition.get_work_group_info(
+            pyopencl.kernel_work_group_info.WORK_GROUP_SIZE, device
+        )
     else:
         maxi = int(round(numpy.log2(shape)))
         for i in range(maxi + 1):
@@ -280,11 +326,19 @@ def _measure_workgroup_size(device_or_context, fast=False):
             wg = 1 << i
             try:
                 evt = program.addition(
-                    queue, (shape,), (wg,),
-                    d_data.data, d_data_1.data, d_res.data, numpy.int32(shape))
+                    queue,
+                    (shape,),
+                    (wg,),
+                    d_data.data,
+                    d_data_1.data,
+                    d_res.data,
+                    numpy.int32(shape),
+                )
                 evt.wait()
             except Exception as error:
-                logger.info("%s on device %s for WG=%s/%s", error, device.name, wg, shape)
+                logger.info(
+                    "%s on device %s for WG=%s/%s", error, device.name, wg, shape
+                )
                 program = queue = d_res = d_data_1 = d_data = None
                 break
             else:
@@ -294,7 +348,9 @@ def _measure_workgroup_size(device_or_context, fast=False):
                     if wg > max_valid_wg:
                         max_valid_wg = wg
                 else:
-                    logger.warning("ArithmeticError on %s for WG=%s/%s", wg, device.name, shape)
+                    logger.warning(
+                        "ArithmeticError on %s for WG=%s/%s", wg, device.name, shape
+                    )
 
     return max_valid_wg
 
@@ -317,15 +373,25 @@ class OpenCL(object):
     if pyopencl:
         platform = device = pypl = devtype = extensions = pydev = None
         for idx, platform in enumerate(pyopencl.get_platforms()):
-            pypl = Platform(platform.name, platform.vendor, platform.version, platform.extensions, idx)
+            pypl = Platform(
+                platform.name,
+                platform.vendor,
+                platform.version,
+                platform.extensions,
+                idx,
+            )
             for idd, device in enumerate(platform.get_devices()):
                 ####################################################
                 # Nvidia does not report int64 atomics (we are using) ...
                 # this is a hack around as any nvidia GPU with double-precision supports int64 atomics
                 ####################################################
                 extensions = device.extensions
-                if (pypl.vendor == "NVIDIA Corporation") and ('cl_khr_fp64' in extensions):
-                    extensions += ' cl_khr_int64_base_atomics cl_khr_int64_extended_atomics'
+                if (pypl.vendor == "NVIDIA Corporation") and (
+                    "cl_khr_fp64" in extensions
+                ):
+                    extensions += (
+                        " cl_khr_int64_base_atomics cl_khr_int64_extended_atomics"
+                    )
                 try:
                     devtype = pyopencl.device_type.to_string(device.type).upper()
                 except ValueError:
@@ -340,14 +406,23 @@ class OpenCL(object):
                         devtype = "CPU"
                     else:
                         devtype = devtype[:3]
-                if _is_nvidia_gpu(device.vendor, devtype) and ("compute_capability_major_nv" in dir(device)):
+                if _is_nvidia_gpu(device.vendor, devtype) and (
+                    "compute_capability_major_nv" in dir(device)
+                ):
                     try:
-                        comput_cap = device.compute_capability_major_nv, device.compute_capability_minor_nv
+                        comput_cap = (
+                            device.compute_capability_major_nv,
+                            device.compute_capability_minor_nv,
+                        )
                     except pyopencl.LogicError:
                         flop_core = FLOP_PER_CORE["GPU"]
                     else:
-                        flop_core = NVIDIA_FLOP_PER_CORE.get(comput_cap, FLOP_PER_CORE["GPU"])
-                elif (pypl.vendor == "Advanced Micro Devices, Inc.") and (devtype == "GPU"):
+                        flop_core = NVIDIA_FLOP_PER_CORE.get(
+                            comput_cap, FLOP_PER_CORE["GPU"]
+                        )
+                elif (pypl.vendor == "Advanced Micro Devices, Inc.") and (
+                    devtype == "GPU"
+                ):
                     flop_core = AMD_FLOP_PER_CORE
                 elif devtype == "CPU":
                     flop_core = FLOP_PER_CORE.get(devtype, 1)
@@ -355,14 +430,29 @@ class OpenCL(object):
                     flop_core = 1
                 workgroup = device.max_work_group_size
                 if (devtype == "CPU") and (pypl.vendor == "Apple"):
-                    logger.info("For Apple's OpenCL on CPU: Measuring actual valid max_work_goup_size.")
+                    logger.info(
+                        "For Apple's OpenCL on CPU: Measuring actual valid max_work_goup_size."
+                    )
                     workgroup = _measure_workgroup_size(device, fast=True)
                 if (devtype == "GPU") and os.environ.get("GPU") == "False":
                     # Environment variable to disable GPU devices
                     continue
-                pydev = Device(device.name, devtype, device.version, device.driver_version, extensions,
-                               device.global_mem_size, bool(device.available), device.max_compute_units,
-                               device.max_clock_frequency, flop_core, idd, workgroup)
+                pydev = Device(
+                    device.name,
+                    devtype,
+                    device.version,
+                    device.driver_version,
+                    extensions,
+                    device.global_mem_size,
+                    bool(device.available),
+                    device.max_compute_units,
+                    device.max_clock_frequency,
+                    flop_core,
+                    idd,
+                    workgroup,
+                    check_atomic32(device)[0],
+                    check_atomic64(device)[0],
+                )
                 pypl.add_device(pydev)
                 nb_devices += 1
             platforms.append(pypl)
@@ -371,9 +461,11 @@ class OpenCL(object):
     def __repr__(self):
         out = ["OpenCL devices:"]
         for platformid, platform in enumerate(self.platforms):
-            deviceids = ["(%s,%s) %s" % (platformid, deviceid, dev.name)
-                         for deviceid, dev in enumerate(platform.devices)]
-            out.append("[%s] %s: " % (platformid, platform.name) + ", ".join(deviceids))
+            deviceids = [
+                f"({platformid},{deviceid}) {dev.name}"
+                for deviceid, dev in enumerate(platform.devices)
+            ]
+            out.append(f"[{platformid}] {platform.name}: " + ", ".join(deviceids))
         return os.linesep.join(out)
 
     def get_platform(self, key):
@@ -395,7 +487,9 @@ class OpenCL(object):
                 out = self.platforms[platid]
         return out
 
-    def select_device(self, dtype="ALL", memory=None, extensions=None, best=True, **kwargs):
+    def select_device(
+        self, dtype="ALL", memory=None, extensions=None, best=True, **kwargs
+    ):
         """
         Select a device based on few parameters (at the end, keep the one with most memory)
 
@@ -439,8 +533,15 @@ class OpenCL(object):
         # Nothing found
         return None
 
-    def create_context(self, devicetype="ALL", useFp64=False, platformid=None,
-                       deviceid=None, cached=True, memory=None, extensions=None):
+    def create_context(
+        self,
+        devicetype="ALL",
+        platformid=None,
+        deviceid=None,
+        cached=True,
+        memory=None,
+        extensions=None,
+    ):
         """
         Choose a device and initiate a context.
 
@@ -450,7 +551,6 @@ class OpenCL(object):
         E.g.: If Nvidia driver is installed, GPU will succeed but CPU will fail.
               The AMD SDK kit is required for CPU via OpenCL.
         :param devicetype: string in ["cpu","gpu", "all", "acc"]
-        :param useFp64: boolean specifying if double precision will be used: deprecated use extensions=["cl_khr_fp64"]
         :param platformid: integer
         :param deviceid: integer
         :param cached: True if we want to cache the context
@@ -460,37 +560,60 @@ class OpenCL(object):
         """
         if extensions is None:
             extensions = []
-        if useFp64:
-            logger.warning("Deprecation: please select your device using the extension name!, i.e. extensions=['cl_khr_fp64']")
-            extensions.append('cl_khr_fp64')
 
+        ctx = None
         if (platformid is not None) and (deviceid is not None):
             platformid = int(platformid)
             deviceid = int(deviceid)
         elif "PYOPENCL_CTX" in os.environ:
-            pyopencl_ctx = [int(i) if i.isdigit() else 0 for i in os.environ["PYOPENCL_CTX"].split(":")]
-            pyopencl_ctx += [0] * (2 - len(pyopencl_ctx))  # pad with 0
-            platformid, deviceid = pyopencl_ctx
+            ctx = pyopencl.create_some_context()
+            # try:
+            device = ctx.devices[0]
+            platforms = [
+                i for i, p in enumerate(ocl.platforms) if device.platform.name == p.name
+            ]
+            if platforms:
+                platformid = platforms[0]
+                devices = [
+                    i
+                    for i, d in enumerate(ocl.platforms[platformid].devices)
+                    if device.name == d.name
+                ]
+                if devices:
+                    deviceid = devices[0]
+                    if cached:
+                        self.context_cache[(platformid, deviceid)] = ctx
         else:
             ids = ocl.select_device(type=devicetype, extensions=extensions)
             if ids:
                 platformid, deviceid = ids
-        ctx = None
-        if (platformid is not None) and (deviceid is not None):
+
+        if (ctx is None) and (platformid is not None) and (deviceid is not None):
             if (platformid, deviceid) in self.context_cache:
                 ctx = self.context_cache[(platformid, deviceid)]
             else:
                 try:
-                    ctx = pyopencl.Context(devices=[pyopencl.get_platforms()[platformid].get_devices()[deviceid]])
+                    ctx = pyopencl.Context(
+                        devices=[
+                            pyopencl.get_platforms()[platformid].get_devices()[deviceid]
+                        ]
+                    )
                 except pyopencl._cl.LogicError as error:
                     self.platforms[platformid].devices[deviceid].set_unavailable()
-                    logger.warning("Unable to create context on %s/%s: %s", platformid, deviceid, error)
+                    logger.warning(
+                        "Unable to create context on %s/%s: %s",
+                        platformid,
+                        deviceid,
+                        error,
+                    )
                     ctx = None
                 else:
                     if cached:
                         self.context_cache[(platformid, deviceid)] = ctx
         if ctx is None:
-            logger.warning("Last chance to get an OpenCL device ... probably not the one requested")
+            logger.warning(
+                "Last chance to get an OpenCL device ... probably not the one requested"
+            )
             ctx = pyopencl.create_some_context(interactive=False)
         return ctx
 
@@ -560,8 +683,9 @@ def allocate_cl_buffers(buffers, device=None, context=None):
     for _, _, dtype, size in buffers:
         ualloc += numpy.dtype(dtype).itemsize * size
     memory = device.memory
-    logger.info("%.3fMB are needed on device which has %.3fMB",
-                ualloc / 1.0e6, memory / 1.0e6)
+    logger.info(
+        "%.3fMB are needed on device which has %.3fMB", ualloc / 1.0e6, memory / 1.0e6
+    )
     if ualloc >= memory:
         memError = "Fatal error in allocate_buffers."
         memError += "Not enough device memory for buffers"
@@ -571,8 +695,9 @@ def allocate_cl_buffers(buffers, device=None, context=None):
     # do the allocation
     try:
         for name, flag, dtype, size in buffers:
-            mem[name] = pyopencl.Buffer(context, flag,
-                                        numpy.dtype(dtype).itemsize * size)
+            mem[name] = pyopencl.Buffer(
+                context, flag, numpy.dtype(dtype).itemsize * size
+            )
     except pyopencl.MemoryError as error:
         release_cl_buffers(mem)
         raise MemoryError(error)
@@ -589,16 +714,15 @@ def allocate_texture(ctx, shape, hostbuf=None, support_1D=False):
         do not support 1D images, so 1D images are handled as 2D with one row
     :param support_1D: force the image to be 1D if the shape has only one dim
     """
-    if len(shape) == 1 and not(support_1D):
+    if len(shape) == 1 and not (support_1D):
         shape = (1,) + shape
     return pyopencl.Image(
         ctx,
         pyopencl.mem_flags.READ_ONLY | pyopencl.mem_flags.USE_HOST_PTR,
         pyopencl.ImageFormat(
-            pyopencl.channel_order.INTENSITY,
-            pyopencl.channel_type.FLOAT
+            pyopencl.channel_order.INTENSITY, pyopencl.channel_type.FLOAT
         ),
-        hostbuf=numpy.zeros(shape[::-1], dtype=numpy.float32)
+        hostbuf=numpy.zeros(shape[::-1], dtype=numpy.float32),
     )
 
 
@@ -620,7 +744,7 @@ def check_textures_availability(ctx):
     # There is no way to detect this until a kernel is compiled
     try:
         cc = ctx.devices[0].compute_capability_major_nv
-        textures_available &= (cc >= 3)
+        textures_available &= cc >= 3
     except (pyopencl.LogicError, AttributeError):  # probably not a Nvidia GPU
         pass
     #
@@ -660,7 +784,7 @@ def query_kernel_info(program, kernel, what="WORK_GROUP_SIZE"):
     :param kernel: kernel or name of the kernel
     :param what: what is the query about ?
     :return: int or 3-int for the workgroup size.
-    
+
     Possible information available are:
     * 'COMPILE_WORK_GROUP_SIZE': Returns the work-group size specified inside the kernel (__attribute__((reqd_work_gr oup_size(X, Y, Z))))
     * 'GLOBAL_WORK_SIZE': maximum global size that can be used to execute a kernel  #OCL2.1!
@@ -668,15 +792,17 @@ def query_kernel_info(program, kernel, what="WORK_GROUP_SIZE"):
     * 'PREFERRED_WORK_GROUP_SIZE_MULTIPLE': preferred multiple of workgroup size for launch. This is a performance hint.
     * 'PRIVATE_MEM_SIZE' Returns the minimum amount of private memory, in bytes, used by each workitem in the kernel
     * 'WORK_GROUP_SIZE': maximum work-group size that can be used to execute a kernel on a specific device given by device
-    
+
     Further information on:
     https://www.khronos.org/registry/OpenCL/sdk/1.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html
-    
+
     """
     assert isinstance(program, pyopencl.Program)
     if not isinstance(kernel, pyopencl.Kernel):
         kernel_name = kernel
-        assert kernel in (k.function_name for k in program.all_kernels()), "the kernel exists"
+        assert kernel in (
+            k.function_name for k in program.all_kernels()
+        ), "the kernel exists"
         kernel = program.__getattr__(kernel_name)
 
     device = program.devices[0]
diff --git a/src/silx/opencl/conftest.py b/src/silx/opencl/conftest.py
index 1fdc516..f6cf5de 100644
--- a/src/silx/opencl/conftest.py
+++ b/src/silx/opencl/conftest.py
@@ -1,5 +1,6 @@
 import pytest
 
+
 @pytest.mark.usefixtures("use_opencl")
 def setup_module(module):
     pass
diff --git a/src/silx/opencl/convolution.py b/src/silx/opencl/convolution.py
index 15ef931..99ecd02 100644
--- a/src/silx/opencl/convolution.py
+++ b/src/silx/opencl/convolution.py
@@ -1,8 +1,7 @@
 #!/usr/bin/env python
-# coding: utf-8
 # /*##########################################################################
 #
-# Copyright (c) 2019 European Synchrotron Radiation Facility
+# Copyright (c) 2019-2023 European Synchrotron Radiation Facility
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -25,27 +24,35 @@
 # ###########################################################################*/
 """Module for convolution on CPU/GPU."""
 
-from __future__ import absolute_import, print_function, with_statement, division
-
 __authors__ = ["P. Paleo"]
 __license__ = "MIT"
 __date__ = "01/08/2019"
 
 import numpy as np
-from copy import copy  # python2
 from .common import pyopencl as cl
 import pyopencl.array as parray
 from .processing import OpenclProcessing, EventDescription
 from .utils import ConvolutionInfos
 
+
 class Convolution(OpenclProcessing):
     """
     A class for performing convolution on CPU/GPU with OpenCL.
     """
 
-    def __init__(self, shape, kernel, axes=None, mode=None, ctx=None,
-                 devicetype="all", platformid=None, deviceid=None,
-                 profile=False, extra_options=None):
+    def __init__(
+        self,
+        shape,
+        kernel,
+        axes=None,
+        mode=None,
+        ctx=None,
+        devicetype="all",
+        platformid=None,
+        deviceid=None,
+        profile=False,
+        extra_options=None,
+    ):
         """Constructor of OpenCL Convolution.
 
         :param shape: shape of the array.
@@ -73,9 +80,14 @@ class Convolution(OpenclProcessing):
             "allocate_tmp_array": True,
             "dont_use_textures": False,
         """
-        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
-                                  platformid=platformid, deviceid=deviceid,
-                                  profile=profile)
+        OpenclProcessing.__init__(
+            self,
+            ctx=ctx,
+            devicetype=devicetype,
+            platformid=platformid,
+            deviceid=deviceid,
+            profile=profile,
+        )
 
         self._configure_extra_options(extra_options)
         self._determine_use_case(shape, kernel, axes)
@@ -91,7 +103,7 @@ class Convolution(OpenclProcessing):
         }
         extra_opts = extra_options or {}
         self.extra_options.update(extra_opts)
-        self.use_textures = not(self.extra_options["dont_use_textures"])
+        self.use_textures = not (self.extra_options["dont_use_textures"])
         self.use_textures &= self.check_textures_availability()
 
     def _get_dimensions(self, shape, kernel):
@@ -136,8 +148,7 @@ class Convolution(OpenclProcessing):
             if axes in convol_infos.allowed_axes[uc_name]:
                 self.use_case_name = uc_name
                 self.use_case_desc = uc_params["name"]
-                #~ self.use_case_kernels = uc_params["kernels"].copy()
-                self.use_case_kernels = copy(uc_params["kernels"]) # TODO use the above line once we get rid of python2
+                self.use_case_kernels = uc_params["kernels"].copy()
         if self.use_case_name is None:
             raise ValueError(
                 "Cannot find a use case for data ndim = %d, kernel ndim = %d and axes=%s"
@@ -146,8 +157,7 @@ class Convolution(OpenclProcessing):
         # TODO implement this use case
         if self.use_case_name == "batched_separable_2D_1D_3D":
             raise NotImplementedError(
-                "The use case %s is not implemented"
-                % self.use_case_name
+                "The use case %s is not implemented" % self.use_case_name
             )
         #
         self.axes = axes
@@ -171,7 +181,7 @@ class Convolution(OpenclProcessing):
             "allocate_tmp_array": "data_tmp",
         }
         # Nonseparable transforms do not need tmp array
-        if not(self.separable):
+        if not (self.separable):
             self.extra_options["allocate_tmp_array"] = False
         # Allocate arrays
         for option_name, array_name in option_array_names.items():
@@ -185,7 +195,7 @@ class Convolution(OpenclProcessing):
         if isinstance(self.kernel, np.ndarray):
             self.d_kernel = parray.to_device(self.queue, self.kernel)
         else:
-            if not(isinstance(self.kernel, parray.Array)):
+            if not (isinstance(self.kernel, parray.Array)):
                 raise ValueError("kernel must be either numpy array or pyopencl array")
             self.d_kernel = self.kernel
         self._old_input_ref = None
@@ -210,7 +220,7 @@ class Convolution(OpenclProcessing):
                 % (self.mode, str(mp.keys()))
             )
         # TODO
-        if not(self.use_textures) and self.mode.lower() == "constant":
+        if not (self.use_textures) and self.mode.lower() == "constant":
             raise NotImplementedError(
                 "mode='constant' is not implemented without textures yet"
             )
@@ -231,28 +241,30 @@ class Convolution(OpenclProcessing):
         compile_options = [str("-DUSED_CONV_MODE=%d" % self._c_conv_mode)]
         if self.use_textures:
             kernel_files = ["convolution_textures.cl"]
-            compile_options.extend([
-                str("-DIMAGE_DIMS=%d" % self.data_ndim),
-                str("-DFILTER_DIMS=%d" % self.kernel_ndim),
-            ])
+            compile_options.extend(
+                [
+                    str("-DIMAGE_DIMS=%d" % self.data_ndim),
+                    str("-DFILTER_DIMS=%d" % self.kernel_ndim),
+                ]
+            )
             d_kernel_ref = self.d_kernel_tex
         else:
             kernel_files = ["convolution.cl"]
             d_kernel_ref = self.d_kernel.data
-        self.compile_kernels(
-            kernel_files=kernel_files,
-            compile_options=compile_options
-        )
+        self.compile_kernels(kernel_files=kernel_files, compile_options=compile_options)
         self.ndrange = self.shape[::-1]
         self.wg = None
         kernel_args = [
             self.queue,
-            self.ndrange, self.wg,
+            self.ndrange,
+            self.wg,
             None,
             None,
             d_kernel_ref,
             np.int32(self.kernel.shape[0]),
-            self.Nx, self.Ny, self.Nz
+            self.Nx,
+            self.Ny,
+            self.Nz,
         ]
         if self.kernel_ndim == 2:
             kernel_args.insert(6, np.int32(self.kernel.shape[1]))
@@ -266,10 +278,7 @@ class Convolution(OpenclProcessing):
         if self.separable:
             if self.data_tmp is not None:
                 self.swap_pattern = {
-                    2: [
-                        ("data_in", "data_tmp"),
-                        ("data_tmp", "data_out")
-                    ],
+                    2: [("data_in", "data_tmp"), ("data_tmp", "data_out")],
                     3: [
                         ("data_in", "data_out"),
                         ("data_out", "data_tmp"),
@@ -325,14 +334,14 @@ class Convolution(OpenclProcessing):
         else:
             raise ValueError("Please provide either arr= or shape=")
         if ndim < dim_min or ndim > dim_max:
-            raise ValueError("%s dimensions should be between %d and %d"
-                % (name, dim_min, dim_max)
+            raise ValueError(
+                "%s dimensions should be between %d and %d" % (name, dim_min, dim_max)
             )
         return ndim
 
     def _check_array(self, arr):
         # TODO allow cl.Buffer
-        if not(isinstance(arr, parray.Array) or isinstance(arr, np.ndarray)):
+        if not (isinstance(arr, parray.Array) or isinstance(arr, np.ndarray)):
             raise TypeError("Expected either pyopencl.array.Array or numpy.ndarray")
         # TODO composition with ImageProcessing/cast
         if arr.dtype != np.float32:
@@ -354,14 +363,12 @@ class Convolution(OpenclProcessing):
                 self.data_in = array
             data_in_ref = self.data_in
         if output is not None:
-            if not(isinstance(output, np.ndarray)):
+            if not (isinstance(output, np.ndarray)):
                 self._old_output_ref = self.data_out
                 self.data_out = output
         # Update OpenCL kernel arguments with new array references
         self.kernel_args = self._configure_kernel_args(
-            self.kernel_args,
-            data_in_ref,
-            self.data_out
+            self.kernel_args, data_in_ref, self.data_out
         )
 
     def _separable_convolution(self):
@@ -375,9 +382,7 @@ class Convolution(OpenclProcessing):
         # Batched: one kernel call in total
         opencl_kernel = self.kernels.get_kernel(self.use_case_kernels[axis])
         opencl_kernel_args = self._configure_kernel_args(
-            self.kernel_args,
-            input_ref,
-            output_ref
+            self.kernel_args, input_ref, output_ref
         )
         ev = opencl_kernel(*opencl_kernel_args)
         if self.profile:
@@ -398,9 +403,7 @@ class Convolution(OpenclProcessing):
             self.data_out = self._old_output_ref
             self._old_output_ref = None
         self.kernel_args = self._configure_kernel_args(
-            self.kernel_args,
-            self.data_in,
-            self.data_out
+            self.kernel_args, self.data_in, self.data_out
         )
 
     def _get_output(self, output):
@@ -436,7 +439,4 @@ class Convolution(OpenclProcessing):
         res = self._get_output(output)
         return res
 
-
     __call__ = convolve
-
-
diff --git a/src/silx/opencl/image.py b/src/silx/opencl/image.py
index 65e2d5e..ec30e66 100644
--- a/src/silx/opencl/image.py
+++ b/src/silx/opencl/image.py
@@ -1,9 +1,8 @@
-# -*- coding: utf-8 -*-
 #
 #    Project: silx
 #             https://github.com/silx-kit/silx
 #
-#    Copyright (C) 2012-2017 European Synchrotron Radiation Facility, Grenoble, France
+#    Copyright (C) 2012-2023 European Synchrotron Radiation Facility, Grenoble, France
 #
 #    Principal author:       Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
 #
@@ -28,8 +27,6 @@
 """A general purpose library for manipulating 2D images in 1 or 3 colors 
 
 """
-from __future__ import absolute_import, print_function, with_statement, division
-
 
 __author__ = "Jerome Kieffer"
 __license__ = "MIT"
@@ -40,7 +37,6 @@ __contact__ = "jerome.kieffer@esrf.fr"
 import os
 import logging
 import numpy
-from collections import OrderedDict
 from math import floor, ceil, sqrt, log
 
 from .common import pyopencl, kernel_workgroup_size
@@ -52,20 +48,30 @@ logger = logging.getLogger(__name__)
 
 
 class ImageProcessing(OpenclProcessing):
-
     kernel_files = ["cast", "map", "max_min", "histogram"]
 
-    converter = {numpy.dtype(numpy.uint8): "u8_to_float",
-                 numpy.dtype(numpy.int8): "s8_to_float",
-                 numpy.dtype(numpy.uint16): "u16_to_float",
-                 numpy.dtype(numpy.int16): "s16_to_float",
-                 numpy.dtype(numpy.uint32): "u32_to_float",
-                 numpy.dtype(numpy.int32): "s32_to_float",
-                 }
-
-    def __init__(self, shape=None, ncolors=1, template=None,
-                 ctx=None, devicetype="all", platformid=None, deviceid=None,
-                 block_size=None, memory=None, profile=False):
+    converter = {
+        numpy.dtype(numpy.uint8): "u8_to_float",
+        numpy.dtype(numpy.int8): "s8_to_float",
+        numpy.dtype(numpy.uint16): "u16_to_float",
+        numpy.dtype(numpy.int16): "s16_to_float",
+        numpy.dtype(numpy.uint32): "u32_to_float",
+        numpy.dtype(numpy.int32): "s32_to_float",
+    }
+
+    def __init__(
+        self,
+        shape=None,
+        ncolors=1,
+        template=None,
+        ctx=None,
+        devicetype="all",
+        platformid=None,
+        deviceid=None,
+        block_size=None,
+        memory=None,
+        profile=False,
+    ):
         """Constructor of the ImageProcessing class
 
         :param ctx: actual working context, left to None for automatic
@@ -79,9 +85,16 @@ class ImageProcessing(OpenclProcessing):
         :param profile: switch on profiling to be able to profile at the kernel
                          level, store profiling elements (makes code slightly slower)
         """
-        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
-                                  platformid=platformid, deviceid=deviceid,
-                                  block_size=block_size, memory=memory, profile=profile)
+        OpenclProcessing.__init__(
+            self,
+            ctx=ctx,
+            devicetype=devicetype,
+            platformid=platformid,
+            deviceid=deviceid,
+            block_size=block_size,
+            memory=memory,
+            profile=profile,
+        )
         if template is not None:
             shape = template.shape
             if len(shape) > 2:
@@ -94,36 +107,51 @@ class ImageProcessing(OpenclProcessing):
             self.ncolors = ncolors
             self.shape = shape
             assert shape is not None
-        self.buffer_shape = self.shape if self.ncolors == 1 else self.shape + (self.ncolors,)
+        self.buffer_shape = (
+            self.shape if self.ncolors == 1 else self.shape + (self.ncolors,)
+        )
         kernel_files = [os.path.join("image", i) for i in self.kernel_files]
-        self.compile_kernels(kernel_files,
-                             compile_options="-DNB_COLOR=%i" % self.ncolors)
+        self.compile_kernels(
+            kernel_files, compile_options="-DNB_COLOR=%i" % self.ncolors
+        )
         if self.ncolors == 1:
             img_shape = self.shape
         else:
             img_shape = self.shape + (self.ncolors,)
 
-        buffers = [BufferDescription("image0_d", img_shape, numpy.float32, None),
-                   BufferDescription("image1_d", img_shape, numpy.float32, None),
-                   BufferDescription("image2_d", img_shape, numpy.float32, None),
-                   BufferDescription("max_min_d", 2, numpy.float32, None),
-                   BufferDescription("cnt_d", 1, numpy.int32, None), ]
+        buffers = [
+            BufferDescription("image0_d", img_shape, numpy.float32, None),
+            BufferDescription("image1_d", img_shape, numpy.float32, None),
+            BufferDescription("image2_d", img_shape, numpy.float32, None),
+            BufferDescription("max_min_d", 2, numpy.float32, None),
+            BufferDescription("cnt_d", 1, numpy.int32, None),
+        ]
         # Temporary buffer for max-min reduction
-        self.wg_red = kernel_workgroup_size(self.program, self.kernels.max_min_reduction_stage1)
+        self.wg_red = kernel_workgroup_size(
+            self.program, self.kernels.max_min_reduction_stage1
+        )
         if self.wg_red > 1:
-            self.wg_red = min(self.wg_red,
-                              numpy.int32(1 << int(floor(log(sqrt(numpy.prod(self.shape)), 2)))))
-            tmp = BufferDescription("tmp_max_min_d", 2 * self.wg_red, numpy.float32, None)
+            self.wg_red = min(
+                self.wg_red,
+                numpy.int32(1 << int(floor(log(sqrt(numpy.prod(self.shape)), 2)))),
+            )
+            tmp = BufferDescription(
+                "tmp_max_min_d", 2 * self.wg_red, numpy.float32, None
+            )
             buffers.append(tmp)
         self.allocate_buffers(buffers, use_array=True)
         self.cl_mem["cnt_d"].fill(0)
 
     def __repr__(self):
-        return "ImageProcessing for shape=%s, %i colors initalized on %s" % \
-            (self.shape, self.ncolors, self.ctx.devices[0].name)
-
-    def _get_in_out_buffers(self, img=None, copy=True, out=None,
-                            out_dtype=None, out_size=None):
+        return "ImageProcessing for shape=%s, %i colors initalized on %s" % (
+            self.shape,
+            self.ncolors,
+            self.ctx.devices[0].name,
+        )
+
+    def _get_in_out_buffers(
+        self, img=None, copy=True, out=None, out_dtype=None, out_size=None
+    ):
         """Internal method used to select the proper buffers before processing.
 
         :param img: expects a numpy array or a pyopencl.array of dim 2 or 3
@@ -132,7 +160,7 @@ class ImageProcessing(OpenclProcessing):
         :param out_dtype: enforce the type of the output buffer (optional)
         :param out_size: enforce the size of the output buffer (optional)
         :return: input_buffer, output_buffer
-        
+
         Nota: this is not locked.
         """
         events = []
@@ -151,7 +179,9 @@ class ImageProcessing(OpenclProcessing):
             if out_dtype != numpy.float32 and out_size:
                 name = "%s_%s_d" % (numpy.dtype(out_dtype), out_size)
                 if name not in self.cl_mem:
-                    output_array = self.cl_mem[name] = pyopencl.array.empty(self.queue, (out_size,), out_dtype)
+                    output_array = self.cl_mem[name] = pyopencl.array.empty(
+                        self.queue, (out_size,), out_dtype
+                    )
                 else:
                     output_array = self.cl_mem[name]
             else:
@@ -161,7 +191,9 @@ class ImageProcessing(OpenclProcessing):
             input_array = self.cl_mem["image1_d"]
         if isinstance(img, pyopencl.array.Array):
             if copy:
-                evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["image1_d"].data, img.data)
+                evt = pyopencl.enqueue_copy(
+                    self.queue, self.cl_mem["image1_d"].data, img.data
+                )
                 input_array = self.cl_mem["image1_d"]
                 events.append(EventDescription("copy D->D", evt))
             else:
@@ -172,11 +204,19 @@ class ImageProcessing(OpenclProcessing):
             # assume this is numpy
             if img.dtype.itemsize > 4:
                 logger.warning("Casting to float32 on CPU")
-                evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["image1_d"].data, numpy.ascontiguousarray(img, numpy.float32))
+                evt = pyopencl.enqueue_copy(
+                    self.queue,
+                    self.cl_mem["image1_d"].data,
+                    numpy.ascontiguousarray(img, numpy.float32),
+                )
                 input_array = self.cl_mem["image1_d"]
                 events.append(EventDescription("cast+copy H->D", evt))
             else:
-                evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["image1_d"].data, numpy.ascontiguousarray(img))
+                evt = pyopencl.enqueue_copy(
+                    self.queue,
+                    self.cl_mem["image1_d"].data,
+                    numpy.ascontiguousarray(img),
+                )
                 input_array = self.cl_mem["image1_d"]
                 events.append(EventDescription("copy H->D", evt))
         if self.profile:
@@ -184,8 +224,8 @@ class ImageProcessing(OpenclProcessing):
         return input_array, output_array
 
     def to_float(self, img, copy=True, out=None):
-        """ Takes any array and convert it to a float array for ease of processing.
-        
+        """Takes any array and convert it to a float array for ease of processing.
+
         :param img: expects a numpy array or a pyopencl.array of dim 2 or 3
         :param copy: set to False to directly re-use a pyopencl array
         :param out: provide an output buffer to store the result
@@ -197,16 +237,23 @@ class ImageProcessing(OpenclProcessing):
             input_array, output_array = self._get_in_out_buffers(img, copy, out)
             if (img.dtype.itemsize > 4) or (img.dtype == numpy.float32):
                 # copy device -> device, already there as float32
-                ev = pyopencl.enqueue_copy(self.queue, output_array.data, input_array.data)
+                ev = pyopencl.enqueue_copy(
+                    self.queue, output_array.data, input_array.data
+                )
                 events.append(EventDescription("copy D->D", ev))
             else:
                 # Cast to float:
                 name = self.converter[img.dtype]
                 kernel = self.kernels.get_kernel(name)
-                ev = kernel(self.queue, (self.shape[1], self.shape[0]), None,
-                            input_array.data, output_array.data,
-                            numpy.int32(self.shape[1]), numpy.int32(self.shape[0])
-                            )
+                ev = kernel(
+                    self.queue,
+                    (self.shape[1], self.shape[0]),
+                    None,
+                    input_array.data,
+                    output_array.data,
+                    numpy.int32(self.shape[1]),
+                    numpy.int32(self.shape[0]),
+                )
                 events.append(EventDescription("cast %s" % name, ev))
 
         if self.profile:
@@ -221,14 +268,14 @@ class ImageProcessing(OpenclProcessing):
     def normalize(self, img, mini=0.0, maxi=1.0, copy=True, out=None):
         """Scale the intensity of the image so that the minimum is 0 and the
         maximum is 1.0 (or any value suggested).
-        
+
         :param img: numpy array or pyopencl array of dim 2 or 3 and of type float
         :param mini: Expected minimum value
         :param maxi: expected maxiumum value
         :param copy: set to False to use directly the input buffer
         :param out: provides an output buffer. prevents a copy D->H
-        
-        This uses a min/max reduction in two stages plus a map operation  
+
+        This uses a min/max reduction in two stages plus a map operation
         """
         assert img.shape == self.buffer_shape
         events = []
@@ -238,34 +285,55 @@ class ImageProcessing(OpenclProcessing):
             if self.wg_red == 1:
                 #  Probably on MacOS CPU WG==1 --> serial code.
                 kernel = self.kernels.get_kernel("max_min_serial")
-                evt = kernel(self.queue, (1,), (1,),
-                             input_array.data,
-                             size,
-                             self.cl_mem["max_min_d"].data)
+                evt = kernel(
+                    self.queue,
+                    (1,),
+                    (1,),
+                    input_array.data,
+                    size,
+                    self.cl_mem["max_min_d"].data,
+                )
                 ed = EventDescription("max_min_serial", evt)
                 events.append(ed)
             else:
                 stage1 = self.kernels.max_min_reduction_stage1
                 stage2 = self.kernels.max_min_reduction_stage2
                 local_mem = pyopencl.LocalMemory(int(self.wg_red * 8))
-                k1 = stage1(self.queue, (int(self.wg_red ** 2),), (int(self.wg_red),),
-                            input_array.data,
-                            self.cl_mem["tmp_max_min_d"].data,
-                            size,
-                            local_mem)
-                k2 = stage2(self.queue, (int(self.wg_red),), (int(self.wg_red),),
-                            self.cl_mem["tmp_max_min_d"].data,
-                            self.cl_mem["max_min_d"].data,
-                            local_mem)
-
-                events += [EventDescription("max_min_stage1", k1),
-                           EventDescription("max_min_stage2", k2)]
-
-            evt = self.kernels.normalize_image(self.queue, (self.shape[1], self.shape[0]), None,
-                                               input_array.data, output_array.data,
-                                               numpy.int32(self.shape[1]), numpy.int32(self.shape[0]),
-                                               self.cl_mem["max_min_d"].data,
-                                               numpy.float32(mini), numpy.float32(maxi))
+                k1 = stage1(
+                    self.queue,
+                    (int(self.wg_red**2),),
+                    (int(self.wg_red),),
+                    input_array.data,
+                    self.cl_mem["tmp_max_min_d"].data,
+                    size,
+                    local_mem,
+                )
+                k2 = stage2(
+                    self.queue,
+                    (int(self.wg_red),),
+                    (int(self.wg_red),),
+                    self.cl_mem["tmp_max_min_d"].data,
+                    self.cl_mem["max_min_d"].data,
+                    local_mem,
+                )
+
+                events += [
+                    EventDescription("max_min_stage1", k1),
+                    EventDescription("max_min_stage2", k2),
+                ]
+
+            evt = self.kernels.normalize_image(
+                self.queue,
+                (self.shape[1], self.shape[0]),
+                None,
+                input_array.data,
+                output_array.data,
+                numpy.int32(self.shape[1]),
+                numpy.int32(self.shape[0]),
+                self.cl_mem["max_min_d"].data,
+                numpy.float32(mini),
+                numpy.float32(maxi),
+            )
             events.append(EventDescription("normalize", evt))
         if self.profile:
             self.events += events
@@ -277,32 +345,32 @@ class ImageProcessing(OpenclProcessing):
             output_array.finish()
             return output_array
 
-    def histogram(self, img=None, nbins=255, range=None,
-                  log_scale=False, copy=True, out=None):
+    def histogram(
+        self, img=None, nbins=255, range=None, log_scale=False, copy=True, out=None
+    ):
         """Compute the histogram of a set of data.
-        
+
         :param img: input image. If None, use the one already on the device
         :param nbins: number of bins
-        :param range: the lower and upper range of the bins.  If not provided, 
-                    range is simply ``(a.min(), a.max())``.  Values outside the 
-                    range are ignored. The first element of the range must be 
+        :param range: the lower and upper range of the bins.  If not provided,
+                    range is simply ``(a.min(), a.max())``.  Values outside the
+                    range are ignored. The first element of the range must be
                     less than or equal to the second.
-        :param log_scale: perform the binning in lograrithmic scale. 
+        :param log_scale: perform the binning in lograrithmic scale.
                          Open to extension
         :param copy: unset to directly use the input buffer without copy
-        :param out: use a provided array for offering the result 
+        :param out: use a provided array for offering the result
         :return: histogram (size=nbins), edges (size=nbins+1)
-        API similar to numpy  
+        API similar to numpy
         """
         assert img.shape == self.buffer_shape
 
         input_array = self.to_float(img, copy=copy, out=self.cl_mem["image0_d"])
         events = []
         with self.sem:
-            input_array, output_array = self._get_in_out_buffers(input_array, copy=False,
-                                                                 out=out,
-                                                                 out_dtype=numpy.int32,
-                                                                 out_size=nbins)
+            input_array, output_array = self._get_in_out_buffers(
+                input_array, copy=False, out=out, out_dtype=numpy.int32, out_size=nbins
+            )
 
             if range is None:
                 # measure actually the bounds
@@ -311,27 +379,43 @@ class ImageProcessing(OpenclProcessing):
                     #  Probably on MacOS CPU WG==1 --> serial code.
                     kernel = self.kernels.get_kernel("max_min_serial")
 
-                    evt = kernel(self.queue, (1,), (1,),
-                                 input_array.data,
-                                 size,
-                                 self.cl_mem["max_min_d"].data)
+                    evt = kernel(
+                        self.queue,
+                        (1,),
+                        (1,),
+                        input_array.data,
+                        size,
+                        self.cl_mem["max_min_d"].data,
+                    )
                     events.append(EventDescription("max_min_serial", evt))
                 else:
                     stage1 = self.kernels.max_min_reduction_stage1
                     stage2 = self.kernels.max_min_reduction_stage2
-                    local_mem = pyopencl.LocalMemory(int(self.wg_red * 2 * numpy.dtype("float32").itemsize))
-                    k1 = stage1(self.queue, (int(self.wg_red ** 2),), (int(self.wg_red),),
-                                input_array.data,
-                                self.cl_mem["tmp_max_min_d"].data,
-                                size,
-                                local_mem)
-                    k2 = stage2(self.queue, (int(self.wg_red),), (int(self.wg_red),),
-                                self.cl_mem["tmp_max_min_d"].data,
-                                self.cl_mem["max_min_d"].data,
-                                local_mem)
-
-                    events += [EventDescription("max_min_stage1", k1),
-                               EventDescription("max_min_stage2", k2)]
+                    local_mem = pyopencl.LocalMemory(
+                        int(self.wg_red * 2 * numpy.dtype("float32").itemsize)
+                    )
+                    k1 = stage1(
+                        self.queue,
+                        (int(self.wg_red**2),),
+                        (int(self.wg_red),),
+                        input_array.data,
+                        self.cl_mem["tmp_max_min_d"].data,
+                        size,
+                        local_mem,
+                    )
+                    k2 = stage2(
+                        self.queue,
+                        (int(self.wg_red),),
+                        (int(self.wg_red),),
+                        self.cl_mem["tmp_max_min_d"].data,
+                        self.cl_mem["max_min_d"].data,
+                        local_mem,
+                    )
+
+                    events += [
+                        EventDescription("max_min_stage1", k1),
+                        EventDescription("max_min_stage2", k2),
+                    ]
                 maxi, mini = self.cl_mem["max_min_d"].get()
             else:
                 mini = numpy.float32(min(range))
@@ -341,13 +425,17 @@ class ImageProcessing(OpenclProcessing):
             tmp_size = nb_engines * nbins
             name = "tmp_int32_%s_d" % (tmp_size)
             if name not in self.cl_mem:
-                tmp_array = self.cl_mem[name] = pyopencl.array.empty(self.queue, (tmp_size,), numpy.int32)
+                tmp_array = self.cl_mem[name] = pyopencl.array.empty(
+                    self.queue, (tmp_size,), numpy.int32
+                )
             else:
                 tmp_array = self.cl_mem[name]
 
             edge_name = "tmp_float32_%s_d" % (nbins + 1)
             if edge_name not in self.cl_mem:
-                edges_array = self.cl_mem[edge_name] = pyopencl.array.empty(self.queue, (nbins + 1,), numpy.float32)
+                edges_array = self.cl_mem[edge_name] = pyopencl.array.empty(
+                    self.queue, (nbins + 1,), numpy.float32
+                )
             else:
                 edges_array = self.cl_mem[edge_name]
 
@@ -359,21 +447,27 @@ class ImageProcessing(OpenclProcessing):
             else:
                 map_operation = numpy.int32(0)
             kernel = self.kernels.get_kernel("histogram")
-            wg = min(device.max_work_group_size,
-                     1 << (int(ceil(log(nbins, 2)))),
-                     self.kernels.max_workgroup_size(kernel))
-            evt = kernel(self.queue, (wg * nb_engines,), (wg,),
-                         input_array.data,
-                         numpy.int32(input_array.size),
-                         mini,
-                         maxi,
-                         map_operation,
-                         output_array.data,
-                         edges_array.data,
-                         numpy.int32(nbins),
-                         tmp_array.data,
-                         self.cl_mem["cnt_d"].data,
-                         shared)
+            wg = min(
+                device.max_work_group_size,
+                1 << (int(ceil(log(nbins, 2)))),
+                self.kernels.max_workgroup_size(kernel),
+            )
+            evt = kernel(
+                self.queue,
+                (wg * nb_engines,),
+                (wg,),
+                input_array.data,
+                numpy.int32(input_array.size),
+                mini,
+                maxi,
+                map_operation,
+                output_array.data,
+                edges_array.data,
+                numpy.int32(nbins),
+                tmp_array.data,
+                self.cl_mem["cnt_d"].data,
+                shared,
+            )
             events.append(EventDescription("histogram", evt))
 
         if self.profile:
diff --git a/src/silx/opencl/linalg.py b/src/silx/opencl/linalg.py
index a64122a..573ebce 100644
--- a/src/silx/opencl/linalg.py
+++ b/src/silx/opencl/linalg.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding: utf-8
 # /*##########################################################################
 #
 # Copyright (c) 2016 European Synchrotron Radiation Facility
@@ -25,8 +24,6 @@
 # ###########################################################################*/
 """Module for basic linear algebra in OpenCL"""
 
-from __future__ import absolute_import, print_function, with_statement, division
-
 __authors__ = ["P. Paleo"]
 __license__ = "MIT"
 __date__ = "01/08/2019"
@@ -37,14 +34,23 @@ from .common import pyopencl
 from .processing import EventDescription, OpenclProcessing
 
 import pyopencl.array as parray
+
 cl = pyopencl
 
 
 class LinAlg(OpenclProcessing):
-
     kernel_files = ["linalg.cl"]
 
-    def __init__(self, shape, do_checks=False, ctx=None, devicetype="all", platformid=None, deviceid=None, profile=False):
+    def __init__(
+        self,
+        shape,
+        do_checks=False,
+        ctx=None,
+        devicetype="all",
+        platformid=None,
+        deviceid=None,
+        profile=False,
+    ):
         """
         Create a "Linear Algebra" plan for a given image shape.
 
@@ -59,32 +65,34 @@ class LinAlg(OpenclProcessing):
                         store profiling elements (makes code slightly slower)
 
         """
-        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
-                                  platformid=platformid, deviceid=deviceid,
-                                  profile=profile)
+        OpenclProcessing.__init__(
+            self,
+            ctx=ctx,
+            devicetype=devicetype,
+            platformid=platformid,
+            deviceid=deviceid,
+            profile=profile,
+        )
 
         self.d_gradient = parray.empty(self.queue, shape, np.complex64)
         self.d_gradient.fill(np.complex64(0.0))
         self.d_image = parray.empty(self.queue, shape, np.float32)
         self.d_image.fill(np.float32(0.0))
-        self.add_to_cl_mem({
-            "d_gradient": self.d_gradient,
-            "d_image": self.d_image
-        })
+        self.add_to_cl_mem({"d_gradient": self.d_gradient, "d_image": self.d_image})
 
         self.wg2D = None
         self.shape = shape
-        self.ndrange2D = (
-            int(self.shape[1]),
-            int(self.shape[0])
-        )
+        self.ndrange2D = (int(self.shape[1]), int(self.shape[0]))
         self.do_checks = bool(do_checks)
         OpenclProcessing.compile_kernels(self, self.kernel_files)
 
     @staticmethod
     def check_array(array, dtype, shape, arg_name):
         if array.shape != shape or array.dtype != dtype:
-            raise ValueError("%s should be a %s array of type %s" %(arg_name, str(shape), str(dtype)))
+            raise ValueError(
+                "%s should be a %s array of type %s"
+                % (arg_name, str(shape), str(dtype))
+            )
 
     def get_data_references(self, src, dst, default_src_ref, default_dst_ref):
         """
@@ -100,7 +108,9 @@ class LinAlg(OpenclProcessing):
             elif isinstance(dst, cl.Buffer):
                 dst_ref = dst
             else:
-                raise ValueError("dst should be either pyopencl.array.Array or pyopencl.Buffer")
+                raise ValueError(
+                    "dst should be either pyopencl.array.Array or pyopencl.Buffer"
+                )
         else:
             dst_ref = default_dst_ref
 
@@ -130,21 +140,15 @@ class LinAlg(OpenclProcessing):
             self.check_array(image, np.float32, self.shape, "image")
             if dst is not None:
                 self.check_array(dst, np.complex64, self.shape, "dst")
-        img_ref, grad_ref = self.get_data_references(image, dst, self.d_image.data, self.d_gradient.data)
+        img_ref, grad_ref = self.get_data_references(
+            image, dst, self.d_image.data, self.d_gradient.data
+        )
 
         # Prepare the kernel call
-        kernel_args = [
-            img_ref,
-            grad_ref,
-            n_x,
-            n_y
-        ]
+        kernel_args = [img_ref, grad_ref, n_x, n_y]
         # Call the gradient kernel
         evt = self.kernels.kern_gradient2D(
-            self.queue,
-            self.ndrange2D,
-            self.wg2D,
-            *kernel_args
+            self.queue, self.ndrange2D, self.wg2D, *kernel_args
         )
         self.events.append(EventDescription("gradient2D", evt))
         # TODO: should the wait be done in any case ?
@@ -187,21 +191,15 @@ class LinAlg(OpenclProcessing):
             self.check_array(gradient, np.complex64, self.shape, "gradient")
             if dst is not None:
                 self.check_array(dst, np.float32, self.shape, "dst")
-        grad_ref, img_ref = self.get_data_references(gradient, dst, self.d_gradient.data, self.d_image.data)
+        grad_ref, img_ref = self.get_data_references(
+            gradient, dst, self.d_gradient.data, self.d_image.data
+        )
 
         # Prepare the kernel call
-        kernel_args = [
-            grad_ref,
-            img_ref,
-            n_x,
-            n_y
-        ]
+        kernel_args = [grad_ref, img_ref, n_x, n_y]
         # Call the gradient kernel
         evt = self.kernels.kern_divergence2D(
-            self.queue,
-            self.ndrange2D,
-            self.wg2D,
-            *kernel_args
+            self.queue, self.ndrange2D, self.wg2D, *kernel_args
         )
         self.events.append(EventDescription("divergence2D", evt))
         # TODO: should the wait be done in any case ?
diff --git a/src/silx/opencl/medfilt.py b/src/silx/opencl/medfilt.py
index d4e425b..a18c5a4 100644
--- a/src/silx/opencl/medfilt.py
+++ b/src/silx/opencl/medfilt.py
@@ -1,9 +1,8 @@
-# -*- coding: utf-8 -*-
 #
 #    Project: Azimuthal integration
 #             https://github.com/silx-kit/pyFAI
 #
-#    Copyright (C) 2012-2017 European Synchrotron Radiation Facility, Grenoble, France
+#    Copyright (C) 2012-2023 European Synchrotron Radiation Facility, Grenoble, France
 #
 #    Principal author:       Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
 #
@@ -31,8 +30,6 @@ The target is to mimic the signature of scipy.signal.medfilt and scipy.medfilt2
 
 The first implementation targets 2D implementation where this operation is costly (~10s/2kx2k image)
 """
-from __future__ import absolute_import, print_function, with_statement, division
-
 
 __author__ = "Jerome Kieffer"
 __license__ = "MIT"
@@ -42,7 +39,6 @@ __contact__ = "jerome.kieffer@esrf.fr"
 
 import logging
 import numpy
-from collections import OrderedDict
 
 from .common import pyopencl, kernel_workgroup_size
 from .processing import EventDescription, OpenclProcessing, BufferDescription
@@ -56,23 +52,33 @@ logger = logging.getLogger(__name__)
 
 class MedianFilter2D(OpenclProcessing):
     """A class for doing median filtering using OpenCL"""
+
     buffers = [
-               BufferDescription("result", 1, numpy.float32, mf.WRITE_ONLY),
-               BufferDescription("image_raw", 1, numpy.float32, mf.READ_ONLY),
-               BufferDescription("image", 1, numpy.float32, mf.READ_WRITE),
-               ]
+        BufferDescription("result", 1, numpy.float32, mf.WRITE_ONLY),
+        BufferDescription("image_raw", 1, numpy.float32, mf.READ_ONLY),
+        BufferDescription("image", 1, numpy.float32, mf.READ_WRITE),
+    ]
     kernel_files = ["preprocess.cl", "bitonic.cl", "medfilt.cl"]
-    mapping = {numpy.int8: "s8_to_float",
-               numpy.uint8: "u8_to_float",
-               numpy.int16: "s16_to_float",
-               numpy.uint16: "u16_to_float",
-               numpy.uint32: "u32_to_float",
-               numpy.int32: "s32_to_float"}
-
-    def __init__(self, shape, kernel_size=(3, 3),
-                 ctx=None, devicetype="all", platformid=None, deviceid=None,
-                 block_size=None, profile=False
-                 ):
+    mapping = {
+        numpy.int8: "s8_to_float",
+        numpy.uint8: "u8_to_float",
+        numpy.int16: "s16_to_float",
+        numpy.uint16: "u16_to_float",
+        numpy.uint32: "u32_to_float",
+        numpy.int32: "s32_to_float",
+    }
+
+    def __init__(
+        self,
+        shape,
+        kernel_size=(3, 3),
+        ctx=None,
+        devicetype="all",
+        platformid=None,
+        deviceid=None,
+        block_size=None,
+        profile=False,
+    ):
         """Constructor of the OpenCL 2D median filtering class
 
         :param shape: shape of the images to treat
@@ -86,34 +92,56 @@ class MedianFilter2D(OpenclProcessing):
         :param profile: switch on profiling to be able to profile at the kernel level,
                         store profiling elements (makes code slightly slower)
         """
-        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
-                                  platformid=platformid, deviceid=deviceid,
-                                  block_size=block_size, profile=profile)
+        OpenclProcessing.__init__(
+            self,
+            ctx=ctx,
+            devicetype=devicetype,
+            platformid=platformid,
+            deviceid=deviceid,
+            block_size=block_size,
+            profile=profile,
+        )
         self.shape = shape
         self.size = self.shape[0] * self.shape[1]
         self.kernel_size = self.calc_kernel_size(kernel_size)
         self.workgroup_size = (self.calc_wg(self.kernel_size), 1)  # 3D kernel
-        self.buffers = [BufferDescription(i.name, i.size * self.size, i.dtype, i.flags)
-                        for i in self.__class__.buffers]
+        self.buffers = [
+            BufferDescription(i.name, i.size * self.size, i.dtype, i.flags)
+            for i in self.__class__.buffers
+        ]
 
         self.allocate_buffers()
         self.local_mem = self._get_local_mem(self.workgroup_size[0])
-        OpenclProcessing.compile_kernels(self, self.kernel_files, "-D NIMAGE=%i" % self.size)
+        OpenclProcessing.compile_kernels(
+            self, self.kernel_files, "-D NIMAGE=%i" % self.size
+        )
         self.set_kernel_arguments()
 
     def set_kernel_arguments(self):
-        """Parametrize all kernel arguments
-        """
+        """Parametrize all kernel arguments"""
         for val in self.mapping.values():
-            self.cl_kernel_args[val] = OrderedDict(((i, self.cl_mem[i]) for i in ("image_raw", "image")))
-        self.cl_kernel_args["medfilt2d"] = OrderedDict((("image", self.cl_mem["image"]),
-                                                        ("result", self.cl_mem["result"]),
-                                                        ("local", self.local_mem),
-                                                        ("khs1", numpy.int32(self.kernel_size[0] // 2)),  # Kernel half-size along dim1 (lines)
-                                                        ("khs2", numpy.int32(self.kernel_size[1] // 2)),  # Kernel half-size along dim2 (columns)
-                                                        ("height", numpy.int32(self.shape[0])),  # Image size along dim1 (lines)
-                                                        ("width", numpy.int32(self.shape[1]))))
-#                                                         ('debug', self.cl_mem["debug"])))  # Image size along dim2 (columns))
+            self.cl_kernel_args[val] = dict(
+                ((i, self.cl_mem[i]) for i in ("image_raw", "image"))
+            )
+        self.cl_kernel_args["medfilt2d"] = dict(
+            (
+                ("image", self.cl_mem["image"]),
+                ("result", self.cl_mem["result"]),
+                ("local", self.local_mem),
+                (
+                    "khs1",
+                    numpy.int32(self.kernel_size[0] // 2),
+                ),  # Kernel half-size along dim1 (lines)
+                (
+                    "khs2",
+                    numpy.int32(self.kernel_size[1] // 2),
+                ),  # Kernel half-size along dim2 (columns)
+                ("height", numpy.int32(self.shape[0])),  # Image size along dim1 (lines)
+                ("width", numpy.int32(self.shape[1])),
+            )
+        )
+
+    #                                                ('debug', self.cl_mem["debug"])))  # Image size along dim2 (columns))
 
     def _get_local_mem(self, wg):
         return pyopencl.LocalMemory(wg * 32)  # 4byte per float, 8 element per thread
@@ -128,13 +156,26 @@ class MedianFilter2D(OpenclProcessing):
         dest_type = numpy.dtype([i.dtype for i in self.buffers if i.name == dest][0])
         events = []
         if (data.dtype == dest_type) or (data.dtype.itemsize > dest_type.itemsize):
-            copy_image = pyopencl.enqueue_copy(self.queue, self.cl_mem[dest], numpy.ascontiguousarray(data, dest_type))
+            copy_image = pyopencl.enqueue_copy(
+                self.queue, self.cl_mem[dest], numpy.ascontiguousarray(data, dest_type)
+            )
             events.append(EventDescription("copy H->D %s" % dest, copy_image))
         else:
-            copy_image = pyopencl.enqueue_copy(self.queue, self.cl_mem["image_raw"], numpy.ascontiguousarray(data))
+            copy_image = pyopencl.enqueue_copy(
+                self.queue, self.cl_mem["image_raw"], numpy.ascontiguousarray(data)
+            )
             kernel = getattr(self.program, self.mapping[data.dtype.type])
-            cast_to_float = kernel(self.queue, (self.size,), None, self.cl_mem["image_raw"], self.cl_mem[dest])
-            events += [EventDescription("copy H->D %s" % dest, copy_image), EventDescription("cast to float", cast_to_float)]
+            cast_to_float = kernel(
+                self.queue,
+                (self.size,),
+                None,
+                self.cl_mem["image_raw"],
+                self.cl_mem[dest],
+            )
+            events += [
+                EventDescription("copy H->D %s" % dest, copy_image),
+                EventDescription("cast to float", cast_to_float),
+            ]
         if self.profile:
             self.events += events
 
@@ -183,7 +224,9 @@ class MedianFilter2D(OpenclProcessing):
         amws = kernel_workgroup_size(self.program, "medfilt2d")
         logger.warning("max actual workgroup size: %s, expected: %s", amws, wg)
         if wg > amws:
-            raise RuntimeError("Workgroup size is too big for medfilt2d: %s>%s" % (wg, amws))
+            raise RuntimeError(
+                "Workgroup size is too big for medfilt2d: %s>%s" % (wg, amws)
+            )
 
         localmem = self._get_local_mem(wg)
 
@@ -200,11 +243,11 @@ class MedianFilter2D(OpenclProcessing):
             kwargs["khs2"] = kernel_half_size[1]
             kwargs["height"] = numpy.int32(image.shape[0])
             kwargs["width"] = numpy.int32(image.shape[1])
-#             for k, v in kwargs.items():
-#                 print("%s: %s (%s)" % (k, v, type(v)))
-            mf2d = self.kernels.medfilt2d(self.queue,
-                                          (wg, image.shape[1]),
-                                          (wg, 1), *list(kwargs.values()))
+            #             for k, v in kwargs.items():
+            #                 print("%s: %s (%s)" % (k, v, type(v)))
+            mf2d = self.kernels.medfilt2d(
+                self.queue, (wg, image.shape[1]), (wg, 1), *list(kwargs.values())
+            )
             events.append(EventDescription("median filter 2d", mf2d))
 
             result = numpy.empty(image.shape, numpy.float32)
@@ -214,12 +257,12 @@ class MedianFilter2D(OpenclProcessing):
         if self.profile:
             self.events += events
         return result
+
     __call__ = medfilt2d
 
     @staticmethod
     def calc_kernel_size(kernel_size):
-        """format the kernel size to be a 2-length numpy array of int32
-        """
+        """format the kernel size to be a 2-length numpy array of int32"""
         kernel_size = numpy.asarray(kernel_size, dtype=numpy.int32)
         if kernel_size.shape == ():
             kernel_size = numpy.repeat(kernel_size.item(), 2).astype(numpy.int32)
@@ -252,7 +295,7 @@ class _MedFilt2d(object):
 
         * The filling mode in scipy.signal.medfilt2d is zero-padding
         * This implementation is equivalent to:
-            scipy.ndimage.filters.median_filter(ary, kernel_size, mode="nearest")
+            scipy.ndimage.median_filter(ary, kernel_size, mode="nearest")
 
         """
         image = numpy.atleast_2d(ary)
@@ -266,4 +309,5 @@ class _MedFilt2d(object):
             cls.median_filter = MedianFilter2D(new_shape, kernel_size, ctx=ctx)
         return cls.median_filter.medfilt2d(image, kernel_size=kernel_size)
 
+
 medfilt2d = _MedFilt2d.medfilt2d
diff --git a/src/silx/opencl/processing.py b/src/silx/opencl/processing.py
index 8b81f7f..6db21d0 100644
--- a/src/silx/opencl/processing.py
+++ b/src/silx/opencl/processing.py
@@ -1,10 +1,9 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 #
 #    Project: S I L X project
 #             https://github.com/silx-kit/silx
 #
-#    Copyright (C) 2012-2018 European Synchrotron Radiation Facility, Grenoble, France
+#    Copyright (C) 2012-2023 European Synchrotron Radiation Facility, Grenoble, France
 #
 #    Principal author:       Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
 #
@@ -38,22 +37,32 @@ __author__ = "Jerome Kieffer"
 __contact__ = "Jerome.Kieffer@ESRF.eu"
 __license__ = "MIT"
 __copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
-__date__ = "02/03/2021"
+__date__ = "09/11/2022"
 __status__ = "stable"
 
 import sys
 import os
 import logging
 import gc
-from collections import namedtuple, OrderedDict
+from collections import namedtuple
 import numpy
 import threading
-from .common import ocl, pyopencl, release_cl_buffers, query_kernel_info, allocate_texture, check_textures_availability
+from .common import (
+    ocl,
+    pyopencl,
+    release_cl_buffers,
+    query_kernel_info,
+    allocate_texture,
+    check_textures_availability,
+)
 from .utils import concatenate_cl_kernel
 import platform
 
 BufferDescription = namedtuple("BufferDescription", ["name", "size", "dtype", "flags"])
-EventDescription = namedtuple("EventDescription", ["name", "event"])
+EventDescription = namedtuple(
+    "EventDescription", ["name", "event"]
+)  # Deprecated, please use ProfileDescription
+ProfileDescription = namedtuple("ProfileDescription", ["name", "start", "stop"])
 
 logger = logging.getLogger(__name__)
 
@@ -72,8 +81,9 @@ class KernelContainer(object):
 
     def get_kernels(self):
         "return the dictionary with all kernels"
-        return dict(item for item in self.__dict__.items()
-                    if not item[0].startswith("_"))
+        return dict(
+            item for item in self.__dict__.items() if not item[0].startswith("_")
+        )
 
     def get_kernel(self, name):
         "get a kernel from its name"
@@ -96,7 +106,9 @@ class KernelContainer(object):
         else:
             kernel = self.get_kernel(kernel_name)
 
-        return query_kernel_info(self._program, kernel, "PREFERRED_WORK_GROUP_SIZE_MULTIPLE")
+        return query_kernel_info(
+            self._program, kernel, "PREFERRED_WORK_GROUP_SIZE_MULTIPLE"
+        )
 
 
 class OpenclProcessing(object):
@@ -108,14 +120,24 @@ class OpenclProcessing(object):
     * Functions to compile kernels, cache them and clean them
     * helper functions to clone the object
     """
+
     # Example of how to create an output buffer of 10 floats
-    buffers = [BufferDescription("output", 10, numpy.float32, None),
-               ]
+    buffers = [
+        BufferDescription("output", 10, numpy.float32, None),
+    ]
     # list of kernel source files to be concatenated before compilation of the program
     kernel_files = []
 
-    def __init__(self, ctx=None, devicetype="all", platformid=None, deviceid=None,
-                 block_size=None, memory=None, profile=False):
+    def __init__(
+        self,
+        ctx=None,
+        devicetype="all",
+        platformid=None,
+        deviceid=None,
+        block_size=None,
+        memory=None,
+        profile=False,
+    ):
         """Constructor of the abstract OpenCL processing class
 
         :param ctx: actual working context, left to None for automatic
@@ -140,9 +162,12 @@ class OpenclProcessing(object):
         if ctx:
             self.ctx = ctx
         else:
-            self.ctx = ocl.create_context(devicetype=devicetype,
-                                          platformid=platformid, deviceid=deviceid,
-                                          memory=memory)
+            self.ctx = ocl.create_context(
+                devicetype=devicetype,
+                platformid=platformid,
+                deviceid=deviceid,
+                memory=memory,
+            )
         device_name = self.ctx.devices[0].name.strip()
         platform_name = self.ctx.devices[0].platform.name.strip()
         platform = ocl.get_platform(platform_name)
@@ -158,8 +183,7 @@ class OpenclProcessing(object):
         return check_textures_availability(self.ctx)
 
     def __del__(self):
-        """Destructor: release all buffers and programs
-        """
+        """Destructor: release all buffers and programs"""
         try:
             self.reset_log()
             self.free_kernels()
@@ -201,19 +225,27 @@ class OpenclProcessing(object):
             ualloc = 0
             for buf in buffers:
                 ualloc += numpy.dtype(buf.dtype).itemsize * numpy.prod(buf.size)
-            logger.info("%.3fMB are needed on device: %s,  which has %.3fMB",
-                        ualloc / 1.0e6, self.device, self.device.memory / 1.0e6)
+            logger.info(
+                "%.3fMB are needed on device: %s,  which has %.3fMB",
+                ualloc / 1.0e6,
+                self.device,
+                self.device.memory / 1.0e6,
+            )
 
             if ualloc >= self.device.memory:
-                raise MemoryError("Fatal error in allocate_buffers. Not enough "
-                                  " device memory for buffers (%lu requested, %lu available)"
-                                  % (ualloc, self.device.memory))
+                raise MemoryError(
+                    "Fatal error in allocate_buffers. Not enough "
+                    " device memory for buffers (%lu requested, %lu available)"
+                    % (ualloc, self.device.memory)
+                )
 
             # do the allocation
             try:
                 if use_array:
                     for buf in buffers:
-                        mem[buf.name] = pyopencl.array.empty(self.queue, buf.size, buf.dtype)
+                        mem[buf.name] = pyopencl.array.empty(
+                            self.queue, buf.size, buf.dtype
+                        )
                 else:
                     for buf in buffers:
                         size = numpy.dtype(buf.dtype).itemsize * numpy.prod(buf.size)
@@ -241,8 +273,7 @@ class OpenclProcessing(object):
         return self.kernels.max_workgroup_size(kernel_name)
 
     def free_buffers(self):
-        """free all device.memory allocated on the device
-        """
+        """free all device.memory allocated on the device"""
         with self.sem:
             for key, buf in list(self.cl_mem.items()):
                 if buf is not None:
@@ -272,20 +303,22 @@ class OpenclProcessing(object):
         compile_options = compile_options or self.get_compiler_options()
         logger.info("Compiling file %s with options %s", kernel_files, compile_options)
         try:
-            self.program = pyopencl.Program(self.ctx, kernel_src).build(options=compile_options)
+            self.program = pyopencl.Program(self.ctx, kernel_src).build(
+                options=compile_options
+            )
         except (pyopencl.MemoryError, pyopencl.LogicError) as error:
             raise MemoryError(error)
         else:
             self.kernels = KernelContainer(self.program)
 
     def free_kernels(self):
-        """Free all kernels
-        """
+        """Free all kernels"""
         for kernel in self.cl_kernel_args:
             self.cl_kernel_args[kernel] = []
         self.kernels = None
         self.program = None
 
+    # Methods about Profiling
     def set_profiling(self, value=True):
         """Switch On/Off the profiling flag of the command queue to allow debugging
 
@@ -300,85 +333,121 @@ class OpenclProcessing(object):
                 if self.queue is not None:
                     self.queue.finish()
                 if self.profile:
-                    self.queue = pyopencl.CommandQueue(self.ctx,
-                        properties=pyopencl.command_queue_properties.PROFILING_ENABLE)
+                    self.queue = pyopencl.CommandQueue(
+                        self.ctx,
+                        properties=pyopencl.command_queue_properties.PROFILING_ENABLE,
+                    )
                 else:
                     self.queue = pyopencl.CommandQueue(self.ctx)
+                # Update all memory-objects with the new queue:
+                for obj, cl_obj in list(self.cl_mem.items()):
+                    if isinstance(cl_obj, pyopencl.array.Array):
+                        self.cl_mem[obj] = cl_obj.with_queue(self.queue)
 
     def profile_add(self, event, desc):
         """
         Add an OpenCL event to the events lists, if profiling is enabled.
 
-        :param event: silx.opencl.processing.EventDescription.
+        :param event: pyopencl.NanyEvent.
         :param desc: event description
         """
         if self.profile:
-            self.events.append(EventDescription(desc, event))
-
-    def allocate_texture(self, shape, hostbuf=None, support_1D=False):
-        return allocate_texture(self.ctx, shape, hostbuf=hostbuf, support_1D=support_1D)
+            try:
+                profile = event.profile
+                self.events.append(ProfileDescription(desc, profile.start, profile.end))
+            except Exception:
+                # Probably the driver does not support profiling
+                pass
 
-    def transfer_to_texture(self, arr, tex_ref):
+    def profile_multi(self, event_lists):
         """
-        Transfer an array to a texture.
+        Extract profiling info from several OpenCL event, if profiling is enabled.
 
-        :param arr: Input array. Can be a numpy array or a pyopencl array.
-        :param tex_ref: texture reference (pyopencl._cl.Image).
+        :param event_lists: list of ("desc", pyopencl.NanyEvent).
         """
-        copy_args = [self.queue, tex_ref, arr]
-        shp = arr.shape
-        ndim = arr.ndim
-        if ndim == 1:
-            # pyopencl and OpenCL < 1.2 do not support image1d_t
-            # force 2D with one row in this case
-            # ~ ndim = 2
-            shp = (1,) + shp
-        copy_kwargs = {"origin":(0,) * ndim, "region": shp[::-1]}
-        if not(isinstance(arr, numpy.ndarray)):  # assuming pyopencl.array.Array
-            # D->D copy
-            copy_args[2] = arr.data
-            copy_kwargs["offset"] = 0
-        ev = pyopencl.enqueue_copy(*copy_args, **copy_kwargs)
-        self.profile_add(ev, "Transfer to texture")
+        if self.profile:
+            for event_desc in event_lists:
+                if isinstance(event_desc, ProfileDescription):
+                    self.events.append(event_desc)
+                else:
+                    if (
+                        isinstance(event_desc, EventDescription)
+                        or "__len__" in dir(event_desc)
+                        and len(event_desc) == 2
+                    ):
+                        desc, event = event_desc
+                    else:
+                        desc = "?"
+                        event = event_desc
+                    try:
+                        profile = event.profile
+                        start = profile.start
+                        end = profile.end
+                    except Exception:
+                        # probably an unfinished job ... use old-style.
+                        self.events.append(event_desc)
+                    else:
+                        self.events.append(ProfileDescription(desc, start, end))
 
     def log_profile(self, stats=False):
         """If we are in profiling mode, prints out all timing for every single OpenCL call
-        
+
         :param stats: if True, prints the statistics on each kernel instead of all execution timings
         :return: list of lines to print
         """
         total_time = 0.0
         out = [""]
         if stats:
-            stats = OrderedDict()
-            out.append(f"OpenCL kernel profiling statistics in milliseconds for: {self.__class__.__name__}")
-            out.append(f"{'Kernel name':>50} (count):      min   median      max     mean      std")
+            stats = {}
+            out.append(
+                f"OpenCL kernel profiling statistics in milliseconds for: {self.__class__.__name__}"
+            )
+            out.append(
+                f"{'Kernel name':>50} (count):      min   median      max     mean      std"
+            )
         else:
             stats = None
             out.append(f"Profiling info for OpenCL: {self.__class__.__name__}")
 
         if self.profile:
             for e in self.events:
-                if "__len__" in dir(e) and len(e) >= 2:
+                if isinstance(e, ProfileDescription):
+                    name = e[0]
+                    t0 = e[1]
+                    t1 = e[2]
+                elif (
+                    isinstance(e, EventDescription)
+                    or "__len__" in dir(e)
+                    and len(e) == 2
+                ):
                     name = e[0]
                     pr = e[1].profile
                     t0 = pr.start
                     t1 = pr.end
-                    et = 1e-6 * (t1 - t0)
-                    total_time += et
-                    if stats is None:
-                        out.append(f"{name:>50}        : {et:.3f}ms")
+                else:
+                    name = "?"
+                    t0 = e.profile.start
+                    t1 = e.profile.end
+
+                et = 1e-6 * (t1 - t0)
+                total_time += et
+                if stats is None:
+                    out.append(f"{name:>50}        : {et:.3f}ms")
+                else:
+                    if name in stats:
+                        stats[name].append(et)
                     else:
-                        if name in stats:
-                            stats[name].append(et)
-                        else:
-                            stats[name] = [et]
+                        stats[name] = [et]
             if stats is not None:
                 for k, v in stats.items():
                     n = numpy.array(v)
-                    out.append(f"{k:>50} ({len(v):5}): {n.min():8.3f} {numpy.median(n):8.3f} {n.max():8.3f} {n.mean():8.3f} {n.std():8.3f}")
+                    out.append(
+                        f"{k:>50} ({len(v):5}): {n.min():8.3f} {numpy.median(n):8.3f} {n.max():8.3f} {n.mean():8.3f} {n.std():8.3f}"
+                    )
             out.append("_" * 80)
-            out.append(f"{'Total OpenCL execution time':>50}        : {total_time:.3f}ms")
+            out.append(
+                f"{'Total OpenCL execution time':>50}        : {total_time:.3f}ms"
+            )
 
         logger.info(os.linesep.join(out))
         return out
@@ -390,13 +459,42 @@ class OpenclProcessing(object):
         with self.sem:
             self.events = []
 
+    # Methods about textures
+    def allocate_texture(self, shape, hostbuf=None, support_1D=False):
+        return allocate_texture(self.ctx, shape, hostbuf=hostbuf, support_1D=support_1D)
+
+    def transfer_to_texture(self, arr, tex_ref):
+        """
+        Transfer an array to a texture.
+
+        :param arr: Input array. Can be a numpy array or a pyopencl array.
+        :param tex_ref: texture reference (pyopencl._cl.Image).
+        """
+        copy_args = [self.queue, tex_ref, arr]
+        shp = arr.shape
+        ndim = arr.ndim
+        if ndim == 1:
+            # pyopencl and OpenCL < 1.2 do not support image1d_t
+            # force 2D with one row in this case
+            # ~ ndim = 2
+            shp = (1,) + shp
+        copy_kwargs = {"origin": (0,) * ndim, "region": shp[::-1]}
+        if not (isinstance(arr, numpy.ndarray)):  # assuming pyopencl.array.Array
+            # D->D copy
+            copy_args[2] = arr.data
+            copy_kwargs["offset"] = 0
+        ev = pyopencl.enqueue_copy(*copy_args, **copy_kwargs)
+        self.profile_add(ev, "Transfer to texture")
+
     @property
     def x87_volatile_option(self):
         # this is running 32 bits OpenCL woth POCL
         if self._X87_VOLATILE is None:
-            if (platform.machine() in ("i386", "i686", "x86_64", "AMD64") and
-                    (tuple.__itemsize__ == 4) and
-                    self.ctx.devices[0].platform.name == 'Portable Computing Language'):
+            if (
+                platform.machine() in ("i386", "i686", "x86_64", "AMD64")
+                and (tuple.__itemsize__ == 4)
+                and self.ctx.devices[0].platform.name == "Portable Computing Language"
+            ):
                 self._X87_VOLATILE = "-DX87_VOLATILE=volatile"
             else:
                 self._X87_VOLATILE = ""
@@ -413,6 +511,7 @@ class OpenclProcessing(object):
             option_list.append(self.x87_volatile_option)
         return " ".join(i for i in option_list if i)
 
+
 # This should be implemented by concrete class
 #     def __copy__(self):
 #         """Shallow copy of the object
diff --git a/src/silx/opencl/projection.py b/src/silx/opencl/projection.py
index c02faf6..cf4b625 100644
--- a/src/silx/opencl/projection.py
+++ b/src/silx/opencl/projection.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding: utf-8
 # /*##########################################################################
 #
 # Copyright (c) 2016-2020 European Synchrotron Radiation Facility
@@ -25,8 +24,6 @@
 # ###########################################################################*/
 """Module for tomographic projector on the GPU"""
 
-from __future__ import absolute_import, print_function, with_statement, division
-
 __authors__ = ["A. Mirone, P. Paleo"]
 __license__ = "MIT"
 __date__ = "01/08/2019"
@@ -51,14 +48,23 @@ class Projection(OpenclProcessing):
     A class for performing a tomographic projection (Radon Transform) using
     OpenCL
     """
+
     kernel_files = ["proj.cl", "array_utils.cl"]
     logger.warning("Forward Projecter is untested and unsuported for now")
 
-    def __init__(self, slice_shape, angles, axis_position=None,
-                 detector_width=None, normalize=False, ctx=None,
-                 devicetype="all", platformid=None, deviceid=None,
-                 profile=False
-                 ):
+    def __init__(
+        self,
+        slice_shape,
+        angles,
+        axis_position=None,
+        detector_width=None,
+        normalize=False,
+        ctx=None,
+        devicetype="all",
+        platformid=None,
+        deviceid=None,
+        profile=False,
+    ):
         """Constructor of the OpenCL projector.
 
         :param slice_shape: shape of the slice: (num_rows, num_columns).
@@ -87,9 +93,14 @@ class Projection(OpenclProcessing):
         # if sys.platform.startswith('darwin'): # assuming no discrete GPU
         #    raise NotImplementedError("Backprojection is not implemented on CPU for OS X yet")
 
-        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
-                                  platformid=platformid, deviceid=deviceid,
-                                  profile=profile)
+        OpenclProcessing.__init__(
+            self,
+            ctx=ctx,
+            devicetype=devicetype,
+            platformid=platformid,
+            deviceid=deviceid,
+            profile=profile,
+        )
         self.shape = slice_shape
         self.axis_pos = axis_position
         self.angles = angles
@@ -98,24 +109,27 @@ class Projection(OpenclProcessing):
 
         # Default values
         if self.axis_pos is None:
-            self.axis_pos = (self.shape[1] - 1) / 2.
+            self.axis_pos = (self.shape[1] - 1) / 2.0
         if self.dwidth is None:
             self.dwidth = self.shape[1]
-        if not(np.iterable(self.angles)):
+        if not (np.iterable(self.angles)):
             if self.angles is None:
                 self.nprojs = self.shape[0]
             else:
                 self.nprojs = self.angles
-            self.angles = np.linspace(start=0,
-                                      stop=np.pi,
-                                      num=self.nprojs,
-                                      endpoint=False).astype(dtype=np.float32)
+            self.angles = np.linspace(
+                start=0, stop=np.pi, num=self.nprojs, endpoint=False
+            ).astype(dtype=np.float32)
         else:
             self.nprojs = len(self.angles)
-        self.offset_x = -np.float32((self.shape[1] - 1) / 2. - self.axis_pos)  # TODO: custom
-        self.offset_y = -np.float32((self.shape[0] - 1) / 2. - self.axis_pos)  # TODO: custom
+        self.offset_x = -np.float32(
+            (self.shape[1] - 1) / 2.0 - self.axis_pos
+        )  # TODO: custom
+        self.offset_y = -np.float32(
+            (self.shape[0] - 1) / 2.0 - self.axis_pos
+        )  # TODO: custom
         # Reset axis_pos once offset are computed
-        self.axis_pos0 = np.float64((self.shape[1] - 1) / 2.)
+        self.axis_pos0 = np.float64((self.shape[1] - 1) / 2.0)
 
         # Workgroup, ndrange and shared size
         self.dimgrid_x = _idivup(self.dwidth, 16)
@@ -126,118 +140,122 @@ class Projection(OpenclProcessing):
         self.wg = (16, 16)
         self.ndrange = (
             int(self.dimgrid_x) * self.wg[0],  # int(): pyopencl <= 2015.1
-            int(self.dimgrid_y) * self.wg[1]  # int(): pyopencl <= 2015.1
+            int(self.dimgrid_y) * self.wg[1],  # int(): pyopencl <= 2015.1
         )
 
         self._use_textures = self.check_textures_availability()
 
         # Allocate memory
         self.buffers = [
-            BufferDescription("_d_sino", self._dimrecx * self._dimrecy, np.float32, mf.READ_WRITE),
+            BufferDescription(
+                "_d_sino", self._dimrecx * self._dimrecy, np.float32, mf.READ_WRITE
+            ),
             BufferDescription("d_angles", self._dimrecy, np.float32, mf.READ_ONLY),
             BufferDescription("d_beginPos", self._dimrecy * 2, np.int32, mf.READ_ONLY),
-            BufferDescription("d_strideJoseph", self._dimrecy * 2, np.int32, mf.READ_ONLY),
-            BufferDescription("d_strideLine", self._dimrecy * 2, np.int32, mf.READ_ONLY),
+            BufferDescription(
+                "d_strideJoseph", self._dimrecy * 2, np.int32, mf.READ_ONLY
+            ),
+            BufferDescription(
+                "d_strideLine", self._dimrecy * 2, np.int32, mf.READ_ONLY
+            ),
         ]
         d_axis_corrections = parray.empty(self.queue, self.nprojs, np.float32)
         d_axis_corrections.fill(np.float32(0.0))
-        self.add_to_cl_mem(
-            {
-                "d_axis_corrections": d_axis_corrections
-            }
+        self.add_to_cl_mem({"d_axis_corrections": d_axis_corrections})
+        self._tmp_extended_img = np.zeros(
+            (self.shape[0] + 2, self.shape[1] + 2), dtype=np.float32
         )
-        self._tmp_extended_img = np.zeros((self.shape[0] + 2, self.shape[1] + 2),
-                                          dtype=np.float32)
-        if not(self._use_textures):
+        if not (self._use_textures):
             self.allocate_slice()
         else:
             self.allocate_textures()
         self.allocate_buffers()
-        self._ex_sino = np.zeros((self._dimrecy, self._dimrecx),
-                                 dtype=np.float32)
-        if not(self._use_textures):
-            self.cl_mem["d_slice"].fill(0.)
+        self._ex_sino = np.zeros((self._dimrecy, self._dimrecx), dtype=np.float32)
+        if not (self._use_textures):
+            self.cl_mem["d_slice"].fill(0.0)
             # enqueue_fill_buffer has issues if opencl 1.2 is not present
             # ~ pyopencl.enqueue_fill_buffer(
-                # ~ self.queue,
-                # ~ self.cl_mem["d_slice"],
-                # ~ np.float32(0),
-                # ~ 0,
-                # ~ self._tmp_extended_img.size * _sizeof(np.float32)
+            # ~ self.queue,
+            # ~ self.cl_mem["d_slice"],
+            # ~ np.float32(0),
+            # ~ 0,
+            # ~ self._tmp_extended_img.size * _sizeof(np.float32)
             # ~ )
         # Precomputations
         self.compute_angles()
         self.proj_precomputations()
-        self.cl_mem["d_axis_corrections"].fill(0.)
+        self.cl_mem["d_axis_corrections"].fill(0.0)
         # enqueue_fill_buffer has issues if opencl 1.2 is not present
         # ~ pyopencl.enqueue_fill_buffer(
-                                    # ~ self.queue,
-                                    # ~ self.cl_mem["d_axis_corrections"],
-                                    # ~ np.float32(0),
-                                    # ~ 0,
-                                    # ~ self.nprojs*_sizeof(np.float32)
-                                    # ~ )
+        # ~ self.queue,
+        # ~ self.cl_mem["d_axis_corrections"],
+        # ~ np.float32(0),
+        # ~ 0,
+        # ~ self.nprojs*_sizeof(np.float32)
+        # ~ )
         # Shorthands
         self._d_sino = self.cl_mem["_d_sino"]
 
         compile_options = None
-        if not(self._use_textures):
+        if not (self._use_textures):
             compile_options = "-DDONT_USE_TEXTURES"
         OpenclProcessing.compile_kernels(
-            self,
-            self.kernel_files,
-            compile_options=compile_options
+            self, self.kernel_files, compile_options=compile_options
         )
         # check that workgroup can actually be (16, 16)
-        self.compiletime_workgroup_size = self.kernels.max_workgroup_size("forward_kernel_cpu")
+        self.compiletime_workgroup_size = self.kernels.max_workgroup_size(
+            "forward_kernel_cpu"
+        )
 
     def compute_angles(self):
         angles2 = np.zeros(self._dimrecy, dtype=np.float32)  # dimrecy != num_projs
-        angles2[:self.nprojs] = np.copy(self.angles)
-        angles2[self.nprojs:] = angles2[self.nprojs - 1]
+        angles2[: self.nprojs] = np.copy(self.angles)
+        angles2[self.nprojs :] = angles2[self.nprojs - 1]
         self.angles2 = angles2
         pyopencl.enqueue_copy(self.queue, self.cl_mem["d_angles"], angles2)
 
     def allocate_slice(self):
-        ary = parray.empty(self.queue, (self.shape[1] + 2, self.shape[1] + 2), np.float32)
+        ary = parray.empty(
+            self.queue, (self.shape[1] + 2, self.shape[1] + 2), np.float32
+        )
         ary.fill(0)
         self.add_to_cl_mem({"d_slice": ary})
 
     def allocate_textures(self):
         self.d_image_tex = pyopencl.Image(
-                self.ctx,
-                mf.READ_ONLY | mf.USE_HOST_PTR,
-                pyopencl.ImageFormat(
-                    pyopencl.channel_order.INTENSITY,
-                    pyopencl.channel_type.FLOAT
-                ), hostbuf=np.ascontiguousarray(self._tmp_extended_img.T),
-            )
+            self.ctx,
+            mf.READ_ONLY | mf.USE_HOST_PTR,
+            pyopencl.ImageFormat(
+                pyopencl.channel_order.INTENSITY, pyopencl.channel_type.FLOAT
+            ),
+            hostbuf=np.ascontiguousarray(self._tmp_extended_img.T),
+        )
 
     def transfer_to_texture(self, image):
         image2 = image
-        if not(image.flags["C_CONTIGUOUS"] and image.dtype == np.float32):
+        if not (image.flags["C_CONTIGUOUS"] and image.dtype == np.float32):
             image2 = np.ascontiguousarray(image)
-        if not(self._use_textures):
+        if not (self._use_textures):
             # TODO: create NoneEvent
             return self.transfer_to_slice(image2)
             # ~ return pyopencl.enqueue_copy(
-                        # ~ self.queue,
-                        # ~ self.cl_mem["d_slice"].data,
-                        # ~ image2,
-                        # ~ origin=(1, 1),
-                        # ~ region=image.shape[::-1]
-                        # ~ )
+            # ~ self.queue,
+            # ~ self.cl_mem["d_slice"].data,
+            # ~ image2,
+            # ~ origin=(1, 1),
+            # ~ region=image.shape[::-1]
+            # ~ )
         else:
             return pyopencl.enqueue_copy(
-                       self.queue,
-                       self.d_image_tex,
-                       image2,
-                       origin=(1, 1),
-                       region=image.shape[::-1]
-                   )
+                self.queue,
+                self.d_image_tex,
+                image2,
+                origin=(1, 1),
+                region=image.shape[::-1],
+            )
 
     def transfer_device_to_texture(self, d_image):
-        if not(self._use_textures):
+        if not (self._use_textures):
             # TODO this copy should not be necessary
             return self.cpy2d_to_slice(d_image)
         else:
@@ -247,7 +265,10 @@ class Projection(OpenclProcessing):
                 d_image,
                 offset=0,
                 origin=(1, 1),
-                region=(int(self.shape[1]), int(self.shape[0]))  # self.shape[::-1] # pyopencl <= 2015.2
+                region=(
+                    int(self.shape[1]),
+                    int(self.shape[0]),
+                ),  # self.shape[::-1] # pyopencl <= 2015.2
             )
 
     def transfer_to_slice(self, image):
@@ -326,7 +347,7 @@ class Projection(OpenclProcessing):
             np.int32(self._dimrecx),
             np.int32((0, 0)),
             np.int32((0, 0)),
-            sino_shape_ocl
+            sino_shape_ocl,
         )
         return self.kernels.cpy2d(self.queue, ndrange, wg, *kernel_args)
 
@@ -334,7 +355,10 @@ class Projection(OpenclProcessing):
         """
         copy a Nx * Ny slice to self.d_slice which is (Nx+2)*(Ny+2)
         """
-        ndrange = (int(self.shape[1]), int(self.shape[0]))  # self.shape[::-1] # pyopencl < 2015.2
+        ndrange = (
+            int(self.shape[1]),
+            int(self.shape[0]),
+        )  # self.shape[::-1] # pyopencl < 2015.2
         wg = None
         slice_shape_ocl = np.int32(ndrange)
         kernel_args = (
@@ -344,7 +368,7 @@ class Projection(OpenclProcessing):
             np.int32(self.shape[1]),
             np.int32((1, 1)),
             np.int32((0, 0)),
-            slice_shape_ocl
+            slice_shape_ocl,
         )
         return self.kernels.cpy2d(self.queue, ndrange, wg, *kernel_args)
 
@@ -367,7 +391,7 @@ class Projection(OpenclProcessing):
                     self.transfer_to_slice(image)
                     slice_ref = self.cl_mem["d_slice"].data
             else:
-                if not(self._use_textures):
+                if not (self._use_textures):
                     slice_ref = self.cl_mem["d_slice"].data
                 else:
                     slice_ref = self.d_image_tex
@@ -389,23 +413,17 @@ class Projection(OpenclProcessing):
                 self.offset_x,
                 self.offset_y,
                 np.int32(1),  # josephnoclip, 1 by default
-                np.int32(self.normalize)
+                np.int32(self.normalize),
             )
 
             # Call the kernel
-            if not(self._use_textures):
+            if not (self._use_textures):
                 event_pj = self.kernels.forward_kernel_cpu(
-                    self.queue,
-                    self.ndrange,
-                    self.wg,
-                    *kernel_args
+                    self.queue, self.ndrange, self.wg, *kernel_args
                 )
             else:
                 event_pj = self.kernels.forward_kernel(
-                    self.queue,
-                    self.ndrange,
-                    self.wg,
-                    *kernel_args
+                    self.queue, self.ndrange, self.wg, *kernel_args
                 )
             events.append(EventDescription("projection", event_pj))
             if dst is None:
@@ -413,7 +431,7 @@ class Projection(OpenclProcessing):
                 ev = pyopencl.enqueue_copy(self.queue, self._ex_sino, self._d_sino)
                 events.append(EventDescription("copy D->H result", ev))
                 ev.wait()
-                res = np.copy(self._ex_sino[:self.nprojs, :self.dwidth])
+                res = np.copy(self._ex_sino[: self.nprojs, : self.dwidth])
             else:
                 ev = self.cpy2d_to_sino(dst)
                 events.append(EventDescription("copy D->D result", ev))
diff --git a/src/silx/opencl/reconstruction.py b/src/silx/opencl/reconstruction.py
index 2c84aee..c80a0ef 100644
--- a/src/silx/opencl/reconstruction.py
+++ b/src/silx/opencl/reconstruction.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding: utf-8
 # /*##########################################################################
 #
 # Copyright (c) 2016 European Synchrotron Radiation Facility
@@ -25,8 +24,6 @@
 # ###########################################################################*/
 """Module for tomographic reconstruction algorithms"""
 
-from __future__ import absolute_import, print_function, with_statement, division
-
 __authors__ = ["P. Paleo"]
 __license__ = "MIT"
 __date__ = "01/08/2019"
@@ -42,6 +39,7 @@ from .linalg import LinAlg
 
 import pyopencl.array as parray
 from pyopencl.elementwise import ElementwiseKernel
+
 logger = logging.getLogger(__name__)
 
 cl = pyopencl
@@ -68,13 +66,26 @@ class ReconstructionAlgorithm(OpenclProcessing):
                     store profiling elements (makes code slightly slower)
     """
 
-    def __init__(self, sino_shape, slice_shape=None, axis_position=None, angles=None,
-                 ctx=None, devicetype="all", platformid=None, deviceid=None,
-                 profile=False
-                 ):
-        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
-                                  platformid=platformid, deviceid=deviceid,
-                                  profile=profile)
+    def __init__(
+        self,
+        sino_shape,
+        slice_shape=None,
+        axis_position=None,
+        angles=None,
+        ctx=None,
+        devicetype="all",
+        platformid=None,
+        deviceid=None,
+        profile=False,
+    ):
+        OpenclProcessing.__init__(
+            self,
+            ctx=ctx,
+            devicetype=devicetype,
+            platformid=platformid,
+            deviceid=deviceid,
+            profile=profile,
+        )
 
         # Create a backprojector
         self.backprojector = Backprojection(
@@ -83,7 +94,7 @@ class ReconstructionAlgorithm(OpenclProcessing):
             axis_position=axis_position,
             angles=angles,
             ctx=self.ctx,
-            profile=profile
+            profile=profile,
         )
         # Create a projector
         self.projector = Projection(
@@ -93,7 +104,7 @@ class ReconstructionAlgorithm(OpenclProcessing):
             detector_width=self.backprojector.num_bins,
             normalize=False,
             ctx=self.ctx,
-            profile=profile
+            profile=profile,
         )
         self.sino_shape = sino_shape
         self.is_cpu = self.backprojector.is_cpu
@@ -102,32 +113,34 @@ class ReconstructionAlgorithm(OpenclProcessing):
         self.d_data.fill(0.0)
         self.d_sino = parray.empty_like(self.d_data)
         self.d_sino.fill(0.0)
-        self.d_x = parray.empty(self.queue,
-                                self.backprojector.slice_shape,
-                                dtype=np.float32)
+        self.d_x = parray.empty(
+            self.queue, self.backprojector.slice_shape, dtype=np.float32
+        )
         self.d_x.fill(0.0)
         self.d_x_old = parray.empty_like(self.d_x)
         self.d_x_old.fill(0.0)
 
-        self.add_to_cl_mem({
-                            "d_data": self.d_data,
-                            "d_sino": self.d_sino,
-                            "d_x": self.d_x,
-                            "d_x_old": self.d_x_old,
-                            })
+        self.add_to_cl_mem(
+            {
+                "d_data": self.d_data,
+                "d_sino": self.d_sino,
+                "d_x": self.d_x,
+                "d_x_old": self.d_x_old,
+            }
+        )
 
     def proj(self, d_slice, d_sino):
         """
         Project d_slice to d_sino
         """
-        self.projector.transfer_device_to_texture(d_slice.data)  #.wait()
+        self.projector.transfer_device_to_texture(d_slice.data)  # .wait()
         self.projector.projection(dst=d_sino)
 
     def backproj(self, d_sino, d_slice):
         """
         Backproject d_sino to d_slice
         """
-        self.backprojector.transfer_device_to_texture(d_sino.data)  #.wait()
+        self.backprojector.transfer_device_to_texture(d_sino.data)  # .wait()
         self.backprojector.backprojection(dst=d_slice)
 
 
@@ -156,15 +169,30 @@ class SIRT(ReconstructionAlgorithm):
             implementation.
     """
 
-    def __init__(self, sino_shape, slice_shape=None, axis_position=None, angles=None,
-                 ctx=None, devicetype="all", platformid=None, deviceid=None,
-                 profile=False
-                 ):
-
-        ReconstructionAlgorithm.__init__(self, sino_shape, slice_shape=slice_shape,
-                                         axis_position=axis_position, angles=angles,
-                                         ctx=ctx, devicetype=devicetype, platformid=platformid,
-                                         deviceid=deviceid, profile=profile)
+    def __init__(
+        self,
+        sino_shape,
+        slice_shape=None,
+        axis_position=None,
+        angles=None,
+        ctx=None,
+        devicetype="all",
+        platformid=None,
+        deviceid=None,
+        profile=False,
+    ):
+        ReconstructionAlgorithm.__init__(
+            self,
+            sino_shape,
+            slice_shape=slice_shape,
+            axis_position=axis_position,
+            angles=angles,
+            ctx=ctx,
+            devicetype=devicetype,
+            platformid=platformid,
+            deviceid=deviceid,
+            profile=profile,
+        )
         self.compute_preconditioners()
 
     def compute_preconditioners(self):
@@ -181,26 +209,31 @@ class SIRT(ReconstructionAlgorithm):
 
         # r_{i,i} = 1/(sum_j a_{i,j})
         slice_ones = np.ones(self.backprojector.slice_shape, dtype=np.float32)
-        R = 1./self.projector.projection(slice_ones)  # could be all done on GPU, but I want extra checks
-        R[np.logical_not(np.isfinite(R))] = 1.  # In the case where the rotation axis is excentred
+        R = 1.0 / self.projector.projection(
+            slice_ones
+        )  # could be all done on GPU, but I want extra checks
+        R[
+            np.logical_not(np.isfinite(R))
+        ] = 1.0  # In the case where the rotation axis is excentred
         self.d_R = parray.to_device(self.queue, R)
         # c_{j,j} = 1/(sum_i a_{i,j})
         sino_ones = np.ones(self.sino_shape, dtype=np.float32)
-        C = 1./self.backprojector.backprojection(sino_ones)
-        C[np.logical_not(np.isfinite(C))] = 1.  # In the case where the rotation axis is excentred
+        C = 1.0 / self.backprojector.backprojection(sino_ones)
+        C[
+            np.logical_not(np.isfinite(C))
+        ] = 1.0  # In the case where the rotation axis is excentred
         self.d_C = parray.to_device(self.queue, C)
 
-        self.add_to_cl_mem({
-            "d_R": self.d_R,
-            "d_C": self.d_C
-        })
+        self.add_to_cl_mem({"d_R": self.d_R, "d_C": self.d_C})
 
     # TODO: compute and possibly return the residual
     def run(self, data, n_it):
         """
         Run n_it iterations of the SIRT algorithm.
         """
-        cl.enqueue_copy(self.queue, self.d_data.data, np.ascontiguousarray(data.astype(np.float32)))
+        cl.enqueue_copy(
+            self.queue, self.d_data.data, np.ascontiguousarray(data.astype(np.float32))
+        )
 
         d_x_old = self.d_x_old
         d_x = self.d_x
@@ -257,26 +290,44 @@ class TV(ReconstructionAlgorithm):
             the AMD opencl implementation.
     """
 
-    def __init__(self, sino_shape, slice_shape=None, axis_position=None, angles=None,
-                 ctx=None, devicetype="all", platformid=None, deviceid=None,
-                 profile=False
-                 ):
-        ReconstructionAlgorithm.__init__(self, sino_shape, slice_shape=slice_shape,
-                                         axis_position=axis_position, angles=angles,
-                                         ctx=ctx, devicetype=devicetype, platformid=platformid,
-                                         deviceid=deviceid, profile=profile)
+    def __init__(
+        self,
+        sino_shape,
+        slice_shape=None,
+        axis_position=None,
+        angles=None,
+        ctx=None,
+        devicetype="all",
+        platformid=None,
+        deviceid=None,
+        profile=False,
+    ):
+        ReconstructionAlgorithm.__init__(
+            self,
+            sino_shape,
+            slice_shape=slice_shape,
+            axis_position=axis_position,
+            angles=angles,
+            ctx=ctx,
+            devicetype=devicetype,
+            platformid=platformid,
+            deviceid=deviceid,
+            profile=profile,
+        )
         self.compute_preconditioners()
 
         # Create a LinAlg instance
         self.linalg = LinAlg(self.backprojector.slice_shape, ctx=self.ctx)
         # Positivity constraint
-        self.elwise_clamp = ElementwiseKernel(self.ctx, "float *a", "a[i] = max(a[i], 0.0f);")
+        self.elwise_clamp = ElementwiseKernel(
+            self.ctx, "float *a", "a[i] = max(a[i], 0.0f);"
+        )
         # Projection onto the L-infinity ball of radius Lambda
         self.elwise_proj_linf = ElementwiseKernel(
             self.ctx,
             "float2* a, float Lambda",
             "a[i].x = copysign(min(fabs(a[i].x), Lambda), a[i].x); a[i].y = copysign(min(fabs(a[i].y), Lambda), a[i].y);",
-            "elwise_proj_linf"
+            "elwise_proj_linf",
         )
         # Additional arrays
         self.linalg.gradient(self.d_x)
@@ -287,11 +338,13 @@ class TV(ReconstructionAlgorithm):
         self.d_p.fill(0)
         self.d_q.fill(0)
         self.d_tmp.fill(0)
-        self.add_to_cl_mem({
-            "d_p": self.d_p,
-            "d_q": self.d_q,
-            "d_tmp": self.d_tmp,
-        })
+        self.add_to_cl_mem(
+            {
+                "d_p": self.d_p,
+                "d_q": self.d_q,
+                "d_tmp": self.d_tmp,
+            }
+        )
 
         self.theta = 1.0
 
@@ -311,30 +364,36 @@ class TV(ReconstructionAlgorithm):
 
         # Compute the diagonal preconditioner "Sigma"
         slice_ones = np.ones(self.backprojector.slice_shape, dtype=np.float32)
-        Sigma_k = 1./self.projector.projection(slice_ones)
-        Sigma_k[np.logical_not(np.isfinite(Sigma_k))] = 1.
+        Sigma_k = 1.0 / self.projector.projection(slice_ones)
+        Sigma_k[np.logical_not(np.isfinite(Sigma_k))] = 1.0
         self.d_Sigma_k = parray.to_device(self.queue, Sigma_k)
         self.d_Sigma_kp1 = self.d_Sigma_k + 1  # TODO: memory vs computation
-        self.Sigma_grad = 1/2.0  # For discrete gradient, sum|D_i,j| = 2 along lines or cols
+        self.Sigma_grad = (
+            1 / 2.0
+        )  # For discrete gradient, sum|D_i,j| = 2 along lines or cols
 
         # Compute the diagonal preconditioner "Tau"
         sino_ones = np.ones(self.sino_shape, dtype=np.float32)
         C = self.backprojector.backprojection(sino_ones)
-        Tau = 1./(C + 2.)
+        Tau = 1.0 / (C + 2.0)
         self.d_Tau = parray.to_device(self.queue, Tau)
 
-        self.add_to_cl_mem({
-            "d_Sigma_k": self.d_Sigma_k,
-            "d_Sigma_kp1": self.d_Sigma_kp1,
-            "d_Tau": self.d_Tau
-        })
+        self.add_to_cl_mem(
+            {
+                "d_Sigma_k": self.d_Sigma_k,
+                "d_Sigma_kp1": self.d_Sigma_kp1,
+                "d_Tau": self.d_Tau,
+            }
+        )
 
     def run(self, data, n_it, Lambda, pos_constraint=False):
         """
         Run n_it iterations of the TV-regularized reconstruction,
         with the regularization parameter Lambda.
         """
-        cl.enqueue_copy(self.queue, self.d_data.data, np.ascontiguousarray(data.astype(np.float32)))
+        cl.enqueue_copy(
+            self.queue, self.d_data.data, np.ascontiguousarray(data.astype(np.float32))
+        )
 
         d_x = self.d_x
         d_x_old = self.d_x_old
@@ -351,7 +410,7 @@ class TV(ReconstructionAlgorithm):
         for k in range(0, n_it):
             # Update primal variables
             d_x_old[:] = d_x[:]
-            #~ x = x + Tau*div(p) - Tau*Kadj(q)
+            # ~ x = x + Tau*div(p) - Tau*Kadj(q)
             self.backproj(d_q, d_tmp)
             self.linalg.divergence(d_p)
             # TODO: this in less than three ops (one kernel ?)
@@ -363,20 +422,20 @@ class TV(ReconstructionAlgorithm):
                 self.elwise_clamp(d_x)
 
             # Update dual variables
-            #~ p = proj_linf(p + Sigma_grad*gradient(x + theta*(x - x_old)), Lambda)
+            # ~ p = proj_linf(p + Sigma_grad*gradient(x + theta*(x - x_old)), Lambda)
             d_tmp[:] = d_x[:]
             # FIXME: mul_add is out of place, put an equivalent thing in linalg...
-            #~ d_tmp.mul_add(1 + theta, d_x_old, -theta)
-            d_tmp *= 1+self.theta
-            d_tmp -= self.theta*d_x_old
+            # ~ d_tmp.mul_add(1 + theta, d_x_old, -theta)
+            d_tmp *= 1 + self.theta
+            d_tmp -= self.theta * d_x_old
             self.linalg.gradient(d_tmp)
             # TODO: out of place mul_add
-            #~ d_p.mul_add(1, L.cl_mem["d_gradient"], Sigma_grad)
+            # ~ d_p.mul_add(1, L.cl_mem["d_gradient"], Sigma_grad)
             self.linalg.cl_mem["d_gradient"] *= self.Sigma_grad
             d_p += self.linalg.cl_mem["d_gradient"]
             self.elwise_proj_linf(d_p, Lambda)
 
-            #~ q = (q + Sigma_k*K(x + theta*(x - x_old)) - Sigma_k*data)/(1.0 + Sigma_k)
+            # ~ q = (q + Sigma_k*K(x + theta*(x - x_old)) - Sigma_k*data)/(1.0 + Sigma_k)
             self.proj(d_tmp, d_sino)
             # TODO: this in less instructions
             d_sino -= self.d_data
diff --git a/src/silx/opencl/setup.py b/src/silx/opencl/setup.py
deleted file mode 100644
index 10fb1be..0000000
--- a/src/silx/opencl/setup.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# coding: utf-8
-#
-# Copyright (C) 2016-2017 European Synchrotron Radiation Facility
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-#
-
-from __future__ import division
-
-__contact__ = "jerome.kieffer@esrf.eu"
-__license__ = "MIT"
-__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
-__authors__ = ["J. Kieffer"]
-__date__ = "16/10/2017"
-
-import os.path
-from numpy.distutils.misc_util import Configuration
-
-
-def configuration(parent_package='', top_path=None):
-    config = Configuration('opencl', parent_package, top_path)
-    path = os.path.dirname(os.path.abspath(__file__))
-    if os.path.exists(os.path.join(path, 'sift')):
-        config.add_subpackage('sift')
-    config.add_subpackage('codec')
-    config.add_subpackage('test')
-    return config
-
-
-if __name__ == "__main__":
-    from numpy.distutils.core import setup
-    setup(configuration=configuration)
diff --git a/src/silx/opencl/sinofilter.py b/src/silx/opencl/sinofilter.py
index d608744..fc447de 100644
--- a/src/silx/opencl/sinofilter.py
+++ b/src/silx/opencl/sinofilter.py
@@ -1,8 +1,7 @@
 #!/usr/bin/env python
-# coding: utf-8
 # /*##########################################################################
 #
-# Copyright (c) 2016-2019 European Synchrotron Radiation Facility
+# Copyright (c) 2016-2023 European Synchrotron Radiation Facility
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -25,8 +24,6 @@
 # ###########################################################################*/
 """Module for sinogram filtering on CPU/GPU."""
 
-from __future__ import absolute_import, print_function, with_statement, division
-
 __authors__ = ["P. Paleo"]
 __license__ = "MIT"
 __date__ = "07/06/2019"
@@ -41,8 +38,6 @@ from .processing import OpenclProcessing
 from ..math.fft.clfft import CLFFT, __have_clfft__
 from ..math.fft.npfft import NPFFT
 from ..image.tomography import generate_powers, get_next_power, compute_fourier_filter
-from ..utils.deprecation import deprecated
-
 
 
 class SinoFilter(OpenclProcessing):
@@ -53,12 +48,21 @@ class SinoFilter(OpenclProcessing):
     - In 2D: (n_a, d_x): n_a filterings (1D FFT of size d_x)
     - In 3D: (n_z, n_a, d_x): n_z*n_a filterings (1D FFT of size d_x)
     """
+
     kernel_files = ["array_utils.cl"]
     powers = generate_powers()
 
-    def __init__(self, sino_shape, filter_name=None, ctx=None,
-                 devicetype="all", platformid=None, deviceid=None,
-                 profile=False, extra_options=None):
+    def __init__(
+        self,
+        sino_shape,
+        filter_name=None,
+        ctx=None,
+        devicetype="all",
+        platformid=None,
+        deviceid=None,
+        profile=False,
+        extra_options=None,
+    ):
         """Constructor of OpenCL FFT-Convolve.
 
         :param sino_shape: shape of the sinogram.
@@ -75,9 +79,14 @@ class SinoFilter(OpenclProcessing):
         :param dict extra_options: Advanced extra options.
             Current options are: cutoff, use_numpy_fft
         """
-        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
-                                  platformid=platformid, deviceid=deviceid,
-                                  profile=profile)
+        OpenclProcessing.__init__(
+            self,
+            ctx=ctx,
+            devicetype=devicetype,
+            platformid=platformid,
+            deviceid=deviceid,
+            profile=profile,
+        )
 
         self._init_extra_options(extra_options)
         self._calculate_shapes(sino_shape)
@@ -95,8 +104,9 @@ class SinoFilter(OpenclProcessing):
         if self.ndim == 2:
             n_angles, dwidth = sino_shape
         else:
-            raise ValueError("Invalid sinogram number of dimensions: "
-                             "expected 2 dimensions")
+            raise ValueError(
+                "Invalid sinogram number of dimensions: " "expected 2 dimensions"
+            )
         self.sino_shape = sino_shape
         self.n_angles = n_angles
         self.dwidth = dwidth
@@ -113,14 +123,14 @@ class SinoFilter(OpenclProcessing):
             Current options are: cutoff,
         """
         self.extra_options = {
-            "cutoff": 1.,
+            "cutoff": 1.0,
             "use_numpy_fft": False,
         }
         if extra_options is not None:
             self.extra_options.update(extra_options)
 
     def _init_fft(self):
-        if __have_clfft__ and not(self.extra_options["use_numpy_fft"]):
+        if __have_clfft__ and not (self.extra_options["use_numpy_fft"]):
             self.fft_backend = "opencl"
             self.fft = CLFFT(
                 self.sino_padded_shape,
@@ -130,17 +140,22 @@ class SinoFilter(OpenclProcessing):
             )
         else:
             self.fft_backend = "numpy"
-            print("The gpyfft module was not found. The Fourier transforms "
-                  "will be done on CPU. For more performances, it is advised "
-                  "to install gpyfft.""")
+            print(
+                "The gpyfft module was not found. The Fourier transforms "
+                "will be done on CPU. For more performances, it is advised "
+                "to install gpyfft."
+                ""
+            )
             self.fft = NPFFT(
                 template=np.zeros(self.sino_padded_shape, "f"),
                 axes=(-1,),
             )
 
     def _allocate_memory(self):
-        self.d_filter_f = parray.zeros(self.queue, (self.sino_f_shape[-1],), np.complex64)
-        self.is_cpu = (self.device.type == "CPU")
+        self.d_filter_f = parray.zeros(
+            self.queue, (self.sino_f_shape[-1],), np.complex64
+        )
+        self.is_cpu = self.device.type == "CPU"
         # These are already allocated by FFT() if using the opencl backend
         if self.fft_backend == "opencl":
             self.d_sino_padded = self.fft.data_in
@@ -163,7 +178,9 @@ class SinoFilter(OpenclProcessing):
             self.dwidth_padded,
             self.filter_name,
             cutoff=self.extra_options["cutoff"],
-        )[:self.dwidth_padded // 2 + 1]  # R2C
+        )[
+            : self.dwidth_padded // 2 + 1
+        ]  # R2C
         self.set_filter(filter_f, normalize=True)
 
     def set_filter(self, h_filt, normalize=True):
@@ -184,7 +201,7 @@ class SinoFilter(OpenclProcessing):
                 """
                 % (self.sino_f_shape[-1], h_filt.size)
             )
-        if not(np.iscomplexobj(h_filt)):
+        if not (np.iscomplexobj(h_filt)):
             print("Warning: expected a complex Fourier filter")
         self.filter_f = h_filt
         if normalize:
@@ -195,24 +212,27 @@ class SinoFilter(OpenclProcessing):
     def _init_kernels(self):
         OpenclProcessing.compile_kernels(self, self.kernel_files)
         h, w = self.d_sino_f.shape
-        self.mult_kern_args = (self.queue, (int(w), (int(h))), None,
-                               self.d_sino_f.data,
-                               self.d_filter_f.data,
-                               np.int32(w),
-                               np.int32(h))
+        self.mult_kern_args = (
+            self.queue,
+            (int(w), (int(h))),
+            None,
+            self.d_sino_f.data,
+            self.d_filter_f.data,
+            np.int32(w),
+            np.int32(h),
+        )
 
     def check_array(self, arr):
         if arr.dtype != np.float32:
             raise ValueError("Expected data type = numpy.float32")
         if arr.shape != self.sino_shape:
-            raise ValueError("Expected sinogram shape %s, got %s" %
-                             (self.sino_shape, arr.shape))
-        if not(isinstance(arr, np.ndarray) or isinstance(arr, parray.Array)):
-            raise ValueError("Expected either numpy.ndarray or "
-                             "pyopencl.array.Array")
-
-    def copy2d(self, dst, src, transfer_shape, dst_offset=(0, 0),
-               src_offset=(0, 0)):
+            raise ValueError(
+                "Expected sinogram shape %s, got %s" % (self.sino_shape, arr.shape)
+            )
+        if not (isinstance(arr, np.ndarray) or isinstance(arr, parray.Array)):
+            raise ValueError("Expected either numpy.ndarray or " "pyopencl.array.Array")
+
+    def copy2d(self, dst, src, transfer_shape, dst_offset=(0, 0), src_offset=(0, 0)):
         """
 
         :param dst:
@@ -222,18 +242,23 @@ class SinoFilter(OpenclProcessing):
         :param src_offset:
         """
         shape = tuple(int(i) for i in transfer_shape[::-1])
-        ev = self.kernels.cpy2d(self.queue, shape, None,
-                                dst.data,
-                                src.data,
-                                np.int32(dst.shape[1]),
-                                np.int32(src.shape[1]),
-                                np.int32(dst_offset),
-                                np.int32(src_offset),
-                                np.int32(transfer_shape[::-1]))
+        ev = self.kernels.cpy2d(
+            self.queue,
+            shape,
+            None,
+            dst.data,
+            src.data,
+            np.int32(dst.shape[1]),
+            np.int32(src.shape[1]),
+            np.int32(dst_offset),
+            np.int32(src_offset),
+            np.int32(transfer_shape[::-1]),
+        )
         ev.wait()
 
-    def copy2d_host(self, dst, src, transfer_shape, dst_offset=(0, 0),
-                    src_offset=(0, 0)):
+    def copy2d_host(
+        self, dst, src, transfer_shape, dst_offset=(0, 0), src_offset=(0, 0)
+    ):
         """
 
         :param dst:
@@ -245,7 +270,9 @@ class SinoFilter(OpenclProcessing):
         s = transfer_shape
         do = dst_offset
         so = src_offset
-        dst[do[0]:do[0] + s[0], do[1]:do[1] + s[1]] = src[so[0]:so[0] + s[0], so[1]:so[1] + s[1]]
+        dst[do[0] : do[0] + s[0], do[1] : do[1] + s[1]] = src[
+            so[0] : so[0] + s[0], so[1] : so[1] + s[1]
+        ]
 
     def _prepare_input_sino(self, sino):
         """
@@ -272,7 +299,7 @@ class SinoFilter(OpenclProcessing):
                 self.d_sino_padded.finish()  # should not be required here
         else:
             # Numpy backend: FFT/mult/IFFT are done on host.
-            if not(isinstance(sino, np.ndarray)):
+            if not (isinstance(sino, np.ndarray)):
                 # Numpy backend + pyopencl input: need to copy D->H
                 self.tmp_sino_host[:] = sino[:]
                 h_sino_ref = self.tmp_sino_host
@@ -296,9 +323,11 @@ class SinoFilter(OpenclProcessing):
                 # As pyopencl does not support rectangular copies, we first have
                 # to call a kernel doing rectangular copy D->D, then do a copy
                 # D->H.
-                self.copy2d(dst=self.tmp_sino_device,
-                            src=self.d_sino_padded,
-                            transfer_shape=self.sino_shape)
+                self.copy2d(
+                    dst=self.tmp_sino_device,
+                    src=self.d_sino_padded,
+                    transfer_shape=self.sino_shape,
+                )
                 if self.is_cpu:
                     self.tmp_sino_device.finish()  # should not be required here
                 res[:] = self.tmp_sino_device.get()[:]
@@ -309,11 +338,13 @@ class SinoFilter(OpenclProcessing):
                 if self.is_cpu:
                     res.finish()  # should not be required here
         else:
-            if not(isinstance(res, np.ndarray)):
+            if not (isinstance(res, np.ndarray)):
                 # Numpy backend + pyopencl output: rect copy H->H + copy H->D
-                self.copy2d_host(dst=self.tmp_sino_host,
-                                 src=self.d_sino_padded,
-                                 transfer_shape=self.sino_shape)
+                self.copy2d_host(
+                    dst=self.tmp_sino_host,
+                    src=self.d_sino_padded,
+                    transfer_shape=self.sino_shape,
+                )
                 res[:] = self.tmp_sino_host[:]
             else:
                 # Numpy backend + numpy output: rect copy H->H
@@ -334,9 +365,7 @@ class SinoFilter(OpenclProcessing):
     def _multiply_fourier(self):
         if self.fft_backend == "opencl":
             # Everything is on device. Call the multiplication kernel.
-            ev = self.kernels.inplace_complex_mul_2Dby1D(
-                *self.mult_kern_args
-            )
+            ev = self.kernels.inplace_complex_mul_2Dby1D(*self.mult_kern_args)
             ev.wait()
             if self.is_cpu:
                 self.d_sino_f.finish()  # should not be required here
@@ -379,57 +408,3 @@ class SinoFilter(OpenclProcessing):
         # ~ return output
 
     __call__ = filter_sino
-
-
-
-
-# -------------------
-# - Compatibility  -
-# -------------------
-
-
-def nextpow2(N):
-    p = 1
-    while p < N:
-        p *= 2
-    return p
-
-
-@deprecated(replacement="Backprojection.sino_filter", since_version="0.10")
-def fourier_filter(sino, filter_=None, fft_size=None):
-    """Simple np based implementation of fourier space filter.
-    This function is deprecated, please use silx.opencl.sinofilter.SinoFilter.
-
-    :param sino: of shape shape = (num_projs, num_bins)
-    :param filter: filter function to apply in fourier space
-    :fft_size: size on which perform the fft. May be larger than the sino array
-    :return: filtered sinogram
-    """
-    assert sino.ndim == 2
-    num_projs, num_bins = sino.shape
-    if fft_size is None:
-        fft_size = nextpow2(num_bins * 2 - 1)
-    else:
-        assert fft_size >= num_bins
-    if fft_size == num_bins:
-        sino_zeropadded = sino.astype(np.float32)
-    else:
-        sino_zeropadded = np.zeros((num_projs, fft_size),
-                                      dtype=np.complex64)
-        sino_zeropadded[:, :num_bins] = sino.astype(np.float32)
-
-    if filter_ is None:
-        h = np.zeros(fft_size, dtype=np.float32)
-        L2 = fft_size // 2 + 1
-        h[0] = 1 / 4.
-        j = np.linspace(1, L2, L2 // 2, False)
-        h[1:L2:2] = -1. / (np.pi ** 2 * j ** 2)
-        h[L2:] = np.copy(h[1:L2 - 1][::-1])
-        filter_ = np.fft.fft(h).astype(np.complex64)
-
-    # Linear convolution
-    sino_f = np.fft.fft(sino, fft_size)
-    sino_f = sino_f * filter_
-    sino_filtered = np.fft.ifft(sino_f)[:, :num_bins].real
-
-    return np.ascontiguousarray(sino_filtered.real, dtype=np.float32)
diff --git a/src/silx/opencl/sparse.py b/src/silx/opencl/sparse.py
index 514589a..9baa3a0 100644
--- a/src/silx/opencl/sparse.py
+++ b/src/silx/opencl/sparse.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding: utf-8
 # /*##########################################################################
 #
 # Copyright (c) 2019 European Synchrotron Radiation Facility
@@ -25,8 +24,6 @@
 # ###########################################################################*/
 """Module for data sparsification on CPU/GPU."""
 
-from __future__ import absolute_import, print_function, with_statement, division
-
 __authors__ = ["P. Paleo"]
 __license__ = "MIT"
 __date__ = "07/06/2019"
@@ -38,11 +35,13 @@ from pyopencl.scan import GenericScanKernel
 from pyopencl.tools import dtype_to_ctype
 from .common import pyopencl as cl
 from .processing import OpenclProcessing, EventDescription, BufferDescription
+
 mf = cl.mem_flags
 
 
 CSRData = namedtuple("CSRData", ["data", "indices", "indptr"])
 
+
 def tuple_to_csrdata(arrs):
     """
     Converts a 3-tuple to a CSRData namedtuple.
@@ -52,13 +51,23 @@ def tuple_to_csrdata(arrs):
     return CSRData(data=arrs[0], indices=arrs[1], indptr=arrs[2])
 
 
-
 class CSR(OpenclProcessing):
     kernel_files = ["sparse.cl"]
 
-    def __init__(self, shape, dtype="f", max_nnz=None, idx_dtype=numpy.int32,
-                 ctx=None, devicetype="all", platformid=None, deviceid=None,
-                 block_size=None, memory=None, profile=False):
+    def __init__(
+        self,
+        shape,
+        dtype="f",
+        max_nnz=None,
+        idx_dtype=numpy.int32,
+        ctx=None,
+        devicetype="all",
+        platformid=None,
+        deviceid=None,
+        block_size=None,
+        memory=None,
+        profile=False,
+    ):
         """
         Compute Compressed Sparse Row format of an image (2D matrix).
         It is designed to be compatible with scipy.sparse.csr_matrix.
@@ -80,10 +89,16 @@ class CSR(OpenclProcessing):
         for information on the other parameters.
         """
 
-        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
-                                  platformid=platformid, deviceid=deviceid,
-                                  block_size=block_size, memory=memory,
-                                  profile=profile)
+        OpenclProcessing.__init__(
+            self,
+            ctx=ctx,
+            devicetype=devicetype,
+            platformid=platformid,
+            deviceid=deviceid,
+            block_size=block_size,
+            memory=memory,
+            profile=profile,
+        )
         self._set_parameters(shape, dtype, max_nnz, idx_dtype)
         self._allocate_memory()
         self._setup_kernels()
@@ -96,23 +111,23 @@ class CSR(OpenclProcessing):
         self.shape = shape
         self.size = numpy.prod(shape)
         self._set_idx_dtype(idx_dtype)
-        assert len(shape) == 2 #
+        assert len(shape) == 2  #
         if max_nnz is None:
-            self.max_nnz = numpy.prod(shape) # worst case
+            self.max_nnz = numpy.prod(shape)  # worst case
         else:
             self.max_nnz = int(max_nnz)
         self._set_dtype(dtype)
 
-
     def _set_idx_dtype(self, idx_dtype):
         idx_dtype = numpy.dtype(idx_dtype)
         if idx_dtype.kind not in ["i", "u"]:
             raise ValueError("Not an integer type: %s" % idx_dtype)
         # scan value type must have size divisible by 4 bytes
         if idx_dtype.itemsize % 4 != 0:
-            raise ValueError("Due to an internal pyopencl limitation, idx_dtype type must have size divisible by 4 bytes")
-        self.indice_dtype = idx_dtype #
-
+            raise ValueError(
+                "Due to an internal pyopencl limitation, idx_dtype type must have size divisible by 4 bytes"
+            )
+        self.indice_dtype = idx_dtype  #
 
     def _set_dtype(self, dtype):
         self.dtype = numpy.dtype(dtype)
@@ -122,42 +137,44 @@ class CSR(OpenclProcessing):
             self._c_zero_str = "0.0f"
         elif self.dtype == numpy.dtype(numpy.float64):
             self._c_zero_str = "0.0"
-        else: # assuming integer
+        else:  # assuming integer
             self._c_zero_str = "0"
         self.c_dtype = dtype_to_ctype(self.dtype)
         self.idx_c_dtype = dtype_to_ctype(self.indice_dtype)
 
-
     def _allocate_memory(self):
-        self.is_cpu = (self.device.type == "CPU") # move to OpenclProcessing ?
+        self.is_cpu = self.device.type == "CPU"  # move to OpenclProcessing ?
         self.buffers = [
             BufferDescription("array", (self.size,), self.dtype, mf.READ_ONLY),
             BufferDescription("data", (self.max_nnz,), self.dtype, mf.READ_WRITE),
-            BufferDescription("indices", (self.max_nnz,), self.indice_dtype, mf.READ_WRITE),
-            BufferDescription("indptr", (self.shape[0]+1,), self.indice_dtype, mf.READ_WRITE),
+            BufferDescription(
+                "indices", (self.max_nnz,), self.indice_dtype, mf.READ_WRITE
+            ),
+            BufferDescription(
+                "indptr", (self.shape[0] + 1,), self.indice_dtype, mf.READ_WRITE
+            ),
         ]
         self.allocate_buffers(use_array=True)
         for arr_name in ["array", "data", "indices", "indptr"]:
             setattr(self, arr_name, self.cl_mem[arr_name])
-            self.cl_mem[arr_name].fill(0) # allocate_buffers() uses empty()
+            self.cl_mem[arr_name].fill(0)  # allocate_buffers() uses empty()
         self._old_array = self.array
         self._old_data = self.data
         self._old_indices = self.indices
         self._old_indptr = self.indptr
 
-
     def _setup_kernels(self):
         self._setup_compaction_kernel()
         self._setup_decompaction_kernel()
 
-
     def _setup_compaction_kernel(self):
         kernel_signature = str(
             "__global %s *data, \
             __global %s *data_compacted, \
             __global %s *indices, \
             __global %s* indptr \
-            """ % (self.c_dtype, self.c_dtype, self.idx_c_dtype, self.idx_c_dtype)
+            "
+            "" % (self.c_dtype, self.c_dtype, self.idx_c_dtype, self.idx_c_dtype)
         )
         if self.dtype.kind == "f":
             map_nonzero_expr = "(fabs(data[i]) > %s) ? 1 : 0" % self._c_zero_str
@@ -167,10 +184,12 @@ class CSR(OpenclProcessing):
             raise ValueError("Unknown data type")
 
         self.scan_kernel = GenericScanKernel(
-            self.ctx, self.indice_dtype,
+            self.ctx,
+            self.indice_dtype,
             arguments=kernel_signature,
             input_expr=map_nonzero_expr,
-            scan_expr="a+b", neutral="0",
+            scan_expr="a+b",
+            neutral="0",
             output_statement="""
                 // item is the running sum of input_expr(i), i.e the cumsum of "nonzero"
                 if (prev_item != item) {
@@ -186,7 +205,6 @@ class CSR(OpenclProcessing):
             preamble="#define GET_INDEX(i) (i % IMAGE_WIDTH)",
         )
 
-
     def _setup_decompaction_kernel(self):
         OpenclProcessing.compile_kernels(
             self,
@@ -195,18 +213,17 @@ class CSR(OpenclProcessing):
                 "-DIMAGE_WIDTH=%d" % self.shape[1],
                 "-DDTYPE=%s" % self.c_dtype,
                 "-DIDX_DTYPE=%s" % self.idx_c_dtype,
-            ]
+            ],
         )
         device = self.ctx.devices[0]
         wg_x = min(
             device.max_work_group_size,
             32,
-            self.kernels.max_workgroup_size("densify_csr")
+            self.kernels.max_workgroup_size("densify_csr"),
         )
         self._decomp_wg = (wg_x, 1)
         self._decomp_grid = (self._decomp_wg[0], self.shape[0])
 
-
     # --------------------------------------------------------------------------
     # -------------------------- Array utils -----------------------------------
     # --------------------------------------------------------------------------
@@ -222,7 +239,6 @@ class CSR(OpenclProcessing):
         assert arr.size == self.size
         assert arr.dtype == self.dtype
 
-
     # TODO handle pyopencl Buffer
     def check_sparse_arrays(self, csr_data):
         """
@@ -237,12 +253,11 @@ class CSR(OpenclProcessing):
             assert arr.ndim == 1
         assert csr_data.data.size <= self.max_nnz
         assert csr_data.indices.size <= self.max_nnz
-        assert csr_data.indptr.size == self.shape[0]+1
+        assert csr_data.indptr.size == self.shape[0] + 1
         assert csr_data.data.dtype == self.dtype
         assert csr_data.indices.dtype == self.indice_dtype
         assert csr_data.indptr.dtype == self.indice_dtype
 
-
     def set_array(self, arr):
         """
         Set the provided array as the current context 2D matrix.
@@ -262,23 +277,25 @@ class CSR(OpenclProcessing):
         else:
             raise ValueError("Expected pyopencl array or numpy array")
 
-
     def set_sparse_arrays(self, csr_data):
         if csr_data is None:
             return
         self.check_sparse_arrays(csr_data)
-        for name, arr in {"data": csr_data.data, "indices": csr_data.indices, "indptr": csr_data.indptr}.items():
+        for name, arr in {
+            "data": csr_data.data,
+            "indices": csr_data.indices,
+            "indptr": csr_data.indptr,
+        }.items():
             # The current array is a device array. Don't copy, use it directly
             if isinstance(arr, parray.Array):
                 setattr(self, "_old_" + name, getattr(self, name))
                 setattr(self, name, arr)
             # The current array is a numpy.ndarray: copy H2D
             elif isinstance(arr, numpy.ndarray):
-                getattr(self, name)[:arr.size] = arr[:]
+                getattr(self, name)[: arr.size] = arr[:]
             else:
                 raise ValueError("Unsupported array type: %s" % type(arr))
 
-
     def _recover_arrays_references(self):
         """
         Recover the previous arrays references, and return the references of the
@@ -293,7 +310,6 @@ class CSR(OpenclProcessing):
             setattr(self, name, getattr(self, "_old_" + name))
         return array, (data, indices, indptr)
 
-
     def get_sparse_arrays(self, output):
         """
         Get the 2D dense array of the current context.
@@ -314,7 +330,6 @@ class CSR(OpenclProcessing):
             res = output
         return res
 
-
     def get_array(self, output):
         if output is None:
             res = self.array.get().reshape(self.shape)
@@ -344,7 +359,7 @@ class CSR(OpenclProcessing):
             self.indices,
             self.indptr,
         )
-        #~ evt.wait()
+        # ~ evt.wait()
         self.profile_add(evt, "sparsification kernel")
         res = self.get_sparse_arrays(output)
         self._recover_arrays_references()
@@ -355,9 +370,7 @@ class CSR(OpenclProcessing):
     # --------------------------------------------------------------------------
 
     def densify(self, data, indices, indptr, output=None):
-        self.set_sparse_arrays(
-            CSRData(data=data, indices=indices, indptr=indptr)
-        )
+        self.set_sparse_arrays(CSRData(data=data, indices=indices, indptr=indptr))
         self.set_array(output)
         evt = self.kernels.densify_csr(
             self.queue,
@@ -369,9 +382,8 @@ class CSR(OpenclProcessing):
             self.array.data,
             numpy.int32(self.shape[0]),
         )
-        #~ evt.wait()
+        # ~ evt.wait()
         self.profile_add(evt, "desparsification kernel")
         res = self.get_array(output)
         self._recover_arrays_references()
         return res
-
diff --git a/src/silx/opencl/statistics.py b/src/silx/opencl/statistics.py
index a96ee33..26d23e6 100644
--- a/src/silx/opencl/statistics.py
+++ b/src/silx/opencl/statistics.py
@@ -1,9 +1,8 @@
-# -*- coding: utf-8 -*-
 #
 #    Project: SILX
 #             https://github.com/silx-kit/silx
 #
-#    Copyright (C) 2012-2019 European Synchrotron Radiation Facility, Grenoble, France
+#    Copyright (C) 2012-2023 European Synchrotron Radiation Facility, Grenoble, France
 #
 #    Principal author:       Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
 #
@@ -37,7 +36,7 @@ __contact__ = "jerome.kieffer@esrf.fr"
 
 import logging
 import numpy
-from collections import OrderedDict, namedtuple
+from collections import namedtuple
 from math import sqrt
 
 from .common import pyopencl
@@ -47,6 +46,7 @@ from .utils import concatenate_cl_kernel
 if pyopencl:
     mf = pyopencl.mem_flags
     from pyopencl.reduction import ReductionKernel
+
     try:
         from pyopencl import cltypes
     except ImportError:
@@ -59,8 +59,9 @@ else:
     raise ImportError("pyopencl is not installed")
 logger = logging.getLogger(__name__)
 
-StatResults = namedtuple("StatResults", ["min", "max", "cnt", "sum", "mean",
-                                         "var", "std"])
+StatResults = namedtuple(
+    "StatResults", ["min", "max", "cnt", "sum", "mean", "var", "std"]
+)
 zero8 = "(float8)(FLT_MAX, -FLT_MAX, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f)"
 #                    min      max    cnt  cnt_e  sum   sum_e  var  var_e
 
@@ -82,33 +83,52 @@ class Statistics(OpenclProcessing):
         Switch on profiling to be able to profile at the kernel level,
         store profiling elements (makes code slightly slower)
     """
+
     buffers = [
         BufferDescription("raw", 1, numpy.float32, mf.READ_ONLY),
         BufferDescription("converted", 1, numpy.float32, mf.READ_WRITE),
     ]
     kernel_files = ["preprocess.cl"]
-    mapping = {numpy.int8: "s8_to_float",
-               numpy.uint8: "u8_to_float",
-               numpy.int16: "s16_to_float",
-               numpy.uint16: "u16_to_float",
-               numpy.uint32: "u32_to_float",
-               numpy.int32: "s32_to_float"}
-
-    def __init__(self, size=None, dtype=None, template=None,
-                 ctx=None, devicetype="all", platformid=None, deviceid=None,
-                 block_size=None, profile=False
-                 ):
-        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
-                                  platformid=platformid, deviceid=deviceid,
-                                  block_size=block_size, profile=profile)
+    mapping = {
+        numpy.int8: "s8_to_float",
+        numpy.uint8: "u8_to_float",
+        numpy.int16: "s16_to_float",
+        numpy.uint16: "u16_to_float",
+        numpy.uint32: "u32_to_float",
+        numpy.int32: "s32_to_float",
+    }
+
+    def __init__(
+        self,
+        size=None,
+        dtype=None,
+        template=None,
+        ctx=None,
+        devicetype="all",
+        platformid=None,
+        deviceid=None,
+        block_size=None,
+        profile=False,
+    ):
+        OpenclProcessing.__init__(
+            self,
+            ctx=ctx,
+            devicetype=devicetype,
+            platformid=platformid,
+            deviceid=deviceid,
+            block_size=block_size,
+            profile=profile,
+        )
         self.size = size
         self.dtype = dtype
         if template is not None:
             self.size = template.size
             self.dtype = template.dtype
 
-        self.buffers = [BufferDescription(i.name, i.size * self.size, i.dtype, i.flags)
-                        for i in self.__class__.buffers]
+        self.buffers = [
+            BufferDescription(i.name, i.size * self.size, i.dtype, i.flags)
+            for i in self.__class__.buffers
+        ]
 
         self.allocate_buffers(use_array=True)
         self.compile_kernels()
@@ -117,43 +137,54 @@ class Statistics(OpenclProcessing):
     def set_kernel_arguments(self):
         """Parametrize all kernel arguments"""
         for val in self.mapping.values():
-            self.cl_kernel_args[val] = OrderedDict(((i, self.cl_mem[i]) for i in ("raw", "converted")))
+            self.cl_kernel_args[val] = dict(
+                ((i, self.cl_mem[i]) for i in ("raw", "converted"))
+            )
 
     def compile_kernels(self):
         """Compile the kernel"""
-        OpenclProcessing.compile_kernels(self,
-                                         self.kernel_files,
-                                         "-D NIMAGE=%i" % self.size)
+        OpenclProcessing.compile_kernels(
+            self, self.kernel_files, "-D NIMAGE=%i" % self.size
+        )
         compiler_options = self.get_compiler_options(x87_volatile=True)
         src = concatenate_cl_kernel(("doubleword.cl", "statistics.cl"))
-        self.reduction_comp = ReductionKernel(self.ctx,
-                                              dtype_out=float8,
-                                              neutral=zero8,
-                                              map_expr="map_statistics(data, i)",
-                                              reduce_expr="reduce_statistics(a,b)",
-                                              arguments="__global float *data",
-                                              preamble=src,
-                                              options=compiler_options)
-        self.reduction_simple = ReductionKernel(self.ctx,
-                                                dtype_out=float8,
-                                                neutral=zero8,
-                                                map_expr="map_statistics(data, i)",
-                                                reduce_expr="reduce_statistics_simple(a,b)",
-                                                arguments="__global float *data",
-                                                preamble=src,
-                                                options=compiler_options)
+        self.reduction_comp = ReductionKernel(
+            self.ctx,
+            dtype_out=float8,
+            neutral=zero8,
+            map_expr="map_statistics(data, i)",
+            reduce_expr="reduce_statistics(a,b)",
+            arguments="__global float *data",
+            preamble=src,
+            options=compiler_options,
+        )
+        self.reduction_simple = ReductionKernel(
+            self.ctx,
+            dtype_out=float8,
+            neutral=zero8,
+            map_expr="map_statistics(data, i)",
+            reduce_expr="reduce_statistics_simple(a,b)",
+            arguments="__global float *data",
+            preamble=src,
+            options=compiler_options,
+        )
 
         if "cl_khr_fp64" in self.device.extensions:
-            self.reduction_double = ReductionKernel(self.ctx,
-                                                    dtype_out=float8,
-                                                    neutral=zero8,
-                                                    map_expr="map_statistics(data, i)",
-                                                    reduce_expr="reduce_statistics_double(a,b)",
-                                                    arguments="__global float *data",
-                                                    preamble=src,
-                                                    options=compiler_options)
+            self.reduction_double = ReductionKernel(
+                self.ctx,
+                dtype_out=float8,
+                neutral=zero8,
+                map_expr="map_statistics(data, i)",
+                reduce_expr="reduce_statistics_double(a,b)",
+                arguments="__global float *data",
+                preamble=src,
+                options=compiler_options,
+            )
         else:
-            logger.info("Device %s does not support double-precision arithmetics, fall-back on compensated one", self.device)
+            logger.info(
+                "Device %s does not support double-precision arithmetics, fall-back on compensated one",
+                self.device,
+            )
             self.reduction_double = self.reduction_comp
 
     def send_buffer(self, data, dest):
@@ -168,23 +199,27 @@ class Statistics(OpenclProcessing):
         dest_type = numpy.dtype([i.dtype for i in self.buffers if i.name == dest][0])
         events = []
         if (data.dtype == dest_type) or (data.dtype.itemsize > dest_type.itemsize):
-            copy_image = pyopencl.enqueue_copy(self.queue,
-                                               self.cl_mem[dest].data,
-                                               numpy.ascontiguousarray(data, dest_type))
+            copy_image = pyopencl.enqueue_copy(
+                self.queue,
+                self.cl_mem[dest].data,
+                numpy.ascontiguousarray(data, dest_type),
+            )
             events.append(EventDescription("copy H->D %s" % dest, copy_image))
         else:
-            copy_image = pyopencl.enqueue_copy(self.queue,
-                                               self.cl_mem["raw"].data,
-                                               numpy.ascontiguousarray(data))
+            copy_image = pyopencl.enqueue_copy(
+                self.queue, self.cl_mem["raw"].data, numpy.ascontiguousarray(data)
+            )
             kernel = getattr(self.program, self.mapping[data.dtype.type])
-            cast_to_float = kernel(self.queue,
-                                   (self.size,),
-                                   None,
-                                   self.cl_mem["raw"].data,
-                                   self.cl_mem[dest].data)
+            cast_to_float = kernel(
+                self.queue,
+                (self.size,),
+                None,
+                self.cl_mem["raw"].data,
+                self.cl_mem[dest].data,
+            )
             events += [
                 EventDescription("copy H->D raw", copy_image),
-                EventDescription(f"cast to float {dest}", cast_to_float)
+                EventDescription(f"cast to float {dest}", cast_to_float),
             ]
         if self.profile:
             self.events += events
@@ -194,7 +229,7 @@ class Statistics(OpenclProcessing):
         """Actually calculate the statics on the data
 
         :param numpy.ndarray data: numpy array with the image
-        :param comp: use Kahan compensated arithmetics for the calculation 
+        :param comp: use Kahan compensated arithmetics for the calculation
         :return: Statistics named tuple
         :rtype: StatResults
         """
@@ -217,9 +252,11 @@ class Statistics(OpenclProcessing):
                 reduction = self.reduction_double
             else:
                 reduction = self.reduction_comp
-            res_d, evt = reduction(self.cl_mem["converted"][:self.size],
-                                   queue=self.queue,
-                                   return_event=True)
+            res_d, evt = reduction(
+                self.cl_mem["converted"][: self.size],
+                queue=self.queue,
+                return_event=True,
+            )
             events.append(EventDescription(f"statistical reduction {comp}", evt))
             if self.profile:
                 self.events += events
@@ -230,13 +267,7 @@ class Statistics(OpenclProcessing):
         sum_ = 1.0 * res_h["s4"] + res_h["s5"]
         m2 = 1.0 * res_h["s6"] + res_h["s7"]
         var = m2 / (count - 1.0)
-        res = StatResults(min_,
-                          max_,
-                          count,
-                          sum_,
-                          sum_ / count,
-                          var,
-                          sqrt(var))
+        res = StatResults(min_, max_, count, sum_, sum_ / count, var, sqrt(var))
         return res
 
     __call__ = process
diff --git a/src/silx/opencl/test/__init__.py b/src/silx/opencl/test/__init__.py
index 92cda4a..b1ecf1b 100644
--- a/src/silx/opencl/test/__init__.py
+++ b/src/silx/opencl/test/__init__.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 #
 #    Project: silx
 #             https://github.com/silx-kit/silx
diff --git a/src/silx/opencl/test/test_addition.py b/src/silx/opencl/test/test_addition.py
index 3b668bf..98beab4 100644
--- a/src/silx/opencl/test/test_addition.py
+++ b/src/silx/opencl/test/test_addition.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 #
 #    Project: Sift implementation in Python + OpenCL
 #             https://github.com/silx-kit/silx
@@ -41,16 +40,17 @@ import pytest
 
 import unittest
 from ..common import ocl, _measure_workgroup_size, query_kernel_info
+
 if ocl:
     import pyopencl
     import pyopencl.array
 from ..utils import get_opencl_code
+
 logger = logging.getLogger(__name__)
 
 
 @unittest.skipUnless(ocl, "PyOpenCl is missing")
 class TestAddition(unittest.TestCase):
-
     @classmethod
     def setUpClass(cls):
         super(TestAddition, cls).setUpClass()
@@ -59,8 +59,9 @@ class TestAddition(unittest.TestCase):
             if logger.getEffectiveLevel() <= logging.INFO:
                 cls.PROFILE = True
                 cls.queue = pyopencl.CommandQueue(
-                                cls.ctx,
-                                properties=pyopencl.command_queue_properties.PROFILING_ENABLE)
+                    cls.ctx,
+                    properties=pyopencl.command_queue_properties.PROFILING_ENABLE,
+                )
             else:
                 cls.PROFILE = False
                 cls.queue = pyopencl.CommandQueue(cls.ctx)
@@ -69,7 +70,10 @@ class TestAddition(unittest.TestCase):
     @classmethod
     def tearDownClass(cls):
         super(TestAddition, cls).tearDownClass()
-        print("Maximum valid workgroup size %s on device %s" % (cls.max_valid_wg, cls.ctx.devices[0]))
+        print(
+            "Maximum valid workgroup size %s on device %s"
+            % (cls.max_valid_wg, cls.ctx.devices[0])
+        )
         cls.ctx = None
         cls.queue = None
 
@@ -96,11 +100,20 @@ class TestAddition(unittest.TestCase):
             d_array_result = pyopencl.array.empty_like(self.d_array_img)
             wg = 1 << i
             try:
-                evt = self.program.addition(self.queue, (self.shape,), (wg,),
-                       self.d_array_img.data, self.d_array_5.data, d_array_result.data, numpy.int32(self.shape))
+                evt = self.program.addition(
+                    self.queue,
+                    (self.shape,),
+                    (wg,),
+                    self.d_array_img.data,
+                    self.d_array_5.data,
+                    d_array_result.data,
+                    numpy.int32(self.shape),
+                )
                 evt.wait()
             except Exception as error:
-                max_valid_wg = self.program.addition.get_work_group_info(pyopencl.kernel_work_group_info.WORK_GROUP_SIZE, self.ctx.devices[0])
+                max_valid_wg = self.program.addition.get_work_group_info(
+                    pyopencl.kernel_work_group_info.WORK_GROUP_SIZE, self.ctx.devices[0]
+                )
                 msg = "Error %s on WG=%s: %s" % (error, wg, max_valid_wg)
                 self.assertLess(max_valid_wg, wg, msg)
                 break
@@ -118,23 +131,39 @@ class TestAddition(unittest.TestCase):
         for platform in ocl.platforms:
             for did, device in enumerate(platform.devices):
                 meas = _measure_workgroup_size((platform.id, device.id))
-                self.assertEqual(meas, device.max_work_group_size,
-                                 "Workgroup size for %s/%s: %s == %s" % (platform, device, meas, device.max_work_group_size))
+                self.assertEqual(
+                    meas,
+                    device.max_work_group_size,
+                    "Workgroup size for %s/%s: %s == %s"
+                    % (platform, device, meas, device.max_work_group_size),
+                )
 
     def test_query(self):
         """
         tests that all devices are working properly ... lengthy and error prone
         """
-        for what in ("COMPILE_WORK_GROUP_SIZE",
-                     "LOCAL_MEM_SIZE",
-                     "PREFERRED_WORK_GROUP_SIZE_MULTIPLE",
-                     "PRIVATE_MEM_SIZE",
-                     "WORK_GROUP_SIZE"):
-            logger.info("%s: %s", what, query_kernel_info(program=self.program, kernel="addition", what=what))
-
-        # Not all ICD work properly ....    
-        #self.assertEqual(3, len(query_kernel_info(program=self.program, kernel="addition", what="COMPILE_WORK_GROUP_SIZE")), "3D kernel")
-
-        min_wg = query_kernel_info(program=self.program, kernel="addition", what="PREFERRED_WORK_GROUP_SIZE_MULTIPLE")
-        max_wg = query_kernel_info(program=self.program, kernel="addition", what="WORK_GROUP_SIZE")
+        for what in (
+            "COMPILE_WORK_GROUP_SIZE",
+            "LOCAL_MEM_SIZE",
+            "PREFERRED_WORK_GROUP_SIZE_MULTIPLE",
+            "PRIVATE_MEM_SIZE",
+            "WORK_GROUP_SIZE",
+        ):
+            logger.info(
+                "%s: %s",
+                what,
+                query_kernel_info(program=self.program, kernel="addition", what=what),
+            )
+
+        # Not all ICD work properly ....
+        # self.assertEqual(3, len(query_kernel_info(program=self.program, kernel="addition", what="COMPILE_WORK_GROUP_SIZE")), "3D kernel")
+
+        min_wg = query_kernel_info(
+            program=self.program,
+            kernel="addition",
+            what="PREFERRED_WORK_GROUP_SIZE_MULTIPLE",
+        )
+        max_wg = query_kernel_info(
+            program=self.program, kernel="addition", what="WORK_GROUP_SIZE"
+        )
         self.assertEqual(max_wg % min_wg, 0, msg="max_wg is a multiple of min_wg")
diff --git a/src/silx/opencl/test/test_array_utils.py b/src/silx/opencl/test/test_array_utils.py
index 325a6c3..98f8bf3 100644
--- a/src/silx/opencl/test/test_array_utils.py
+++ b/src/silx/opencl/test/test_array_utils.py
@@ -1,8 +1,7 @@
 #!/usr/bin/env python
-# coding: utf-8
 # /*##########################################################################
 #
-# Copyright (c) 2016 European Synchrotron Radiation Facility
+# Copyright (c) 2016-2022 European Synchrotron Radiation Facility
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -25,42 +24,32 @@
 # ###########################################################################*/
 """Test of the OpenCL array_utils"""
 
-from __future__ import division, print_function
-
 __authors__ = ["Pierre paleo"]
 __license__ = "MIT"
 __copyright__ = "2013-2017 European Synchrotron Radiation Facility, Grenoble, France"
 __date__ = "14/06/2017"
 
 
-import time
 import logging
 import numpy as np
 import unittest
+
 try:
     import mako
 except ImportError:
     mako = None
 from ..common import ocl
+
 if ocl:
     import pyopencl as cl
     import pyopencl.array as parray
-    from .. import linalg
 from ..utils import get_opencl_code
-from silx.test.utils import utilstest
 
 logger = logging.getLogger(__name__)
-try:
-    from scipy.ndimage.filters import laplace
-    _has_scipy = True
-except ImportError:
-    _has_scipy = False
-
 
 
 @unittest.skipUnless(ocl and mako, "PyOpenCl is missing")
 class TestCpy2d(unittest.TestCase):
-
     def setUp(self):
         if ocl is None:
             return
@@ -68,8 +57,8 @@ class TestCpy2d(unittest.TestCase):
         if logger.getEffectiveLevel() <= logging.INFO:
             self.PROFILE = True
             self.queue = cl.CommandQueue(
-                            self.ctx,
-                            properties=cl.command_queue_properties.PROFILING_ENABLE)
+                self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE
+            )
         else:
             self.PROFILE = False
             self.queue = cl.CommandQueue(self.ctx)
@@ -96,8 +85,12 @@ class TestCpy2d(unittest.TestCase):
         self.offset1 = (offset1_y, offset1_x)
         self.offset2 = (offset2_y, offset2_x)
         # Compute the size of the rectangle to transfer
-        size_y = np.random.randint(2, high=min(self.shape1[0], self.shape2[0]) - max(offset1_y, offset2_y) + 1)
-        size_x = np.random.randint(2, high=min(self.shape1[1], self.shape2[1]) - max(offset1_x, offset2_x) + 1)
+        size_y = np.random.randint(
+            2, high=min(self.shape1[0], self.shape2[0]) - max(offset1_y, offset2_y) + 1
+        )
+        size_x = np.random.randint(
+            2, high=min(self.shape1[1], self.shape2[1]) - max(offset1_x, offset2_x) + 1
+        )
         self.transfer_shape = (size_y, size_x)
 
     def tearDown(self):
@@ -113,7 +106,9 @@ class TestCpy2d(unittest.TestCase):
     def compare(self, result, reference):
         errmax = np.max(np.abs(result - reference))
         logger.info("Max error = %e" % (errmax))
-        self.assertTrue(errmax == 0, str("Max error is too high"))#. PRNG state was %s" % str(self.prng_state)))
+        self.assertTrue(
+            errmax == 0, str("Max error is too high")
+        )  # . PRNG state was %s" % str(self.prng_state)))
 
     @unittest.skipUnless(ocl and mako, "pyopencl is missing")
     def test_cpy2d(self):
@@ -124,18 +119,26 @@ class TestCpy2d(unittest.TestCase):
         o1 = self.offset1
         o2 = self.offset2
         T = self.transfer_shape
-        logger.info("""Testing D->D rectangular copy with (N1_y, N1_x) = %s,
+        logger.info(
+            """Testing D->D rectangular copy with (N1_y, N1_x) = %s,
                     (N2_y, N2_x) = %s:
-                    array2[%d:%d, %d:%d] = array1[%d:%d, %d:%d]""" %
-                        (
-                            str(self.shape1), str(self.shape2),
-                            o2[0], o2[0] + T[0],
-                            o2[1], o2[1] + T[1],
-                            o1[0], o1[0] + T[0],
-                            o1[1], o1[1] + T[1]
-                        )
-                    )
-        self.array2[o2[0]:o2[0] + T[0], o2[1]:o2[1] + T[1]] = self.array1[o1[0]:o1[0] + T[0], o1[1]:o1[1] + T[1]]
+                    array2[%d:%d, %d:%d] = array1[%d:%d, %d:%d]"""
+            % (
+                str(self.shape1),
+                str(self.shape2),
+                o2[0],
+                o2[0] + T[0],
+                o2[1],
+                o2[1] + T[1],
+                o1[0],
+                o1[0] + T[0],
+                o1[1],
+                o1[1] + T[1],
+            )
+        )
+        self.array2[o2[0] : o2[0] + T[0], o2[1] : o2[1] + T[1]] = self.array1[
+            o1[0] : o1[0] + T[0], o1[1] : o1[1] + T[1]
+        ]
         kernel_args = (
             self.d_array2.data,
             self.d_array1.data,
@@ -143,7 +146,7 @@ class TestCpy2d(unittest.TestCase):
             np.int32(self.shape1[1]),
             np.int32(self.offset2[::-1]),
             np.int32(self.offset1[::-1]),
-            np.int32(self.transfer_shape[::-1])
+            np.int32(self.transfer_shape[::-1]),
         )
         wg = None
         ndrange = self.transfer_shape[::-1]
diff --git a/src/silx/opencl/test/test_backprojection.py b/src/silx/opencl/test/test_backprojection.py
index 96d56fa..b08c972 100644
--- a/src/silx/opencl/test/test_backprojection.py
+++ b/src/silx/opencl/test/test_backprojection.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding: utf-8
 # /*##########################################################################
 #
 # Copyright (c) 2016 European Synchrotron Radiation Facility
@@ -25,8 +24,6 @@
 # ###########################################################################*/
 """Test of the filtered backprojection module"""
 
-from __future__ import division, print_function
-
 __authors__ = ["Pierre paleo"]
 __license__ = "MIT"
 __copyright__ = "2013-2017 European Synchrotron Radiation Facility, Grenoble, France"
@@ -38,11 +35,13 @@ import logging
 import numpy as np
 import unittest
 from math import pi
+
 try:
     import mako
 except ImportError:
     mako = None
 from ..common import ocl
+
 if ocl:
     from .. import backprojection
     from ...image.tomography import compute_fourier_filter
@@ -59,7 +58,7 @@ def generate_coords(img_shp, center=None):
     l_r, l_c = float(img_shp[0]), float(img_shp[1])
     R, C = np.mgrid[:l_r, :l_c]
     if center is None:
-        center0, center1 = l_r / 2., l_c / 2.
+        center0, center1 = l_r / 2.0, l_c / 2.0
     else:
         center0, center1 = center
     R = R + 0.5 - center0
@@ -75,7 +74,7 @@ def clip_circle(img, center=None, radius=None):
     M = R * R + C * C
     res = np.zeros_like(img)
     if radius is None:
-        radius = img.shape[0] / 2. - 1
+        radius = img.shape[0] / 2.0 - 1
     mask = M < radius * radius
     res[mask] = img[mask]
     return res
@@ -83,20 +82,21 @@ def clip_circle(img, center=None, radius=None):
 
 @unittest.skipUnless(ocl and mako, "PyOpenCl is missing")
 class TestFBP(unittest.TestCase):
-
     def setUp(self):
         if ocl is None:
             return
         self.getfiles()
         self.fbp = backprojection.Backprojection(self.sino.shape, profile=True)
         if self.fbp.compiletime_workgroup_size < 16 * 16:
-            self.skipTest("Current implementation of OpenCL backprojection is "
-                          "not supported on this platform yet")
+            self.skipTest(
+                "Current implementation of OpenCL backprojection is "
+                "not supported on this platform yet"
+            )
         # Astra does not use the same backprojector implementation.
         # Therefore, we cannot expect results to be the "same" (up to float32
         # numerical error)
         self.tol = 5e-2
-        if not(self.fbp._use_textures) or self.fbp.device.type == "CPU":
+        if not (self.fbp._use_textures) or self.fbp.device.type == "CPU":
             # Precision is less when using CPU
             # (either CPU textures or "manual" linear interpolation)
             self.tol *= 2
@@ -133,8 +133,12 @@ class TestFBP(unittest.TestCase):
         ref_clipped = clip_circle(self.reference_rec)
         delta = abs(res_clipped - ref_clipped)
         bad = delta > 1
-        logger.debug("Absolute difference: %s with %s outlier pixels out of %s"
-                     "", delta.max(), bad.sum(), np.prod(bad.shape))
+        logger.debug(
+            "Absolute difference: %s with %s outlier pixels out of %s" "",
+            delta.max(),
+            bad.sum(),
+            np.prod(bad.shape),
+        )
         return delta.max()
 
     @unittest.skipUnless(ocl and mako, "pyopencl is missing")
@@ -160,17 +164,14 @@ class TestFBP(unittest.TestCase):
         for i in range(10):
             res = self.fbp.filtered_backprojection(self.sino)
             errmax = np.max(np.abs(res - res0))
-            self.assertTrue(errmax < 1.e-6, "Max error is too high")
+            self.assertTrue(errmax < 1.0e-6, "Max error is too high")
 
     @unittest.skipUnless(ocl and mako, "pyopencl is missing")
     def test_fbp_filters(self):
         """
         Test the different available filters of silx FBP.
         """
-        avail_filters = [
-            "ramlak", "shepp-logan", "cosine", "hamming",
-            "hann"
-        ]
+        avail_filters = ["ramlak", "shepp-logan", "cosine", "hamming", "hann"]
         # Create a Dirac delta function at a single angle view.
         # As the filters are radially invarant:
         #   - backprojection yields an image where each line is a Dirac.
@@ -179,7 +180,7 @@ class TestFBP(unittest.TestCase):
         # test will also ensure that backprojection behaves well.
         dirac = np.zeros_like(self.sino)
         na, dw = dirac.shape
-        dirac[0, dw//2] = na / pi * 2
+        dirac[0, dw // 2] = na / pi * 2
 
         for filter_name in avail_filters:
             B = backprojection.Backprojection(dirac.shape, filter_name=filter_name)
@@ -187,17 +188,15 @@ class TestFBP(unittest.TestCase):
             # Check that radial invariance is kept
             std0 = np.max(np.abs(np.std(r, axis=0)))
             self.assertTrue(
-                std0 < 5.e-6,
-                "Something wrong with FBP(filter=%s)" % filter_name
+                std0 < 5.0e-6, "Something wrong with FBP(filter=%s)" % filter_name
             )
             # Check that the filter is retrieved
-            r_f = np.fft.fft(np.fft.fftshift(r[0])).real / 2.  # filter factor
+            r_f = np.fft.fft(np.fft.fftshift(r[0])).real / 2.0  # filter factor
             ref_filter_f = compute_fourier_filter(dw, filter_name)
             errmax = np.max(np.abs(r_f - ref_filter_f))
             logger.info("FBP filter %s: max error=%e" % (filter_name, errmax))
             self.assertTrue(
-                errmax < 1.e-3,
-                "Something wrong with FBP(filter=%s)" % filter_name
+                errmax < 1.0e-3, "Something wrong with FBP(filter=%s)" % filter_name
             )
 
     @unittest.skipUnless(ocl and mako, "pyopencl is missing")
@@ -205,13 +204,14 @@ class TestFBP(unittest.TestCase):
         # Generate a 513-sinogram.
         # The padded width will be nextpow(513*2).
         # silx [0.10, 0.10.1] will give 1029, which makes R2C transform fail.
-        sino = np.pad(self.sino, ((0, 0), (1, 0)), mode='edge')
-        B = backprojection.Backprojection(sino.shape, axis_position=self.fbp.axis_pos+1)
+        sino = np.pad(self.sino, ((0, 0), (1, 0)), mode="edge")
+        B = backprojection.Backprojection(
+            sino.shape, axis_position=self.fbp.axis_pos + 1
+        )
         res = B(sino)
         # Compare with self.reference_rec. Tolerance is high as backprojector
         # is not fully shift-invariant.
         errmax = np.max(np.abs(clip_circle(res[1:, 1:] - self.reference_rec)))
         self.assertLess(
-            errmax, 1.e-1,
-            "Something wrong with FBP on odd-sized sinogram"
+            errmax, 1.0e-1, "Something wrong with FBP on odd-sized sinogram"
         )
diff --git a/src/silx/opencl/test/test_convolution.py b/src/silx/opencl/test/test_convolution.py
index 6a2759d..86716f4 100644
--- a/src/silx/opencl/test/test_convolution.py
+++ b/src/silx/opencl/test/test_convolution.py
@@ -1,8 +1,7 @@
 #!/usr/bin/env python
-# coding: utf-8
 # /*##########################################################################
 #
-# Copyright (c) 2019 European Synchrotron Radiation Facility
+# Copyright (c) 2019-2022 European Synchrotron Radiation Facility
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -28,8 +27,6 @@
 Test of the Convolution class.
 """
 
-from __future__ import division, print_function
-
 __authors__ = ["Pierre Paleo"]
 __contact__ = "pierre.paleo@esrf.fr"
 __license__ = "MIT"
@@ -44,7 +41,11 @@ from silx.image.utils import gaussian_kernel
 
 try:
     from scipy.ndimage import convolve, convolve1d
-    from scipy.misc import ascent
+
+    try:
+        from scipy.misc import ascent
+    except:
+        from scipy.datasets import ascent
 
     scipy_convolve = convolve
     scipy_convolve1d = convolve1d
@@ -61,7 +62,6 @@ logger = logging.getLogger(__name__)
 
 
 class ConvolutionData:
-
     def __init__(self, param):
         self.param = param
         self.mode = param["boundary_handling"]
@@ -207,7 +207,7 @@ def convolution_data_params():
     )
     params = []
     for boundary_handling, use_texture, input_dev, output_dev in param_vals:
-        param={
+        param = {
             "boundary_handling": boundary_handling,
             "input_on_device": input_dev,
             "output_on_device": output_dev,
@@ -239,26 +239,31 @@ def convolution_data(request):
 def test_1D(convolution_data):
     convolution_data.template_test("test_1D")
 
+
 @pytest.mark.skipif(ocl is None, reason="OpenCL is missing")
 @pytest.mark.skipif(scipy_convolve is None, reason="scipy is missing")
 def test_separable_2D(convolution_data):
     convolution_data.template_test("test_separable_2D")
 
+
 @pytest.mark.skipif(ocl is None, reason="OpenCL is missing")
 @pytest.mark.skipif(scipy_convolve is None, reason="scipy is missing")
 def test_separable_3D(convolution_data):
     convolution_data.template_test("test_separable_3D")
 
+
 @pytest.mark.skipif(ocl is None, reason="OpenCL is missing")
 @pytest.mark.skipif(scipy_convolve is None, reason="scipy is missing")
 def test_nonseparable_2D(convolution_data):
     convolution_data.template_test("test_nonseparable_2D")
 
+
 @pytest.mark.skipif(ocl is None, reason="OpenCL is missing")
 @pytest.mark.skipif(scipy_convolve is None, reason="scipy is missing")
 def test_nonseparable_3D(convolution_data):
     convolution_data.template_test("test_nonseparable_3D")
 
+
 @pytest.mark.skipif(ocl is None, reason="OpenCL is missing")
 @pytest.mark.skipif(scipy_convolve is None, reason="scipy is missing")
 def test_batched_2D(convolution_data):
diff --git a/src/silx/opencl/test/test_doubleword.py b/src/silx/opencl/test/test_doubleword.py
index a33cf5a..493d8c8 100644
--- a/src/silx/opencl/test/test_doubleword.py
+++ b/src/silx/opencl/test/test_doubleword.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding: utf-8
 #
 #    Project: The silx project
 #             https://github.com/silx-kit/silx
@@ -47,6 +46,7 @@ except ImportError as error:
     pyopencl = None
 
 from .. import ocl
+
 if ocl is not None:
     from ..utils import read_cl_file
     from .. import pyopencl
@@ -66,14 +66,21 @@ class TestDoubleWord(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         if pyopencl is None or ocl is None:
-            raise unittest.SkipTest("OpenCL module (pyopencl) is not present or no device available")
+            raise unittest.SkipTest(
+                "OpenCL module (pyopencl) is not present or no device available"
+            )
 
         cls.ctx = ocl.create_context(devicetype="GPU")
-        cls.queue = pyopencl.CommandQueue(cls.ctx, properties=pyopencl.command_queue_properties.PROFILING_ENABLE)
+        cls.queue = pyopencl.CommandQueue(
+            cls.ctx, properties=pyopencl.command_queue_properties.PROFILING_ENABLE
+        )
 
         # this is running 32 bits OpenCL woth POCL
-        if (platform.machine() in ("i386", "i686", "x86_64") and (tuple.__itemsize__ == 4) and
-                cls.ctx.devices[0].platform.name == 'Portable Computing Language'):
+        if (
+            platform.machine() in ("i386", "i686", "x86_64")
+            and (tuple.__itemsize__ == 4)
+            and cls.ctx.devices[0].platform.name == "Portable Computing Language"
+        ):
             cls.args = "-DX87_VOLATILE=volatile"
         else:
             cls.args = ""
@@ -95,38 +102,68 @@ class TestDoubleWord(unittest.TestCase):
         cls.doubleword = None
 
     def test_fast_sum2(self):
-        test_kernel = ElementwiseKernel(self.ctx,
-                      "float *a, float *b, float *res_h, float *res_l",
-                      "float2 tmp = fast_fp_plus_fp(a[i], b[i]); res_h[i] = tmp.s0; res_l[i] = tmp.s1",
-                      preamble=self.doubleword)
+        test_kernel = ElementwiseKernel(
+            self.ctx,
+            "float *a, float *b, float *res_h, float *res_l",
+            "float2 tmp = fast_fp_plus_fp(a[i], b[i]); res_h[i] = tmp.s0; res_l[i] = tmp.s1",
+            preamble=self.doubleword,
+        )
         a_g = pyopencl.array.to_device(self.queue, self.ah)
         b_g = pyopencl.array.to_device(self.queue, self.bl)
         res_l = pyopencl.array.empty_like(a_g)
         res_h = pyopencl.array.empty_like(a_g)
         test_kernel(a_g, b_g, res_h, res_l)
         self.assertEqual(abs(self.ah + self.bl - res_h.get()).max(), 0, "Major matches")
-        self.assertGreater(abs(self.ah.astype(numpy.float64) + self.bl - res_h.get()).max(), 0, "Exact mismatches")
-        self.assertEqual(abs(self.ah.astype(numpy.float64) + self.bl - (res_h.get().astype(numpy.float64) + res_l.get())).max(), 0, "Exact matches")
+        self.assertGreater(
+            abs(self.ah.astype(numpy.float64) + self.bl - res_h.get()).max(),
+            0,
+            "Exact mismatches",
+        )
+        self.assertEqual(
+            abs(
+                self.ah.astype(numpy.float64)
+                + self.bl
+                - (res_h.get().astype(numpy.float64) + res_l.get())
+            ).max(),
+            0,
+            "Exact matches",
+        )
 
     def test_sum2(self):
-        test_kernel = ElementwiseKernel(self.ctx,
-                    "float *a, float *b, float *res_h, float *res_l",
-                    "float2 tmp = fp_plus_fp(a[i],b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
-                    preamble=self.doubleword)
+        test_kernel = ElementwiseKernel(
+            self.ctx,
+            "float *a, float *b, float *res_h, float *res_l",
+            "float2 tmp = fp_plus_fp(a[i],b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
+            preamble=self.doubleword,
+        )
         a_g = pyopencl.array.to_device(self.queue, self.ah)
         b_g = pyopencl.array.to_device(self.queue, self.bh)
         res_l = pyopencl.array.empty_like(a_g)
         res_h = pyopencl.array.empty_like(a_g)
         test_kernel(a_g, b_g, res_h, res_l)
         self.assertEqual(abs(self.ah + self.bh - res_h.get()).max(), 0, "Major matches")
-        self.assertGreater(abs(self.ah.astype(numpy.float64) + self.bh - res_h.get()).max(), 0, "Exact mismatches")
-        self.assertEqual(abs(self.ah.astype(numpy.float64) + self.bh - (res_h.get().astype(numpy.float64) + res_l.get())).max(), 0, "Exact matches")
+        self.assertGreater(
+            abs(self.ah.astype(numpy.float64) + self.bh - res_h.get()).max(),
+            0,
+            "Exact mismatches",
+        )
+        self.assertEqual(
+            abs(
+                self.ah.astype(numpy.float64)
+                + self.bh
+                - (res_h.get().astype(numpy.float64) + res_l.get())
+            ).max(),
+            0,
+            "Exact matches",
+        )
 
     def test_prod2(self):
-        test_kernel = ElementwiseKernel(self.ctx,
-                    "float *a, float *b, float *res_h, float *res_l",
-                    "float2 tmp = fp_times_fp(a[i],b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
-                    preamble=self.doubleword)
+        test_kernel = ElementwiseKernel(
+            self.ctx,
+            "float *a, float *b, float *res_h, float *res_l",
+            "float2 tmp = fp_times_fp(a[i],b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
+            preamble=self.doubleword,
+        )
         a_g = pyopencl.array.to_device(self.queue, self.ah)
         b_g = pyopencl.array.to_device(self.queue, self.bh)
         res_l = pyopencl.array.empty_like(a_g)
@@ -135,14 +172,22 @@ class TestDoubleWord(unittest.TestCase):
         res_m = res_h.get()
         res = res_h.get().astype(numpy.float64) + res_l.get()
         self.assertEqual(abs(self.ah * self.bh - res_m).max(), 0, "Major matches")
-        self.assertGreater(abs(self.ah.astype(numpy.float64) * self.bh - res_m).max(), 0, "Exact mismatches")
-        self.assertEqual(abs(self.ah.astype(numpy.float64) * self.bh - res).max(), 0, "Exact matches")
+        self.assertGreater(
+            abs(self.ah.astype(numpy.float64) * self.bh - res_m).max(),
+            0,
+            "Exact mismatches",
+        )
+        self.assertEqual(
+            abs(self.ah.astype(numpy.float64) * self.bh - res).max(), 0, "Exact matches"
+        )
 
     def test_dw_plus_fp(self):
-        test_kernel = ElementwiseKernel(self.ctx,
-                    "float *ah, float *al, float *b, float *res_h, float *res_l",
-                    "float2 tmp = dw_plus_fp((float2)(ah[i], al[i]),b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
-                    preamble=self.doubleword)
+        test_kernel = ElementwiseKernel(
+            self.ctx,
+            "float *ah, float *al, float *b, float *res_h, float *res_l",
+            "float2 tmp = dw_plus_fp((float2)(ah[i], al[i]),b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
+            preamble=self.doubleword,
+        )
         ah_g = pyopencl.array.to_device(self.queue, self.ah)
         al_g = pyopencl.array.to_device(self.queue, self.al)
         b_g = pyopencl.array.to_device(self.queue, self.bh)
@@ -152,14 +197,22 @@ class TestDoubleWord(unittest.TestCase):
         res_m = res_h.get()
         res = res_h.get().astype(numpy.float64) + res_l.get()
         self.assertLess(abs(self.a + self.bh - res_m).max(), EPS32, "Major matches")
-        self.assertGreater(abs(self.a + self.bh - res_m).max(), EPS64, "Exact mismatches")
-        self.assertLess(abs(self.ah.astype(numpy.float64) + self.al + self.bh - res).max(), 2 * EPS32 ** 2, "Exact matches")
+        self.assertGreater(
+            abs(self.a + self.bh - res_m).max(), EPS64, "Exact mismatches"
+        )
+        self.assertLess(
+            abs(self.ah.astype(numpy.float64) + self.al + self.bh - res).max(),
+            2 * EPS32**2,
+            "Exact matches",
+        )
 
     def test_dw_plus_dw(self):
-        test_kernel = ElementwiseKernel(self.ctx,
-                    "float *ah, float *al, float *bh, float *bl, float *res_h, float *res_l",
-                    "float2 tmp = dw_plus_dw((float2)(ah[i], al[i]),(float2)(bh[i], bl[i])); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
-                    preamble=self.doubleword)
+        test_kernel = ElementwiseKernel(
+            self.ctx,
+            "float *ah, float *al, float *bh, float *bl, float *res_h, float *res_l",
+            "float2 tmp = dw_plus_dw((float2)(ah[i], al[i]),(float2)(bh[i], bl[i])); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
+            preamble=self.doubleword,
+        )
         ah_g = pyopencl.array.to_device(self.queue, self.ah)
         al_g = pyopencl.array.to_device(self.queue, self.al)
         bh_g = pyopencl.array.to_device(self.queue, self.bh)
@@ -170,14 +223,20 @@ class TestDoubleWord(unittest.TestCase):
         res_m = res_h.get()
         res = res_h.get().astype(numpy.float64) + res_l.get()
         self.assertLess(abs(self.a + self.b - res_m).max(), EPS32, "Major matches")
-        self.assertGreater(abs(self.a + self.b - res_m).max(), EPS64, "Exact mismatches")
-        self.assertLess(abs(self.a + self.b - res).max(), 3 * EPS32 ** 2, "Exact matches")
+        self.assertGreater(
+            abs(self.a + self.b - res_m).max(), EPS64, "Exact mismatches"
+        )
+        self.assertLess(
+            abs(self.a + self.b - res).max(), 3 * EPS32**2, "Exact matches"
+        )
 
     def test_dw_times_fp(self):
-        test_kernel = ElementwiseKernel(self.ctx,
-                    "float *ah, float *al, float *b, float *res_h, float *res_l",
-                    "float2 tmp = dw_times_fp((float2)(ah[i], al[i]),b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
-                    preamble=self.doubleword)
+        test_kernel = ElementwiseKernel(
+            self.ctx,
+            "float *ah, float *al, float *b, float *res_h, float *res_l",
+            "float2 tmp = dw_times_fp((float2)(ah[i], al[i]),b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
+            preamble=self.doubleword,
+        )
         ah_g = pyopencl.array.to_device(self.queue, self.ah)
         al_g = pyopencl.array.to_device(self.queue, self.al)
         b_g = pyopencl.array.to_device(self.queue, self.bh)
@@ -187,14 +246,20 @@ class TestDoubleWord(unittest.TestCase):
         res_m = res_h.get()
         res = res_h.get().astype(numpy.float64) + res_l.get()
         self.assertLess(abs(self.a * self.bh - res_m).max(), EPS32, "Major matches")
-        self.assertGreater(abs(self.a * self.bh - res_m).max(), EPS64, "Exact mismatches")
-        self.assertLess(abs(self.a * self.bh - res).max(), 2 * EPS32 ** 2, "Exact matches")
+        self.assertGreater(
+            abs(self.a * self.bh - res_m).max(), EPS64, "Exact mismatches"
+        )
+        self.assertLess(
+            abs(self.a * self.bh - res).max(), 2 * EPS32**2, "Exact matches"
+        )
 
     def test_dw_times_dw(self):
-        test_kernel = ElementwiseKernel(self.ctx,
-                    "float *ah, float *al, float *bh, float *bl, float *res_h, float *res_l",
-                    "float2 tmp = dw_times_dw((float2)(ah[i], al[i]),(float2)(bh[i], bl[i])); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
-                    preamble=self.doubleword)
+        test_kernel = ElementwiseKernel(
+            self.ctx,
+            "float *ah, float *al, float *bh, float *bl, float *res_h, float *res_l",
+            "float2 tmp = dw_times_dw((float2)(ah[i], al[i]),(float2)(bh[i], bl[i])); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
+            preamble=self.doubleword,
+        )
         ah_g = pyopencl.array.to_device(self.queue, self.ah)
         al_g = pyopencl.array.to_device(self.queue, self.al)
         bh_g = pyopencl.array.to_device(self.queue, self.bh)
@@ -205,14 +270,20 @@ class TestDoubleWord(unittest.TestCase):
         res_m = res_h.get()
         res = res_h.get().astype(numpy.float64) + res_l.get()
         self.assertLess(abs(self.a * self.b - res_m).max(), EPS32, "Major matches")
-        self.assertGreater(abs(self.a * self.b - res_m).max(), EPS64, "Exact mismatches")
-        self.assertLess(abs(self.a * self.b - res).max(), 5 * EPS32 ** 2, "Exact matches")
+        self.assertGreater(
+            abs(self.a * self.b - res_m).max(), EPS64, "Exact mismatches"
+        )
+        self.assertLess(
+            abs(self.a * self.b - res).max(), 5 * EPS32**2, "Exact matches"
+        )
 
     def test_dw_div_fp(self):
-        test_kernel = ElementwiseKernel(self.ctx,
-                    "float *ah, float *al, float *b, float *res_h, float *res_l",
-                    "float2 tmp = dw_div_fp((float2)(ah[i], al[i]),b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
-                    preamble=self.doubleword)
+        test_kernel = ElementwiseKernel(
+            self.ctx,
+            "float *ah, float *al, float *b, float *res_h, float *res_l",
+            "float2 tmp = dw_div_fp((float2)(ah[i], al[i]),b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
+            preamble=self.doubleword,
+        )
         ah_g = pyopencl.array.to_device(self.queue, self.ah)
         al_g = pyopencl.array.to_device(self.queue, self.al)
         b_g = pyopencl.array.to_device(self.queue, self.bh)
@@ -222,14 +293,20 @@ class TestDoubleWord(unittest.TestCase):
         res_m = res_h.get()
         res = res_h.get().astype(numpy.float64) + res_l.get()
         self.assertLess(abs(self.a / self.bh - res_m).max(), EPS32, "Major matches")
-        self.assertGreater(abs(self.a / self.bh - res_m).max(), EPS64, "Exact mismatches")
-        self.assertLess(abs(self.a / self.bh - res).max(), 3 * EPS32 ** 2, "Exact matches")
+        self.assertGreater(
+            abs(self.a / self.bh - res_m).max(), EPS64, "Exact mismatches"
+        )
+        self.assertLess(
+            abs(self.a / self.bh - res).max(), 3 * EPS32**2, "Exact matches"
+        )
 
     def test_dw_div_dw(self):
-        test_kernel = ElementwiseKernel(self.ctx,
-                    "float *ah, float *al, float *bh, float *bl, float *res_h, float *res_l",
-                    "float2 tmp = dw_div_dw((float2)(ah[i], al[i]),(float2)(bh[i], bl[i])); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
-                    preamble=self.doubleword)
+        test_kernel = ElementwiseKernel(
+            self.ctx,
+            "float *ah, float *al, float *bh, float *bl, float *res_h, float *res_l",
+            "float2 tmp = dw_div_dw((float2)(ah[i], al[i]),(float2)(bh[i], bl[i])); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
+            preamble=self.doubleword,
+        )
         ah_g = pyopencl.array.to_device(self.queue, self.ah)
         al_g = pyopencl.array.to_device(self.queue, self.al)
         bh_g = pyopencl.array.to_device(self.queue, self.bh)
@@ -240,5 +317,9 @@ class TestDoubleWord(unittest.TestCase):
         res_m = res_h.get()
         res = res_h.get().astype(numpy.float64) + res_l.get()
         self.assertLess(abs(self.a / self.b - res_m).max(), EPS32, "Major matches")
-        self.assertGreater(abs(self.a / self.b - res_m).max(), EPS64, "Exact mismatches")
-        self.assertLess(abs(self.a / self.b - res).max(), 6 * EPS32 ** 2, "Exact matches")
+        self.assertGreater(
+            abs(self.a / self.b - res_m).max(), EPS64, "Exact mismatches"
+        )
+        self.assertLess(
+            abs(self.a / self.b - res).max(), 6 * EPS32**2, "Exact matches"
+        )
diff --git a/src/silx/opencl/test/test_image.py b/src/silx/opencl/test/test_image.py
index 73c771b..691ea82 100644
--- a/src/silx/opencl/test/test_image.py
+++ b/src/silx/opencl/test/test_image.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 #
 #    Project: image manipulation in  OpenCL
 #             https://github.com/silx-kit/silx
@@ -29,8 +28,6 @@
 Simple test of image manipulation
 """
 
-from __future__ import division, print_function
-
 __authors__ = ["Jérôme Kieffer"]
 __contact__ = "jerome.kieffer@esrf.eu"
 __license__ = "MIT"
@@ -42,11 +39,13 @@ import numpy
 
 import unittest
 from ..common import ocl, _measure_workgroup_size
+
 if ocl:
     import pyopencl
     import pyopencl.array
 from ...test.utils import utilstest
 from ..image import ImageProcessing
+
 logger = logging.getLogger(__name__)
 try:
     from PIL import Image
@@ -56,7 +55,6 @@ except ImportError:
 
 @unittest.skipUnless(ocl and Image, "PyOpenCl/Image is missing")
 class TestImage(unittest.TestCase):
-
     @classmethod
     def setUpClass(cls):
         super(TestImage, cls).setUpClass()
@@ -102,7 +100,9 @@ class TestImage(unittest.TestCase):
         tmp = pyopencl.array.empty(self.ip.ctx, self.data.shape, "float32")
         res = self.ip.to_float(self.data, out=tmp)
         res2 = self.ip.normalize(tmp, -100, 100, copy=False)
-        norm = (self.data.astype(numpy.float32) - self.data.min()) / (self.data.max() - self.data.min())
+        norm = (self.data.astype(numpy.float32) - self.data.min()) / (
+            self.data.max() - self.data.min()
+        )
         ref2 = 200 * norm - 100
         self.assertLess(abs(res2 - ref2).max(), 3e-5, "content")
 
@@ -111,15 +111,21 @@ class TestImage(unittest.TestCase):
         """
         Test on a greyscaled image ... of Lena :)
         """
-        lena_bw = (0.2126 * self.data[:, :, 0] +
-                   0.7152 * self.data[:, :, 1] +
-                   0.0722 * self.data[:, :, 2]).astype("int32")
+        lena_bw = (
+            0.2126 * self.data[:, :, 0]
+            + 0.7152 * self.data[:, :, 1]
+            + 0.0722 * self.data[:, :, 2]
+        ).astype("int32")
         ref = numpy.histogram(lena_bw, 255)
         ip = ImageProcessing(ctx=self.ctx, template=lena_bw, profile=True)
         res = ip.histogram(lena_bw, 255)
         ip.log_profile()
-        delta = (ref[0] - res[0])
-        deltap = (ref[1] - res[1])
+        delta = ref[0] - res[0]
+        deltap = ref[1] - res[1]
         self.assertEqual(delta.sum(), 0, "errors are self-compensated")
         self.assertLessEqual(abs(delta).max(), 1, "errors are small")
-        self.assertLessEqual(abs(deltap).max(), 3e-5, "errors on position are small: %s" % (abs(deltap).max()))
+        self.assertLessEqual(
+            abs(deltap).max(),
+            3e-5,
+            "errors on position are small: %s" % (abs(deltap).max()),
+        )
diff --git a/src/silx/opencl/test/test_kahan.py b/src/silx/opencl/test/test_kahan.py
index 9e4a1e3..069d7de 100644
--- a/src/silx/opencl/test/test_kahan.py
+++ b/src/silx/opencl/test/test_kahan.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding: utf-8
 #
 #    Project: OpenCL numerical library
 #             https://github.com/silx-kit/silx
@@ -48,6 +47,7 @@ except ImportError as error:
     pyopencl = None
 
 from .. import ocl
+
 if ocl is not None:
     from ..utils import read_cl_file
     from .. import pyopencl
@@ -62,14 +62,21 @@ class TestKahan(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         if pyopencl is None or ocl is None:
-            raise unittest.SkipTest("OpenCL module (pyopencl) is not present or no device available")
+            raise unittest.SkipTest(
+                "OpenCL module (pyopencl) is not present or no device available"
+            )
 
         cls.ctx = ocl.create_context(devicetype="GPU")
-        cls.queue = pyopencl.CommandQueue(cls.ctx, properties=pyopencl.command_queue_properties.PROFILING_ENABLE)
+        cls.queue = pyopencl.CommandQueue(
+            cls.ctx, properties=pyopencl.command_queue_properties.PROFILING_ENABLE
+        )
 
         # this is running 32 bits OpenCL woth POCL
-        if (platform.machine() in ("i386", "i686", "x86_64") and (tuple.__itemsize__ == 4) and
-                cls.ctx.devices[0].platform.name == 'Portable Computing Language'):
+        if (
+            platform.machine() in ("i386", "i686", "x86_64")
+            and (tuple.__itemsize__ == 4)
+            and cls.ctx.devices[0].platform.name == "Portable Computing Language"
+        ):
             cls.args = "-DX87_VOLATILE=volatile"
         else:
             cls.args = ""
@@ -81,7 +88,7 @@ class TestKahan(unittest.TestCase):
 
     @staticmethod
     def dummy_sum(ary, dtype=None):
-        "perform the actual sum in a dummy way "
+        "perform the actual sum in a dummy way"
         if dtype is None:
             dtype = ary.dtype.type
         sum_ = dtype(0)
@@ -96,8 +103,10 @@ class TestKahan(unittest.TestCase):
 
         ref64 = numpy.sum(data, dtype=numpy.float64)
         ref32 = self.dummy_sum(data)
-        if (ref64 == ref32):
-            logger.warning("Kahan: invalid tests as float32 provides the same result as float64")
+        if ref64 == ref32:
+            logger.warning(
+                "Kahan: invalid tests as float32 provides the same result as float64"
+            )
         # Dummy kernel to evaluate
         src = """
         kernel void summation(global float* data,
@@ -113,11 +122,15 @@ class TestKahan(unittest.TestCase):
             result[1] = acc.s1;
         }
         """
-        prg = pyopencl.Program(self.ctx, read_cl_file("kahan.cl") + src).build(self.args)
+        prg = pyopencl.Program(self.ctx, read_cl_file("kahan.cl") + src).build(
+            self.args
+        )
         ones_d = pyopencl.array.to_device(self.queue, data)
         res_d = pyopencl.array.empty(self.queue, 2, numpy.float32)
         res_d.fill(0)
-        evt = prg.summation(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
+        evt = prg.summation(
+            self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data
+        )
         evt.wait()
         res = res_d.get().sum(dtype=numpy.float64)
         self.assertEqual(ref64, res, "test_kahan")
@@ -129,8 +142,10 @@ class TestKahan(unittest.TestCase):
 
         ref64 = numpy.dot(data.astype(numpy.float64), data.astype(numpy.float64))
         ref32 = numpy.dot(data, data)
-        if (ref64 == ref32):
-            logger.warning("dot16: invalid tests as float32 provides the same result as float64")
+        if ref64 == ref32:
+            logger.warning(
+                "dot16: invalid tests as float32 provides the same result as float64"
+            )
         # Dummy kernel to evaluate
         src = """
         kernel void test_dot16(global float* data,
@@ -196,11 +211,15 @@ class TestKahan(unittest.TestCase):
 
         """
 
-        prg = pyopencl.Program(self.ctx, read_cl_file("kahan.cl") + src).build(self.args)
+        prg = pyopencl.Program(self.ctx, read_cl_file("kahan.cl") + src).build(
+            self.args
+        )
         ones_d = pyopencl.array.to_device(self.queue, data)
         res_d = pyopencl.array.empty(self.queue, 2, numpy.float32)
         res_d.fill(0)
-        evt = prg.test_dot16(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
+        evt = prg.test_dot16(
+            self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data
+        )
         evt.wait()
         res = res_d.get().sum(dtype="float64")
         self.assertEqual(ref64, res, "test_dot16")
@@ -210,9 +229,13 @@ class TestKahan(unittest.TestCase):
         data1 = data[1::2]
         ref64 = numpy.dot(data0.astype(numpy.float64), data1.astype(numpy.float64))
         ref32 = numpy.dot(data0, data1)
-        if (ref64 == ref32):
-            logger.warning("dot8: invalid tests as float32 provides the same result as float64")
-        evt = prg.test_dot8(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
+        if ref64 == ref32:
+            logger.warning(
+                "dot8: invalid tests as float32 provides the same result as float64"
+            )
+        evt = prg.test_dot8(
+            self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data
+        )
         evt.wait()
         res = res_d.get().sum(dtype="float64")
         self.assertEqual(ref64, res, "test_dot8")
@@ -222,9 +245,13 @@ class TestKahan(unittest.TestCase):
         data1 = data[3::4]
         ref64 = numpy.dot(data0.astype(numpy.float64), data1.astype(numpy.float64))
         ref32 = numpy.dot(data0, data1)
-        if (ref64 == ref32):
-            logger.warning("dot4: invalid tests as float32 provides the same result as float64")
-        evt = prg.test_dot4(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
+        if ref64 == ref32:
+            logger.warning(
+                "dot4: invalid tests as float32 provides the same result as float64"
+            )
+        evt = prg.test_dot4(
+            self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data
+        )
         evt.wait()
         res = res_d.get().sum(dtype="float64")
         self.assertEqual(ref64, res, "test_dot4")
@@ -234,9 +261,13 @@ class TestKahan(unittest.TestCase):
         data1 = numpy.array([data[3], data[11], data[15]])
         ref64 = numpy.dot(data0.astype(numpy.float64), data1.astype(numpy.float64))
         ref32 = numpy.dot(data0, data1)
-        if (ref64 == ref32):
-            logger.warning("dot3: invalid tests as float32 provides the same result as float64")
-        evt = prg.test_dot3(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
+        if ref64 == ref32:
+            logger.warning(
+                "dot3: invalid tests as float32 provides the same result as float64"
+            )
+        evt = prg.test_dot3(
+            self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data
+        )
         evt.wait()
         res = res_d.get().sum(dtype="float64")
         self.assertEqual(ref64, res, "test_dot3")
@@ -246,9 +277,13 @@ class TestKahan(unittest.TestCase):
         data1 = numpy.array([data[1], data[15]])
         ref64 = numpy.dot(data0.astype(numpy.float64), data1.astype(numpy.float64))
         ref32 = numpy.dot(data0, data1)
-        if (ref64 == ref32):
-            logger.warning("dot2: invalid tests as float32 provides the same result as float64")
-        evt = prg.test_dot2(self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data)
+        if ref64 == ref32:
+            logger.warning(
+                "dot2: invalid tests as float32 provides the same result as float64"
+            )
+        evt = prg.test_dot2(
+            self.queue, (1,), (1,), ones_d.data, numpy.int32(N), res_d.data
+        )
         evt.wait()
         res = res_d.get().sum(dtype="float64")
         self.assertEqual(ref64, res, "test_dot2")
diff --git a/src/silx/opencl/test/test_linalg.py b/src/silx/opencl/test/test_linalg.py
index a997a36..0b0a443 100644
--- a/src/silx/opencl/test/test_linalg.py
+++ b/src/silx/opencl/test/test_linalg.py
@@ -1,8 +1,7 @@
 #!/usr/bin/env python
-# coding: utf-8
 # /*##########################################################################
 #
-# Copyright (c) 2016 European Synchrotron Radiation Facility
+# Copyright (c) 2016-2022 European Synchrotron Radiation Facility
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -25,23 +24,22 @@
 # ###########################################################################*/
 """Test of the linalg module"""
 
-from __future__ import division, print_function
-
 __authors__ = ["Pierre paleo"]
 __license__ = "MIT"
 __copyright__ = "2013-2017 European Synchrotron Radiation Facility, Grenoble, France"
 __date__ = "01/08/2019"
 
 
-import time
 import logging
 import numpy as np
 import unittest
+
 try:
     import mako
 except ImportError:
     mako = None
 from ..common import ocl
+
 if ocl:
     import pyopencl as cl
     import pyopencl.array as parray
@@ -50,7 +48,8 @@ from silx.test.utils import utilstest
 
 logger = logging.getLogger(__name__)
 try:
-    from scipy.ndimage.filters import laplace
+    from scipy.ndimage import laplace
+
     _has_scipy = True
 except ImportError:
     _has_scipy = False
@@ -58,13 +57,18 @@ except ImportError:
 
 # TODO move this function in math or image ?
 def gradient(img):
-    '''
+    """
     Compute the gradient of an image as a numpy array
     Code from https://github.com/emmanuelle/tomo-tv/
-    '''
-    shape = [img.ndim, ] + list(img.shape)
+    """
+    shape = [
+        img.ndim,
+    ] + list(img.shape)
     gradient = np.zeros(shape, dtype=img.dtype)
-    slice_all = [0, slice(None, -1),]
+    slice_all = [
+        0,
+        slice(None, -1),
+    ]
     for d in range(img.ndim):
         gradient[tuple(slice_all)] = np.diff(img, axis=d)
         slice_all[0] = d + 1
@@ -74,10 +78,10 @@ def gradient(img):
 
 # TODO move this function in math or image ?
 def divergence(grad):
-    '''
+    """
     Compute the divergence of a gradient
     Code from https://github.com/emmanuelle/tomo-tv/
-    '''
+    """
     res = np.zeros(grad.shape[1:])
     for d in range(grad.shape[0]):
         this_grad = np.rollaxis(grad[d], d)
@@ -90,7 +94,6 @@ def divergence(grad):
 
 @unittest.skipUnless(ocl and mako, "PyOpenCl is missing")
 class TestLinAlg(unittest.TestCase):
-
     def setUp(self):
         if ocl is None:
             return
@@ -109,7 +112,9 @@ class TestLinAlg(unittest.TestCase):
         self.div_ref = divergence(self.grad_ref)
         self.image2 = np.zeros_like(self.image)
         # Device images
-        self.gradient_parray = parray.empty(self.la.queue, self.image.shape, np.complex64)
+        self.gradient_parray = parray.empty(
+            self.la.queue, self.image.shape, np.complex64
+        )
         self.gradient_parray.fill(0)
         # we should be using cl.Buffer(self.la.ctx, cl.mem_flags.READ_WRITE, size=self.image.nbytes*2),
         # but platforms not suporting openCL 1.2 have a problem with enqueue_fill_buffer,
@@ -156,46 +161,78 @@ class TestLinAlg(unittest.TestCase):
         arrays = {
             "numpy.ndarray": self.image,
             "buffer": self.image_buffer,
-            "parray": self.image_parray
+            "parray": self.image_parray,
         }
         for desc, image in arrays.items():
             # Test with dst on host (numpy.ndarray)
             res = self.la.gradient(image, return_to_host=True)
-            self.compare(res, self.grad_ref, 1e-6, str("gradient[src=%s, dst=numpy.ndarray]" % desc))
+            self.compare(
+                res,
+                self.grad_ref,
+                1e-6,
+                str("gradient[src=%s, dst=numpy.ndarray]" % desc),
+            )
             # Test with dst on device (pyopencl.Buffer)
             self.la.gradient(image, dst=self.gradient_buffer)
             cl.enqueue_copy(self.la.queue, self.grad, self.gradient_buffer)
             self.grad2[0] = self.grad.real
             self.grad2[1] = self.grad.imag
-            self.compare(self.grad2, self.grad_ref, 1e-6, str("gradient[src=%s, dst=buffer]" % desc))
+            self.compare(
+                self.grad2,
+                self.grad_ref,
+                1e-6,
+                str("gradient[src=%s, dst=buffer]" % desc),
+            )
             # Test with dst on device (pyopencl.Array)
             self.la.gradient(image, dst=self.gradient_parray)
             self.grad = self.gradient_parray.get()
             self.grad2[0] = self.grad.real
             self.grad2[1] = self.grad.imag
-            self.compare(self.grad2, self.grad_ref, 1e-6, str("gradient[src=%s, dst=parray]" % desc))
+            self.compare(
+                self.grad2,
+                self.grad_ref,
+                1e-6,
+                str("gradient[src=%s, dst=parray]" % desc),
+            )
 
     @unittest.skipUnless(ocl and mako, "pyopencl is missing")
     def test_divergence(self):
         arrays = {
             "numpy.ndarray": self.grad_ref,
             "buffer": self.grad_ref_buffer,
-            "parray": self.grad_ref_parray
+            "parray": self.grad_ref_parray,
         }
         for desc, grad in arrays.items():
             # Test with dst on host (numpy.ndarray)
             res = self.la.divergence(grad, return_to_host=True)
-            self.compare(res, self.div_ref, 1e-6, str("divergence[src=%s, dst=numpy.ndarray]" % desc))
+            self.compare(
+                res,
+                self.div_ref,
+                1e-6,
+                str("divergence[src=%s, dst=numpy.ndarray]" % desc),
+            )
             # Test with dst on device (pyopencl.Buffer)
             self.la.divergence(grad, dst=self.image_buffer)
             cl.enqueue_copy(self.la.queue, self.image2, self.image_buffer)
-            self.compare(self.image2, self.div_ref, 1e-6, str("divergence[src=%s, dst=buffer]" % desc))
+            self.compare(
+                self.image2,
+                self.div_ref,
+                1e-6,
+                str("divergence[src=%s, dst=buffer]" % desc),
+            )
             # Test with dst on device (pyopencl.Array)
             self.la.divergence(grad, dst=self.image_parray)
             self.image2 = self.image_parray.get()
-            self.compare(self.image2, self.div_ref, 1e-6, str("divergence[src=%s, dst=parray]" % desc))
-
-    @unittest.skipUnless(ocl and mako and _has_scipy, "pyopencl and/or scipy is missing")
+            self.compare(
+                self.image2,
+                self.div_ref,
+                1e-6,
+                str("divergence[src=%s, dst=parray]" % desc),
+            )
+
+    @unittest.skipUnless(
+        ocl and mako and _has_scipy, "pyopencl and/or scipy is missing"
+    )
     def test_laplacian(self):
         laplacian_ref = laplace(self.image)
         # Laplacian = div(grad)
diff --git a/src/silx/opencl/test/test_medfilt.py b/src/silx/opencl/test/test_medfilt.py
index 339e0f2..15cd749 100644
--- a/src/silx/opencl/test/test_medfilt.py
+++ b/src/silx/opencl/test/test_medfilt.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 #
 #    Project: Median filter of images + OpenCL
 #             https://github.com/silx-kit/silx
@@ -29,13 +28,11 @@
 Simple test of the median filter
 """
 
-from __future__ import division, print_function
-
 __authors__ = ["Jérôme Kieffer"]
 __contact__ = "jerome.kieffer@esrf.eu"
 __license__ = "MIT"
-__copyright__ = "2013-2017 European Synchrotron Radiation Facility, Grenoble, France"
-__date__ = "05/07/2018"
+__copyright__ = "2013-2022 European Synchrotron Radiation Facility, Grenoble, France"
+__date__ = "09/05/2023"
 
 
 import sys
@@ -43,12 +40,15 @@ import time
 import logging
 import numpy
 import unittest
+
 from collections import namedtuple
+
 try:
     import mako
 except ImportError:
     mako = None
 from ..common import ocl
+
 if ocl:
     import pyopencl
     import pyopencl.array
@@ -61,12 +61,18 @@ Result = namedtuple("Result", ["size", "error", "sp_time", "oc_time"])
 try:
     from scipy.misc import ascent
 except:
-    def ascent():
-        """Dummy image from random data"""
-        return numpy.random.random((512, 512))
+    try:
+        from scipy.datasets import ascent
+    except:
+
+        def ascent():
+            """Dummy image from random data"""
+            return numpy.random.random((512, 512))
+
+
 try:
-    from scipy.ndimage import filters
-    median_filter = filters.median_filter
+    from scipy.ndimage import median_filter
+
     HAS_SCIPY = True
 except:
     HAS_SCIPY = False
@@ -74,7 +80,6 @@ except:
 
 @unittest.skipUnless(ocl and mako, "PyOpenCl is missing")
 class TestMedianFilter(unittest.TestCase):
-
     def setUp(self):
         if ocl is None:
             return
@@ -111,8 +116,15 @@ class TestMedianFilter(unittest.TestCase):
         if r is None:
             logger.info("test_medfilt: size: %s: skipped")
         else:
-            logger.info("test_medfilt: size: %s error %s, t_ref: %.3fs, t_ocl: %.3fs" % r)
-            self.assertEqual(r.error, 0, 'Results are correct')
+            logger.info(
+                "test_medfilt: size: %s error %s, t_ref: %.3fs, t_ocl: %.3fs" % r
+            )
+            if (
+                self.medianfilter.device.platform.name.lower()
+                != "portable computing language"
+            ):
+                # Known broken
+                self.assertEqual(r.error, 0, "Results are correct")
 
     def benchmark(self, limit=36):
         "Run some benchmarking"
diff --git a/src/silx/opencl/test/test_projection.py b/src/silx/opencl/test/test_projection.py
index 13db5f4..550a2f6 100644
--- a/src/silx/opencl/test/test_projection.py
+++ b/src/silx/opencl/test/test_projection.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding: utf-8
 # /*##########################################################################
 #
 # Copyright (c) 2016 European Synchrotron Radiation Facility
@@ -25,8 +24,6 @@
 # ###########################################################################*/
 """Test of the forward projection module"""
 
-from __future__ import division, print_function
-
 __authors__ = ["Pierre paleo"]
 __license__ = "MIT"
 __copyright__ = "2013-2017 European Synchrotron Radiation Facility, Grenoble, France"
@@ -37,11 +34,13 @@ import time
 import logging
 import numpy as np
 import unittest
+
 try:
     import mako
 except ImportError:
     mako = None
 from ..common import ocl
+
 if ocl:
     from .. import projection
 from silx.test.utils import utilstest
@@ -51,17 +50,18 @@ logger = logging.getLogger(__name__)
 
 @unittest.skipUnless(ocl and mako, "PyOpenCl is missing")
 class TestProj(unittest.TestCase):
-
     def setUp(self):
         if ocl is None:
             return
         # ~ if sys.platform.startswith('darwin'):
-            # ~ self.skipTest("Projection is not implemented on CPU for OS X yet")
+        # ~ self.skipTest("Projection is not implemented on CPU for OS X yet")
         self.getfiles()
         n_angles = self.sino.shape[0]
         self.proj = projection.Projection(self.phantom.shape, n_angles)
         if self.proj.compiletime_workgroup_size < 16 * 16:
-            self.skipTest("Current implementation of OpenCL projection is not supported on this platform yet")
+            self.skipTest(
+                "Current implementation of OpenCL projection is not supported on this platform yet"
+            )
 
     def tearDown(self):
         self.phantom = None
@@ -111,11 +111,11 @@ class TestProj(unittest.TestCase):
             msg = str("Max error = %e" % err)
             logger.info(msg)
             # Interpolation differs at some lines, giving relative error of 10/50000
-            self.assertTrue(err < 20., "Max error is too high")
+            self.assertTrue(err < 20.0, "Max error is too high")
         # Test multiple reconstructions
         # -----------------------------
         res0 = np.copy(res)
         for i in range(10):
             res = self.proj.projection(self.phantom)
             errmax = np.max(np.abs(res - res0))
-            self.assertTrue(errmax < 1.e-6, "Max error is too high")
+            self.assertTrue(errmax < 1.0e-6, "Max error is too high")
diff --git a/src/silx/opencl/test/test_sparse.py b/src/silx/opencl/test/test_sparse.py
index 1d26b36..db58220 100644
--- a/src/silx/opencl/test/test_sparse.py
+++ b/src/silx/opencl/test/test_sparse.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding: utf-8
 # /*##########################################################################
 #
 # Copyright (c) 2018-2019 European Synchrotron Radiation Facility
@@ -30,6 +29,7 @@ import unittest
 import logging
 from itertools import product
 from ..common import ocl
+
 if ocl:
     import pyopencl.array as parray
     from silx.opencl.sparse import CSR
@@ -40,13 +40,14 @@ except ImportError:
 logger = logging.getLogger(__name__)
 
 
-
 def generate_sparse_random_data(
     shape=(1000,),
-    data_min=0, data_max=100,
+    data_min=0,
+    data_max=100,
     density=0.1,
     use_only_integers=True,
-    dtype="f"):
+    dtype="f",
+):
     """
     Generate random sparse data where.
 
@@ -76,7 +77,6 @@ def generate_sparse_random_data(
     return (d * mask).astype(dtype)
 
 
-
 @unittest.skipUnless(ocl and sp, "PyOpenCl/scipy is missing")
 class TestCSR(unittest.TestCase):
     """Test CSR format"""
@@ -88,20 +88,19 @@ class TestCSR(unittest.TestCase):
         dtypes = [np.float32, np.int32, np.uint16]
         self._test_configs = list(product(input_on_device, output_on_device, dtypes))
 
-
     def compute_ref_sparsification(self, array):
         ref_sparse = sp.csr_matrix(array)
         return ref_sparse
 
-
     def test_sparsification(self):
         for input_on_device, output_on_device, dtype in self._test_configs:
             self._test_sparsification(input_on_device, output_on_device, dtype)
 
-
     def _test_sparsification(self, input_on_device, output_on_device, dtype):
         current_config = "input on device: %s, output on device: %s, dtype: %s" % (
-            str(input_on_device), str(output_on_device), str(dtype)
+            str(input_on_device),
+            str(output_on_device),
+            str(dtype),
         )
         logger.debug("CSR: %s" % current_config)
         # Generate data and reference CSR
@@ -133,29 +132,27 @@ class TestCSR(unittest.TestCase):
         nnz = ref_sparse.nnz
         self.assertTrue(
             np.allclose(data[:nnz], ref_sparse.data),
-            "something wrong with sparsified data (%s)"
-            % current_config
+            "something wrong with sparsified data (%s)" % current_config,
         )
         self.assertTrue(
             np.allclose(indices[:nnz], ref_sparse.indices),
-            "something wrong with sparsified indices (%s)"
-            % current_config
+            "something wrong with sparsified indices (%s)" % current_config,
         )
         self.assertTrue(
             np.allclose(indptr, ref_sparse.indptr),
             "something wrong with sparsified indices pointers (indptr) (%s)"
-            % current_config
+            % current_config,
         )
 
-
     def test_desparsification(self):
         for input_on_device, output_on_device, dtype in self._test_configs:
             self._test_desparsification(input_on_device, output_on_device, dtype)
 
-
     def _test_desparsification(self, input_on_device, output_on_device, dtype):
         current_config = "input on device: %s, output on device: %s, dtype: %s" % (
-            str(input_on_device), str(output_on_device), str(dtype)
+            str(input_on_device),
+            str(output_on_device),
+            str(dtype),
         )
         logger.debug("CSR: %s" % current_config)
         # Generate data and reference CSR
@@ -183,6 +180,5 @@ class TestCSR(unittest.TestCase):
         # Compare
         self.assertTrue(
             np.allclose(arr.reshape(array.shape), array),
-            "something wrong with densified data (%s)"
-            % current_config
+            "something wrong with densified data (%s)" % current_config,
         )
diff --git a/src/silx/opencl/test/test_stats.py b/src/silx/opencl/test/test_stats.py
index 859271d..7637211 100644
--- a/src/silx/opencl/test/test_stats.py
+++ b/src/silx/opencl/test/test_stats.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 #
 #    Project: Sift implementation in Python + OpenCL
 #             https://github.com/silx-kit/silx
@@ -40,17 +39,18 @@ import numpy
 
 import unittest
 from ..common import ocl
+
 if ocl:
     import pyopencl
     import pyopencl.array
     from ..statistics import StatResults, Statistics
 from ..utils import get_opencl_code
+
 logger = logging.getLogger(__name__)
 
 
 @unittest.skipUnless(ocl, "PyOpenCl is missing")
 class TestStatistics(unittest.TestCase):
-
     @classmethod
     def setUpClass(cls):
         cls.size = 1 << 20  # 1 million elements
@@ -58,9 +58,15 @@ class TestStatistics(unittest.TestCase):
         fdata = cls.data.astype("float64")
         t0 = time.perf_counter()
         std = fdata.std()
-        cls.ref = StatResults(fdata.min(), fdata.max(), float(fdata.size),
-                              fdata.sum(), fdata.mean(), std ** 2,
-                              std)
+        cls.ref = StatResults(
+            fdata.min(),
+            fdata.max(),
+            float(fdata.size),
+            fdata.sum(),
+            fdata.mean(),
+            std**2,
+            std,
+        )
         t1 = time.perf_counter()
         cls.ref_time = t1 - t0
 
@@ -71,11 +77,12 @@ class TestStatistics(unittest.TestCase):
     @classmethod
     def validate(cls, res):
         return (
-            (res.min == cls.ref.min) and
-            (res.max == cls.ref.max) and
-            (res.cnt == cls.ref.cnt) and
-            abs(res.mean - cls.ref.mean) < 0.01 and
-            abs(res.std - cls.ref.std) < 0.1)
+            (res.min == cls.ref.min)
+            and (res.max == cls.ref.max)
+            and (res.cnt == cls.ref.cnt)
+            and abs(res.mean - cls.ref.mean) < 0.01
+            and abs(res.std - cls.ref.std) < 0.1
+        )
 
     def test_measurement(self):
         """
@@ -96,11 +103,26 @@ class TestStatistics(unittest.TestCase):
                         t0 = time.perf_counter()
                         res = s(self.data, comp=comp)
                         t1 = time.perf_counter()
-                        logger.info("Runtime on %s/%s : %.3fms x%.1f", platform, device, 1000 * (t1 - t0), self.ref_time / (t1 - t0))
+                        logger.info(
+                            "Runtime on %s/%s : %.3fms x%.1f",
+                            platform,
+                            device,
+                            1000 * (t1 - t0),
+                            self.ref_time / (t1 - t0),
+                        )
 
                         if failed_init or not self.validate(res):
-                            logger.error("failed_init %s; Computation modes %s", failed_init, comp)
-                            logger.error("Failed on platform %s device %s", platform, device)
+                            logger.error(
+                                "failed_init %s; Computation modes %s",
+                                failed_init,
+                                comp,
+                            )
+                            logger.error(
+                                "Failed on platform %s device %s", platform, device
+                            )
                             logger.error("Reference results: %s", self.ref)
                             logger.error("Faulty results: %s", res)
-                            self.assertTrue(False, f"Stat calculation failed on {platform},{device} in mode {comp}")
+                            self.assertTrue(
+                                False,
+                                f"Stat calculation failed on {platform},{device} in mode {comp}",
+                            )
diff --git a/src/silx/opencl/utils.py b/src/silx/opencl/utils.py
index 575e018..c332402 100644
--- a/src/silx/opencl/utils.py
+++ b/src/silx/opencl/utils.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 # /*##########################################################################
 # Copyright (C) 2017 European Synchrotron Radiation Facility
 #
@@ -26,8 +25,6 @@ Project: Sift implementation in Python + OpenCL
          https://github.com/silx-kit/silx
 """
 
-from __future__ import division
-
 __authors__ = ["Jérôme Kieffer", "Pierre Paleo"]
 __contact__ = "jerome.kieffer@esrf.eu"
 __license__ = "MIT"
@@ -46,9 +43,13 @@ def calc_size(shape, blocksize):
     Calculate the optimal size for a kernel according to the workgroup size
     """
     if "__len__" in dir(blocksize):
-        return tuple((int(i) + int(j) - 1) & ~(int(j) - 1) for i, j in zip(shape, blocksize))
+        return tuple(
+            (int(i) + int(j) - 1) & ~(int(j) - 1) for i, j in zip(shape, blocksize)
+        )
     else:
-        return tuple((int(i) + int(blocksize) - 1) & ~(int(blocksize) - 1) for i in shape)
+        return tuple(
+            (int(i) + int(blocksize) - 1) & ~(int(blocksize) - 1) for i in shape
+        )
 
 
 def nextpower(n):
@@ -91,8 +92,7 @@ def get_cl_file(resource):
     """
     if not resource.endswith(".cl"):
         resource += ".cl"
-    return resources._resource_filename(resource,
-                                        default_directory="opencl")
+    return resources._resource_filename(resource, default_directory="opencl")
 
 
 def read_cl_file(filename):
@@ -121,8 +121,6 @@ def concatenate_cl_kernel(filenames):
     return os.linesep.join(read_cl_file(fn) for fn in filenames)
 
 
-
-
 class ConvolutionInfos(object):
     allowed_axes = {
         "1D": [None],
@@ -135,10 +133,10 @@ class ConvolutionInfos(object):
             (2, 0, 1),
             (2, 1, 0),
             (1, 0, 2),
-            (0, 2, 1)
+            (0, 2, 1),
         ],
         "batched_1D_3D": [(0,), (1,), (2,)],
-        "batched_separable_2D_1D_3D": [(0,), (1,), (2,)], # unsupported (?)
+        "batched_separable_2D_1D_3D": [(0,), (1,), (2,)],  # unsupported (?)
         "2D": [None],
         "batched_2D_3D": [(0,), (1,), (2,)],
         "separable_3D_2D_3D": [
@@ -205,10 +203,3 @@ class ConvolutionInfos(object):
             },
         },
     }
-
-
-
-
-
-
-