9 files changed, 1658 insertions, 0 deletions
diff --git a/silx/opencl/__init__.py b/silx/opencl/__init__.py
new file mode 100644
index 0000000..b970338
--- /dev/null
+++ b/silx/opencl/__init__.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+#    Project: S I L X project
+#             https://github.com/silx-kit/silx
+#
+#    Copyright (C) 2012-2017 European Synchrotron Radiation Facility, Grenoble, France
+#
+#    Principal author:       Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+__author__ = "Jerome Kieffer"
+__contact__ = "Jerome.Kieffer@ESRF.eu"
+__license__ = "MIT"
+__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
+__date__ = "15/03/2017"
+__status__ = "stable"
+
+import logging
+
+
+logger = logging.getLogger("silx.opencl")
+
+
+from .common import *
diff --git a/silx/opencl/common.py b/silx/opencl/common.py
new file mode 100644
index 0000000..fcb4efa
--- /dev/null
+++ b/silx/opencl/common.py
@@ -0,0 +1,561 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+#    Project: S I L X project
+#             https://github.com/silx-kit/silx
+#
+#    Copyright (C) 2012-2017 European Synchrotron Radiation Facility, Grenoble, France
+#
+#    Principal author:       Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+__author__ = "Jerome Kieffer"
+__contact__ = "Jerome.Kieffer@ESRF.eu"
+__license__ = "MIT"
+__copyright__ = "2012-2017 European Synchrotron Radiation Facility, Grenoble, France"
+__date__ = "15/03/2017"
+__status__ = "stable"
+__all__ = ["ocl", "pyopencl", "mf", "release_cl_buffers", "allocate_cl_buffers",
+           "measure_workgroup_size", "kernel_workgroup_size"]
+
+import os
+import logging
+
+import numpy
+
+from .utils import get_opencl_code
+
+
+logger = logging.getLogger("silx.opencl")
+
+
+if os.environ.get("SILX_OPENCL") in ["0", "False"]:
+    logger.warning("Use of OpenCL has been disables from environment variable: SILX_OPENCL=0")
+    pyopencl = None
+else:
+    try:
+        import pyopencl
+    except ImportError:
+        logger.warning("Unable to import pyOpenCl. Please install it from: http://pypi.python.org/pypi/pyopencl")
+        pyopencl = None
+        class mf(object):
+            WRITE_ONLY = 1
+            READ_ONLY = 1
+            READ_WRITE = 1
+    else:
+        import pyopencl.array as array
+        mf = pyopencl.mem_flags
+
+
+FLOP_PER_CORE = {"GPU": 64,  # GPU, Fermi at least perform 64 flops per cycle/multicore, G80 were at 24 or 48 ...
+                 "CPU": 4,  # CPU, at least intel's have 4 operation per cycle
+                 "ACC": 8}  # ACC: the Xeon-phi (MIC) appears to be able to process 8 Flops per hyperthreaded-core
+
+# Sources : https://en.wikipedia.org/wiki/CUDA
+NVIDIA_FLOP_PER_CORE = {(1, 0): 24,  # Guessed !
+                        (1, 1): 24,  # Measured on G98 [Quadro NVS 295]
+                        (1, 2): 24,  # Guessed !
+                        (1, 3): 24,  # measured on a GT285 (GT200)
+                        (2, 0): 64,  # Measured on a 580 (GF110)
+                        (2, 1): 96,  # Measured on Quadro2000 GF106GL
+                        (3, 0): 384,  # Guessed!
+                        (3, 5): 384,  # Measured on K20
+                        (3, 7): 384,  # K80: Guessed!
+                        (5, 0): 256,  # Maxwell 4 warps/SM 2 flops/ CU
+                        (5, 2): 256,  # Titan-X
+                        (5, 3): 256,  # TX1
+                        (6, 0): 128,  # GP100
+                        (6, 1): 128,  # GP104
+                        (6, 2): 128,  # ?
+                        (7, 0): 256,  # Volta ?
+                        (7, 1): 256,  # Volta ?
+                        }
+
+AMD_FLOP_PER_CORE = 160  # Measured on a M7820 10 core, 700MHz 1120GFlops
+
+
+class Device(object):
+    """
+    Simple class that contains the structure of an OpenCL device
+    """
+    def __init__(self, name="None", dtype=None, version=None, driver_version=None,
+                 extensions="", memory=None, available=None,
+                 cores=None, frequency=None, flop_core=None, idx=0, workgroup=1):
+        """
+        Simple container with some important data for the OpenCL device description.
+
+        :param name: name of the device
+        :param dtype: device type: CPU/GPU/ACC...
+        :param version: driver version
+        :param driver_version:
+        :param extensions: List of opencl extensions
+        :param memory: maximum memory available on the device
+        :param available: is the device deactivated or not
+        :param cores: number of SM/cores
+        :param frequency: frequency of the device
+        :param flop_core: Flopating Point operation per core per cycle
+        :param idx: index of the device within the platform
+        :param workgroup: max workgroup size
+        """
+        self.name = name.strip()
+        self.type = dtype
+        self.version = version
+        self.driver_version = driver_version
+        self.extensions = extensions.split()
+        self.memory = memory
+        self.available = available
+        self.cores = cores
+        self.frequency = frequency
+        self.id = idx
+        self.max_work_group_size = workgroup
+        if not flop_core:
+            flop_core = FLOP_PER_CORE.get(dtype, 1)
+        if cores and frequency:
+            self.flops = cores * frequency * flop_core
+        else:
+            self.flops = flop_core
+
+    def __repr__(self):
+        return "%s" % self.name
+
+    def pretty_print(self):
+        """
+        Complete device description
+
+        :return: string
+        """
+        lst = ["Name\t\t:\t%s" % self.name,
+               "Type\t\t:\t%s" % self.type,
+               "Memory\t\t:\t%.3f MB" % (self.memory / 2.0 ** 20),
+               "Cores\t\t:\t%s CU" % self.cores,
+               "Frequency\t:\t%s MHz" % self.frequency,
+               "Speed\t\t:\t%.3f GFLOPS" % (self.flops / 1000.),
+               "Version\t\t:\t%s" % self.version,
+               "Available\t:\t%s" % self.available]
+        return os.linesep.join(lst)
+
+
+class Platform(object):
+    """
+    Simple class that contains the structure of an OpenCL platform
+    """
+    def __init__(self, name="None", vendor="None", version=None, extensions=None, idx=0):
+        """
+        Class containing all descriptions of a platform and all devices description within that platform.
+
+        :param name: platform name
+        :param vendor: name of the brand/vendor
+        :param version:
+        :param extensions: list of the extension provided by the platform to all of its devices
+        :param idx: index of the platform
+        """
+        self.name = name.strip()
+        self.vendor = vendor.strip()
+        self.version = version
+        self.extensions = extensions.split()
+        self.devices = []
+        self.id = idx
+
+    def __repr__(self):
+        return "%s" % self.name
+
+    def add_device(self, device):
+        """
+        Add new device to the platform
+
+        :param device: Device instance
+        """
+        self.devices.append(device)
+
+    def get_device(self, key):
+        """
+        Return a device according to key
+
+        :param key: identifier for a device, either it's id (int) or it's name
+        :type key: int or str
+        """
+        out = None
+        try:
+            devid = int(key)
+        except ValueError:
+            for a_dev in self.devices:
+                if a_dev.name == key:
+                    out = a_dev
+        else:
+            if len(self.devices) > devid > 0:
+                out = self.devices[devid]
+        return out
+
+
+def _measure_workgroup_size(device_or_context, fast=False):
+    """Mesure the maximal work group size of the given device
+
+    :param device_or_context: instance of pyopencl.Device or pyopencl.Context
+                    or 2-tuple (platformid,deviceid)
+    :param fast: ask the kernel the valid value, don't probe it
+    :return: maximum size for the workgroup
+    """
+    if isinstance(device_or_context, pyopencl.Device):
+        ctx = pyopencl.Context(devices=[device_or_context])
+        device = device_or_context
+    elif isinstance(device_or_context, pyopencl.Context):
+        ctx = device_or_context
+        device = device_or_context.devices[0]
+    elif isinstance(device_or_context, (tuple, list)) and len(device_or_context) == 2:
+        ctx = ocl.create_context(platformid=device_or_context[0],
+                                 deviceid=device_or_context[1])
+        device = ctx.devices[0]
+    else:
+        raise RuntimeError("""given parameter device_or_context is not an
+            instanciation of a device or a context""")
+    shape = device.max_work_group_size
+    # get the context
+
+    assert ctx is not None
+    queue = pyopencl.CommandQueue(ctx)
+
+    max_valid_wg = 1
+    data = numpy.random.random(shape).astype(numpy.float32)
+    d_data = pyopencl.array.to_device(queue, data)
+    d_data_1 = pyopencl.array.zeros_like(d_data) + 1
+
+    program = pyopencl.Program(ctx, get_opencl_code("addition")).build()
+    if fast:
+        max_valid_wg = program.addition.get_work_group_info(pyopencl.kernel_work_group_info.WORK_GROUP_SIZE, device)
+    else:
+        maxi = int(round(numpy.log2(shape)))
+        for i in range(maxi + 1):
+            d_res = pyopencl.array.empty_like(d_data)
+            wg = 1 << i
+            try:
+                evt = program.addition(
+                    queue, (shape,), (wg,),
+                    d_data.data, d_data_1.data, d_res.data, numpy.int32(shape))
+                evt.wait()
+            except Exception as error:
+                logger.info("%s on device %s for WG=%s/%s", error, device.name, wg, shape)
+                program = queue = d_res = d_data_1 = d_data = None
+                break
+            else:
+                res = d_res.get()
+                good = numpy.allclose(res, data + 1)
+                if good:
+                    if wg > max_valid_wg:
+                        max_valid_wg = wg
+                else:
+                    logger.warning("ArithmeticError on %s for WG=%s/%s", wg, device.name, shape)
+
+    return max_valid_wg
+
+
+def _is_nvidia_gpu(vendor, devtype):
+    return (vendor == "NVIDIA Corporation") and (devtype == "GPU")
+
+
+class OpenCL(object):
+    """
+    Simple class that wraps the structure ocl_tools_extended.h
+
+    This is a static class.
+    ocl should be the only instance and shared among all python modules.
+    """
+
+    platforms = []
+    nb_devices = 0
+    context_cache = {}  # key: 2-tuple of int, value: context
+    if pyopencl:
+        platform = device = pypl = devtype = extensions = pydev = None
+        for idx, platform in enumerate(pyopencl.get_platforms()):
+            pypl = Platform(platform.name, platform.vendor, platform.version, platform.extensions, idx)
+            for idd, device in enumerate(platform.get_devices()):
+                ####################################################
+                # Nvidia does not report int64 atomics (we are using) ...
+                # this is a hack around as any nvidia GPU with double-precision supports int64 atomics
+                ####################################################
+                extensions = device.extensions
+                if (pypl.vendor == "NVIDIA Corporation") and ('cl_khr_fp64' in extensions):
+                                extensions += ' cl_khr_int64_base_atomics cl_khr_int64_extended_atomics'
+                try:
+                    devtype = pyopencl.device_type.to_string(device.type).upper()
+                except ValueError:
+                    # pocl does not describe itself as a CPU !
+                    devtype = "CPU"
+                if len(devtype) > 3:
+                    devtype = devtype[:3]
+                if _is_nvidia_gpu(pypl.vendor, devtype) and "compute_capability_major_nv" in dir(device):
+                    comput_cap = device.compute_capability_major_nv, device.compute_capability_minor_nv
+                    flop_core = NVIDIA_FLOP_PER_CORE.get(comput_cap, min(NVIDIA_FLOP_PER_CORE.values()))
+                elif (pypl.vendor == "Advanced Micro Devices, Inc.") and (devtype == "GPU"):
+                    flop_core = AMD_FLOP_PER_CORE
+                elif devtype == "CPU":
+                    flop_core = FLOP_PER_CORE.get(devtype, 1)
+                else:
+                    flop_core = 1
+                workgroup = device.max_work_group_size
+                if (devtype == "CPU") and (pypl.vendor == "Apple"):
+                    logger.warning("For Apple's OpenCL on CPU: Measuring actual valid max_work_goup_size.")
+                    workgroup = _measure_workgroup_size(device, fast=True)
+                if (devtype == "GPU") and os.environ.get("GPU") == "False":
+                    # Environment variable to disable GPU devices
+                    continue
+                pydev = Device(device.name, devtype, device.version, device.driver_version, extensions,
+                               device.global_mem_size, bool(device.available), device.max_compute_units,
+                               device.max_clock_frequency, flop_core, idd, workgroup)
+                pypl.add_device(pydev)
+                nb_devices += 1
+            platforms.append(pypl)
+        del platform, device, pypl, devtype, extensions, pydev
+
+    def __repr__(self):
+        out = ["OpenCL devices:"]
+        for platformid, platform in enumerate(self.platforms):
+            deviceids = ["(%s,%s) %s" % (platformid, deviceid, dev.name)
+                         for deviceid, dev in enumerate(platform.devices)]
+            out.append("[%s] %s: " % (platformid, platform.name) + ", ".join(deviceids))
+        return os.linesep.join(out)
+
+    def get_platform(self, key):
+        """
+        Return a platform according
+
+        :param key: identifier for a platform, either an Id (int) or it's name
+        :type key: int or str
+        """
+        out = None
+        try:
+            platid = int(key)
+        except ValueError:
+            for a_plat in self.platforms:
+                if a_plat.name == key:
+                    out = a_plat
+        else:
+            if len(self.platforms) > platid > 0:
+                out = self.platforms[platid]
+        return out
+
+    def select_device(self, dtype="ALL", memory=None, extensions=None, best=True, **kwargs):
+        """
+        Select a device based on few parameters (at the end, keep the one with most memory)
+
+        :param dtype: "gpu" or "cpu" or "all" ....
+        :param memory: minimum amount of memory (int)
+        :param extensions: list of extensions to be present
+        :param best: shall we look for the
+        """
+        if extensions is None:
+            extensions = []
+        if "type" in kwargs:
+            dtype = kwargs["type"].upper()
+        else:
+            dtype = dtype.upper()
+        if len(dtype) > 3:
+            dtype = dtype[:3]
+        best_found = None
+        for platformid, platform in enumerate(self.platforms):
+            for deviceid, device in enumerate(platform.devices):
+                if (dtype in ["ALL", "DEF"]) or (device.type == dtype):
+                    if (memory is None) or (memory <= device.memory):
+                        found = True
+                        for ext in extensions:
+                            if ext not in device.extensions:
+                                found = False
+                        if found:
+                            if not best:
+                                return platformid, deviceid
+                            else:
+                                if not best_found:
+                                    best_found = platformid, deviceid, device.flops
+                                elif best_found[2] < device.flops:
+                                    best_found = platformid, deviceid, device.flops
+        if best_found:
+            return best_found[0], best_found[1]
+
+    def create_context(self, devicetype="ALL", useFp64=False, platformid=None,
+                       deviceid=None, cached=True):
+        """
+        Choose a device and initiate a context.
+
+        Devicetypes can be GPU,gpu,CPU,cpu,DEF,ACC,ALL.
+        Suggested are GPU,CPU.
+        For each setting to work there must be such an OpenCL device and properly installed.
+        E.g.: If Nvidia driver is installed, GPU will succeed but CPU will fail.
+              The AMD SDK kit is required for CPU via OpenCL.
+        :param devicetype: string in ["cpu","gpu", "all", "acc"]
+        :param useFp64: boolean specifying if double precision will be used
+        :param platformid: integer
+        :param deviceid: integer
+        :param cached: True if we want to cache the context
+        :return: OpenCL context on the selected device
+        """
+        if (platformid is not None) and (deviceid is not None):
+            platformid = int(platformid)
+            deviceid = int(deviceid)
+        else:
+            if useFp64:
+                ids = ocl.select_device(type=devicetype, extensions=["cl_khr_int64_base_atomics"])
+            else:
+                ids = ocl.select_device(dtype=devicetype)
+            if ids:
+                platformid = ids[0]
+                deviceid = ids[1]
+        if (platformid is not None) and (deviceid is not None):
+            if (platformid, deviceid) in self.context_cache:
+                ctx = self.context_cache[(platformid, deviceid)]
+            else:
+                ctx = pyopencl.Context(devices=[pyopencl.get_platforms()[platformid].get_devices()[deviceid]])
+                if cached:
+                    self.context_cache[(platformid, deviceid)] = ctx
+        else:
+            logger.warning("Last chance to get an OpenCL device ... probably not the one requested")
+            ctx = pyopencl.create_some_context(interactive=False)
+        return ctx
+
+    def device_from_context(self, context):
+        """
+        Retrieves the Device from the context
+
+        :param context: OpenCL context
+        :return: instance of Device
+        """
+        odevice = context.devices[0]
+        oplat = odevice.platform
+        device_id = oplat.get_devices().index(odevice)
+        platform_id = pyopencl.get_platforms().index(oplat)
+        return self.platforms[platform_id].devices[device_id]
+
+if pyopencl:
+    ocl = OpenCL()
+    if ocl.nb_devices == 0:
+        ocl = None
+else:
+    ocl = None
+
+
+def release_cl_buffers(cl_buffers):
+    """
+    :param cl_buffers: the buffer you want to release
+    :type cl_buffers: dict(str, pyopencl.Buffer)
+
+    This method release the memory of the buffers store in the dict
+    """
+    for key, buffer_ in cl_buffers.items():
+        if buffer_ is not None:
+            if isinstance(buffer_, pyopencl.array.Array):
+                try:
+                    buffer_.data.release()
+                except pyopencl.LogicError:
+                    logger.error("Error while freeing buffer %s", key)
+            else:
+                try:
+                    buffer_.release()
+                except pyopencl.LogicError:
+                    logger.error("Error while freeing buffer %s", key)
+            cl_buffers[key] = None
+    return cl_buffers
+
+
+def allocate_cl_buffers(buffers, device=None, context=None):
+    """
+    :param buffers: the buffers info use to create the pyopencl.Buffer
+    :type buffers: list(std, flag, numpy.dtype, int)
+    :param device: one of the context device
+    :param context: opencl contextdevice
+    :return: a dict containing the instanciated pyopencl.Buffer
+    :rtype: dict(str, pyopencl.Buffer)
+
+    This method instanciate the pyopencl.Buffer from the buffers
+    description.
+    """
+    mem = {}
+    if device is None:
+        device = ocl.device_from_context(context)
+
+    # check if enough memory is available on the device
+    ualloc = 0
+    for _, _, dtype, size in buffers:
+        ualloc += numpy.dtype(dtype).itemsize * size
+    memory = device.memory
+    logger.info("%.3fMB are needed on device which has %.3fMB",
+                ualloc / 1.0e6, memory / 1.0e6)
+    if ualloc >= memory:
+        memError = "Fatal error in allocate_buffers."
+        memError += "Not enough device memory for buffers"
+        memError += "(%lu requested, %lu available)" % (ualloc, memory)
+        raise MemoryError(memError)  # noqa
+
+    # do the allocation
+    try:
+        for name, flag, dtype, size in buffers:
+            mem[name] = pyopencl.Buffer(context, flag,
+                                        numpy.dtype(dtype).itemsize * size)
+    except pyopencl.MemoryError as error:
+        release_cl_buffers(mem)
+        raise MemoryError(error)
+
+    return mem
+
+
+def measure_workgroup_size(device):
+    """Measure the actual size of the workgroup
+
+    :param device: device or context or 2-tuple with indexes
+    :return: the actual measured workgroup size
+
+    if device is "all", returns a dict with all devices with their ids as keys.
+    """
+    if (ocl is None) or (device is None):
+        return None
+
+    if isinstance(device, tuple) and (len(device) == 2):
+        # this is probably a tuple (platformid, deviceid)
+        device = ocl.create_context(platformid=device[0], deviceid=device[1])
+
+    if device == "all":
+        res = {}
+        for pid, platform in enumerate(ocl.platforms):
+            for did, _devices in enumerate(platform.devices):
+                tup = (pid, did)
+                res[tup] = measure_workgroup_size(tup)
+    else:
+        res = _measure_workgroup_size(device)
+    return res
+
+
+def kernel_workgroup_size(program, kernel):
+    """Extract the compile time maximum workgroup size
+
+    :param program: OpenCL program
+    :param kernel: kernel or name of the kernel
+    :return: the maximum acceptable workgroup size for the given kernel
+    """
+    assert isinstance(program, pyopencl.Program)
+    if not isinstance(kernel, pyopencl.Kernel):
+        kernel_name = kernel
+        assert kernel in (k.function_name for k in program.all_kernels()), "the kernel exists"
+        kernel = program.__getattr__(kernel_name)
+
+    device = program.devices[0]
+    query_wg = pyopencl.kernel_work_group_info.WORK_GROUP_SIZE
+    return kernel.get_work_group_info(query_wg, device)
+
diff --git a/silx/opencl/medfilt.py b/silx/opencl/medfilt.py
new file mode 100644
index 0000000..90cd49a
--- /dev/null
+++ b/silx/opencl/medfilt.py
@@ -0,0 +1,269 @@
+# -*- coding: utf-8 -*-
+#
+#    Project: Azimuthal integration
+#             https://github.com/silx-kit/pyFAI
+#
+#    Copyright (C) 2012-2017 European Synchrotron Radiation Facility, Grenoble, France
+#
+#    Principal author:       Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
+#
+#  Permission is hereby granted, free of charge, to any person obtaining a copy
+#  of this software and associated documentation files (the "Software"), to deal
+#  in the Software without restriction, including without limitation the rights
+#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#  copies of the Software, and to permit persons to whom the Software is
+#  furnished to do so, subject to the following conditions:
+#  .
+#  The above copyright notice and this permission notice shall be included in
+#  all copies or substantial portions of the Software.
+#  .
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#  THE SOFTWARE.
+
+"""A module for performing the 1d, 2d and 3d median filter ...
+
+The target is to mimic the signature of scipy.signal.medfilt and scipy.medfilt2
+
+The first implementation targets 2D implementation where this operation is costly (~10s/2kx2k image)
+"""
+from __future__ import absolute_import, print_function, with_statement, division
+
+
+__author__ = "Jerome Kieffer"
+__license__ = "MIT"
+__date__ = "15/03/2017"
+__copyright__ = "2012-2017, ESRF, Grenoble"
+__contact__ = "jerome.kieffer@esrf.fr"
+
+import logging
+import numpy
+from collections import OrderedDict
+
+from .common import pyopencl, kernel_workgroup_size
+from .processing import EventDescription, OpenclProcessing, BufferDescription
+
+if pyopencl:
+    mf = pyopencl.mem_flags
+else:
+    raise ImportError("pyopencl is not installed")
+logger = logging.getLogger("silx.opencl.medfilt")
+
+
+class MedianFilter2D(OpenclProcessing):
+    """A class for doing median filtering using OpenCL"""
+    buffers = [
+               BufferDescription("result", 1, numpy.float32, mf.WRITE_ONLY),
+               BufferDescription("image_raw", 1, numpy.float32, mf.READ_ONLY),
+               BufferDescription("image", 1, numpy.float32, mf.READ_WRITE),
+               ]
+    kernel_files = ["preprocess.cl", "bitonic.cl", "medfilt.cl"]
+    mapping = {numpy.int8: "s8_to_float",
+               numpy.uint8: "u8_to_float",
+               numpy.int16: "s16_to_float",
+               numpy.uint16: "u16_to_float",
+               numpy.uint32: "u32_to_float",
+               numpy.int32: "s32_to_float"}
+
+    def __init__(self, shape, kernel_size=(3, 3),
+                 ctx=None, devicetype="all", platformid=None, deviceid=None,
+                 block_size=None, profile=False
+                 ):
+        """Constructor of the OpenCL 2D median filtering class
+
+        :param shape: shape of the images to treat
+        :param kernel size: 2-tuple of odd values
+        :param ctx: actual working context, left to None for automatic
+                    initialization from device type or platformid/deviceid
+        :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL"
+        :param platformid: integer with the platform_identifier, as given by clinfo
+        :param deviceid: Integer with the device identifier, as given by clinfo
+        :param block_size: preferred workgroup size, may vary depending on the outpcome of the compilation
+        :param profile: switch on profiling to be able to profile at the kernel level,
+                        store profiling elements (makes code slightly slower)
+        """
+        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
+                                  platformid=platformid, deviceid=deviceid,
+                                  block_size=block_size, profile=profile)
+        self.shape = shape
+        self.size = self.shape[0] * self.shape[1]
+        self.kernel_size = self.calc_kernel_size(kernel_size)
+        self.workgroup_size = (self.calc_wg(self.kernel_size), 1)  # 3D kernel
+        self.buffers = [BufferDescription(i.name, i.size * self.size, i.dtype, i.flags)
+                        for i in self.__class__.buffers]
+
+        self.allocate_buffers()
+        self.local_mem = self._get_local_mem(self.workgroup_size[0])
+        OpenclProcessing.compile_kernels(self, self.kernel_files, "-D NIMAGE=%i" % self.size)
+        self.set_kernel_arguments()
+
+    def set_kernel_arguments(self):
+        """Parametrize all kernel arguments
+        """
+        for val in self.mapping.values():
+            self.cl_kernel_args[val] = OrderedDict(((i, self.cl_mem[i]) for i in ("image_raw", "image")))
+        self.cl_kernel_args["medfilt2d"] = OrderedDict((("image", self.cl_mem["image"]),
+                                                        ("result", self.cl_mem["result"]),
+                                                        ("local", self.local_mem),
+                                                        ("khs1", numpy.int32(self.kernel_size[0] // 2)),  # Kernel half-size along dim1 (lines)
+                                                        ("khs2", numpy.int32(self.kernel_size[1] // 2)),  # Kernel half-size along dim2 (columns)
+                                                        ("height", numpy.int32(self.shape[0])),  # Image size along dim1 (lines)
+                                                        ("width", numpy.int32(self.shape[1]))))
+#                                                         ('debug', self.cl_mem["debug"])))  # Image size along dim2 (columns))
+
+    def _get_local_mem(self, wg):
+        return pyopencl.LocalMemory(wg * 32)  # 4byte per float, 8 element per thread
+
+    def send_buffer(self, data, dest):
+        """Send a numpy array to the device, including the cast on the device if possible
+
+        :param data: numpy array with data
+        :param dest: name of the buffer as registered in the class
+        """
+
+        dest_type = numpy.dtype([i.dtype for i in self.buffers if i.name == dest][0])
+        events = []
+        if (data.dtype == dest_type) or (data.dtype.itemsize > dest_type.itemsize):
+            copy_image = pyopencl.enqueue_copy(self.queue, self.cl_mem[dest], numpy.ascontiguousarray(data, dest_type))
+            events.append(EventDescription("copy H->D %s" % dest, copy_image))
+        else:
+            copy_image = pyopencl.enqueue_copy(self.queue, self.cl_mem["image_raw"], numpy.ascontiguousarray(data))
+            kernel = getattr(self.program, self.mapping[data.dtype.type])
+            cast_to_float = kernel(self.queue, (self.size,), None, self.cl_mem["image_raw"], self.cl_mem[dest])
+            events += [EventDescription("copy H->D %s" % dest, copy_image), EventDescription("cast to float", cast_to_float)]
+        if self.profile:
+            self.events += events
+
+    def calc_wg(self, kernel_size):
+        """calculate and return the optimal workgroup size for the first dimension, taking into account
+        the 8-height band
+
+        :param kernel_size: 2-tuple of int, shape of the median window
+        :return: optimal workgroup size
+        """
+        needed_threads = ((kernel_size[0] + 7) // 8) * kernel_size[1]
+        if needed_threads < 8:
+            wg = 8
+        elif needed_threads < 32:
+            wg = 32
+        else:
+            wg = 1 << (int(needed_threads).bit_length())
+        return wg
+
+    def medfilt2d(self, image, kernel_size=None):
+        """Actually apply the median filtering on the image
+
+        :param image: numpy array with the image
+        :param kernel_size: 2-tuple if
+        :return: median-filtered  2D image
+
+
+        Nota: for window size 1x1 -> 7x7     up to 49  /  64 elements in   8 threads, 8elt/th
+                              9x9 -> 15x15   up to 225 / 256 elements in  32 threads, 8elt/th
+                              17x17 -> 21x21 up to 441 / 512 elements in  64 threads, 8elt/th
+
+        TODO: change window size on the fly,
+
+
+        """
+        events = []
+        if kernel_size is None:
+            kernel_size = self.kernel_size
+        else:
+            kernel_size = self.calc_kernel_size(kernel_size)
+        kernel_half_size = kernel_size // numpy.int32(2)
+        # this is the workgroup size
+        wg = self.calc_wg(kernel_size)
+
+        # check for valid work group size:
+        amws = kernel_workgroup_size(self.program, "medfilt2d")
+        logger.warning("max actual workgroup size: %s, expected: %s", amws, wg)
+        if wg > amws:
+            raise RuntimeError("Workgroup size is too big for medfilt2d: %s>%s" % (wg, amws))
+
+        localmem = self._get_local_mem(wg)
+
+        assert image.ndim == 2, "Treat only 2D images"
+        assert image.shape[0] <= self.shape[0], "height is OK"
+        assert image.shape[1] <= self.shape[1], "width is OK"
+
+        with self.sem:
+            self.send_buffer(image, "image")
+
+            kwargs = self.cl_kernel_args["medfilt2d"]
+            kwargs["local"] = localmem
+            kwargs["khs1"] = kernel_half_size[0]
+            kwargs["khs2"] = kernel_half_size[1]
+            kwargs["height"] = numpy.int32(image.shape[0])
+            kwargs["width"] = numpy.int32(image.shape[1])
+#             for k, v in kwargs.items():
+#                 print("%s: %s (%s)" % (k, v, type(v)))
+            mf2d = self.program.medfilt2d(self.queue,
+                                          (wg, image.shape[1]),
+                                          (wg, 1), *list(kwargs.values()))
+            events.append(EventDescription("median filter 2d", mf2d))
+
+            result = numpy.empty(image.shape, numpy.float32)
+            ev = pyopencl.enqueue_copy(self.queue, result, self.cl_mem["result"])
+            events.append(EventDescription("copy D->H result", ev))
+            ev.wait()
+        if self.profile:
+            self.events += events
+        return result
+    __call__ = medfilt2d
+
+    @staticmethod
+    def calc_kernel_size(kernel_size):
+        """format the kernel size to be a 2-length numpy array of int32
+        """
+        kernel_size = numpy.asarray(kernel_size, dtype=numpy.int32)
+        if kernel_size.shape == ():
+            kernel_size = numpy.repeat(kernel_size.item(), 2).astype(numpy.int32)
+        for size in kernel_size:
+            if (size % 2) != 1:
+                raise ValueError("Each element of kernel_size should be odd.")
+        return kernel_size
+
+
+class _MedFilt2d(object):
+    median_filter = None
+
+    @classmethod
+    def medfilt2d(cls, ary, kernel_size=3):
+        """Median filter a 2-dimensional array.
+
+        Apply a median filter to the `input` array using a local window-size
+        given by `kernel_size` (must be odd).
+
+        :param ary: A 2-dimensional input array.
+        :param kernel_size: A scalar or a list of length 2, giving the size of the
+                            median filter window in each dimension.  Elements of
+                            `kernel_size` should be odd.  If `kernel_size` is a scalar,
+                            then this scalar is used as the size in each dimension.
+                            Default is a kernel of size (3, 3).
+        :return: An array the same size as input containing the median filtered
+                result. always work on float32 values
+
+        About the padding:
+
+        * The filling mode in scipy.signal.medfilt2d is zero-padding
+        * This implementation is equivalent to:
+            scipy.ndimage.filters.median_filter(ary, kernel_size, mode="nearest")
+
+        """
+        image = numpy.atleast_2d(ary)
+        shape = numpy.array(image.shape)
+        if cls.median_filter is None:
+            cls.median_filter = MedianFilter2D(image.shape, kernel_size)
+        elif (numpy.array(cls.median_filter.shape) < shape).any():
+            # enlarger the buffer size
+            new_shape = numpy.maximum(numpy.array(cls.median_filter.shape), shape)
+            ctx = cls.median_filter.ctx
+            cls.median_filter = MedianFilter2D(new_shape, kernel_size, ctx=ctx)
+        return cls.median_filter.medfilt2d(image)
+
+medfilt2d = _MedFilt2d.medfilt2d
diff --git a/silx/opencl/processing.py b/silx/opencl/processing.py
new file mode 100644
index 0000000..ca46701
--- /dev/null
+++ b/silx/opencl/processing.py
@@ -0,0 +1,275 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+#    Project: S I L X project
+#             https://github.com/silx-kit/silx
+#
+#    Copyright (C) European Synchrotron Radiation Facility, Grenoble, France
+#
+#    Principal author:       Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+"""
+Common OpenCL abstract base classes for different processing
+"""
+
+from __future__ import absolute_import, print_function, division
+
+
+__author__ = "Jerome Kieffer"
+__contact__ = "Jerome.Kieffer@ESRF.eu"
+__license__ = "MIT"
+__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
+__date__ = "03/02/2017"
+__status__ = "stable"
+
+
+import os
+import logging
+import gc
+from collections import namedtuple
+import numpy
+import threading
+from .common import ocl, pyopencl, release_cl_buffers
+from .utils import concatenate_cl_kernel
+
+
+BufferDescription = namedtuple("BufferDescription", ["name", "size", "dtype", "flags"])
+EventDescription = namedtuple("EventDescription", ["name", "event"])
+
+logger = logging.getLogger("pyFAI.opencl.processing")
+
+
+class OpenclProcessing(object):
+    """Abstract class for different types of OpenCL processing.
+
+    This class provides:
+    * Generation of the context, queues, profiling mode
+    * Additional function to allocate/free all buffers declared as static attributes of the class
+    * Functions to compile kernels, cache them and clean them
+    * helper functions to clone the object
+    """
+    # Example of how to create an output buffer of 10 floats
+    buffers = [BufferDescription("output", 10, numpy.float32, None),
+               ]
+    # list of kernel source files to be concatenated before compilation of the program
+    kernel_files = []
+
+    def __init__(self, ctx=None, devicetype="all", platformid=None, deviceid=None,
+                 block_size=None, profile=False):
+        """Constructor of the abstract OpenCL processing class
+
+        :param ctx: actual working context, left to None for automatic
+                    initialization from device type or platformid/deviceid
+        :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL"
+        :param platformid: integer with the platform_identifier, as given by clinfo
+        :param deviceid: Integer with the device identifier, as given by clinfo
+        :param block_size: preferred workgroup size, may vary depending on the outpcome of the compilation
+        :param profile: switch on profiling to be able to profile at the kernel level,
+                        store profiling elements (makes code slightly slower)
+        """
+        self.sem = threading.Semaphore()
+        self.profile = None
+        self.events = []  # List with of EventDescription, kept for profiling
+        self.cl_mem = {}  # dict with all buffer allocated
+        self.cl_program = None  # The actual OpenCL program
+        self.cl_kernel_args = {}  # dict with all kernel arguments
+        if ctx:
+            self.ctx = ctx
+        else:
+            self.ctx = ocl.create_context(devicetype=devicetype, platformid=platformid, deviceid=deviceid)
+        device_name = self.ctx.devices[0].name.strip()
+        platform_name = self.ctx.devices[0].platform.name.strip()
+        platform = ocl.get_platform(platform_name)
+        self.device = platform.get_device(device_name)
+        self.cl_kernel_args = {}  # dict with all kernel arguments
+
+        self.set_profiling(profile)
+        self.block_size = block_size
+        self.program = None
+
+    def __del__(self):
+        """Destructor: release all buffers and programs
+        """
+        self.free_kernels()
+        self.free_buffers()
+        self.queue = None
+        self.ctx = None
+        gc.collect()
+
+    def allocate_buffers(self, buffers=None):
+        """
+        Allocate OpenCL buffers required for a specific configuration
+
+        :param buffers: a list of BufferDescriptions, leave to None for
+                        paramatrized buffers.
+
+        Note that an OpenCL context also requires some memory, as well
+        as Event and other OpenCL functionalities which cannot and are
+        not taken into account here.  The memory required by a context
+        varies depending on the device. Typical for GTX580 is 65Mb but
+        for a 9300m is ~15Mb In addition, a GPU will always have at
+        least 3-5Mb of memory in use.  Unfortunately, OpenCL does NOT
+        have a built-in way to check the actual free memory on a
+        device, only the total memory.
+        """
+
+        if buffers is None:
+            buffers = self.buffers
+
+        with self.sem:
+            mem = {}
+
+            # check if enough memory is available on the device
+            ualloc = 0
+            for buf in buffers:
+                ualloc += numpy.dtype(buf.dtype).itemsize * buf.size
+            logger.info("%.3fMB are needed on device which has %.3fMB",
+                        ualloc / 1.0e6, self.device.memory / 1.0e6)
+
+            if ualloc >= self.device.memory:
+                raise MemoryError("Fatal error in allocate_buffers. Not enough "
+                                  " device memory for buffers (%lu requested, %lu available)"
+                                  % (ualloc, self.device.memory))
+
+            # do the allocation
+            try:
+                for buf in buffers:
+                    size = numpy.dtype(buf.dtype).itemsize * buf.size
+                    mem[buf.name] = pyopencl.Buffer(self.ctx, buf.flags, size)
+            except pyopencl.MemoryError as error:
+                release_cl_buffers(mem)
+                raise MemoryError(error)
+
+        self.cl_mem.update(mem)
+
+    def free_buffers(self):
+        """free all device.memory allocated on the device
+        """
+        with self.sem:
+            for key, buf in list(self.cl_mem.items()):
+                if buf is not None:
+                    if isinstance(buf, pyopencl.array.Array):
+                        try:
+                            buf.data.release()
+                        except pyopencl.LogicError:
+                            logger.error("Error while freeing buffer %s", key)
+                    else:
+                        try:
+                            buf.release()
+                        except pyopencl.LogicError:
+                            logger.error("Error while freeing buffer %s", key)
+                    self.cl_mem[key] = None
+
+    def compile_kernels(self, kernel_files=None, compile_options=None):
+        """Call the OpenCL compiler
+
+        :param kernel_files: list of path to the kernel
+            (by default use the one declared in the class)
+        :param compile_options: string of compile options
+        """
+        # concatenate all needed source files into a single openCL module
+        kernel_files = kernel_files or self.kernel_files
+        kernel_src = concatenate_cl_kernel(kernel_files)
+
+        compile_options = compile_options or ""
+        logger.info("Compiling file %s with options %s", kernel_files, compile_options)
+        try:
+            self.program = pyopencl.Program(self.ctx, kernel_src).build(options=compile_options)
+        except (pyopencl.MemoryError, pyopencl.LogicError) as error:
+            raise MemoryError(error)
+
+    def free_kernels(self):
+        """Free all kernels
+        """
+        for kernel in self.cl_kernel_args:
+            self.cl_kernel_args[kernel] = []
+        self.program = None
+
+    def set_profiling(self, value=True):
+        """Switch On/Off the profiling flag of the command queue to allow debugging
+
+        :param value: set to True to enable profiling, or to False to disable it.
+                      Without profiling, the processing is marginally faster
+
+        Profiling information can then be retrieved with the 'log_profile' method
+        """
+        if bool(value) != self.profile:
+            with self.sem:
+                self.profile = bool(value)
+                if self.profile:
+                    self.queue = pyopencl.CommandQueue(self.ctx,
+                        properties=pyopencl.command_queue_properties.PROFILING_ENABLE)
+                else:
+                    self.queue = pyopencl.CommandQueue(self.ctx)
+
+    def log_profile(self):
+        """If we are in profiling mode, prints out all timing for every single OpenCL call
+        """
+        t = 0.0
+        out = ["", "Profiling info for OpenCL %s" % self.__class__.__name__]
+        if self.profile:
+            for e in self.events:
+                if "__len__" in dir(e) and len(e) >= 2:
+                    et = 1e-6 * (e[1].profile.end - e[1].profile.start)
+                    out.append("%50s:\t%.3fms" % (e[0], et))
+                    t += et
+
+        out.append("_" * 80)
+        out.append("%50s:\t%.3fms" % ("Total execution time", t))
+        logger.info(os.linesep.join(out))
+        return out
+
+# This should be implemented by concrete class
+#     def __copy__(self):
+#         """Shallow copy of the object
+#
+#         :return: copy of the object
+#         """
+#         return self.__class__((self._data, self._indices, self._indptr),
+#                               self.size, block_size=self.BLOCK_SIZE,
+#                               platformid=self.platform.id,
+#                               deviceid=self.device.id,
+#                               checksum=self.on_device.get("data"),
+#                               profile=self.profile, empty=self.empty)
+#
+#     def __deepcopy__(self, memo=None):
+#         """deep copy of the object
+#
+#         :return: deepcopy of the object
+#         """
+#         if memo is None:
+#             memo = {}
+#         new_csr = self._data.copy(), self._indices.copy(), self._indptr.copy()
+#         memo[id(self._data)] = new_csr[0]
+#         memo[id(self._indices)] = new_csr[1]
+#         memo[id(self._indptr)] = new_csr[2]
+#         new_obj = self.__class__(new_csr, self.size,
+#                                  block_size=self.BLOCK_SIZE,
+#                                  platformid=self.platform.id,
+#                                  deviceid=self.device.id,
+#                                  checksum=self.on_device.get("data"),
+#                                  profile=self.profile, empty=self.empty)
+#         memo[id(self)] = new_obj
+#         return new_obj
diff --git a/silx/opencl/setup.py b/silx/opencl/setup.py
new file mode 100644
index 0000000..d0181cd
--- /dev/null
+++ b/silx/opencl/setup.py
@@ -0,0 +1,44 @@
+# coding: utf-8
+#
+# Copyright (C) 2016 European Synchrotron Radiation Facility
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+
+from __future__ import division
+
+__contact__ = "jerome.kieffer@esrf.eu"
+__license__ = "MIT"
+__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
+__authors__ = ["J. Kieffer"]
+__date__ = "08/09/2016"
+
+from numpy.distutils.misc_util import Configuration
+
+
+def configuration(parent_package='', top_path=None):
+    config = Configuration('opencl', parent_package, top_path)
+    config.add_subpackage('sift')
+    config.add_subpackage('test')
+    return config
+
+
+if __name__ == "__main__":
+    from numpy.distutils.core import setup
+    setup(configuration=configuration)
diff --git a/silx/opencl/test/__init__.py b/silx/opencl/test/__init__.py
new file mode 100644
index 0000000..24aa06e
--- /dev/null
+++ b/silx/opencl/test/__init__.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+#
+#    Project: silx
+#             https://github.com/silx-kit/silx
+#
+#    Copyright (C) 2012-2016  European Synchrotron Radiation Facility, Grenoble, France
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+__authors__ = ["J. Kieffer"]
+__license__ = "MIT"
+__date__ = "15/03/2017"
+
+import unittest
+from . import test_addition
+from . import test_medfilt
+from ..sift import test as test_sift
+
+
+def suite():
+    test_suite = unittest.TestSuite()
+    test_suite.addTests(test_addition.suite())
+    test_suite.addTests(test_medfilt.suite())
+    test_suite.addTests(test_sift.suite())
+
+    return test_suite
diff --git a/silx/opencl/test/test_addition.py b/silx/opencl/test/test_addition.py
new file mode 100644
index 0000000..89e49be
--- /dev/null
+++ b/silx/opencl/test/test_addition.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+#    Project: Sift implementation in Python + OpenCL
+#             https://github.com/silx-kit/silx
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+"""
+Simple test of an addition
+"""
+
+from __future__ import division, print_function
+
+__authors__ = ["Henri Payno, Jérôme Kieffer"]
+__contact__ = "jerome.kieffer@esrf.eu"
+__license__ = "MIT"
+__copyright__ = "2013 European Synchrotron Radiation Facility, Grenoble, France"
+__date__ = "15/03/2017"
+
+import logging
+import numpy
+
+import unittest
+from ..common import ocl, _measure_workgroup_size
+if ocl:
+    import pyopencl
+    import pyopencl.array
+from ..utils import get_opencl_code
+logger = logging.getLogger(__name__)
+
+
+@unittest.skipUnless(ocl, "PyOpenCl is missing")
+class TestAddition(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        super(TestAddition, cls).setUpClass()
+        if ocl:
+            cls.ctx = ocl.create_context()
+            if logger.getEffectiveLevel() <= logging.INFO:
+                cls.PROFILE = True
+                cls.queue = pyopencl.CommandQueue(
+                                cls.ctx,
+                                properties=pyopencl.command_queue_properties.PROFILING_ENABLE)
+            else:
+                cls.PROFILE = False
+                cls.queue = pyopencl.CommandQueue(cls.ctx)
+            cls.max_valid_wg = 0
+
+    @classmethod
+    def tearDownClass(cls):
+        super(TestAddition, cls).tearDownClass()
+        print("Maximum valid workgroup size %s on device %s" % (cls.max_valid_wg, cls.ctx.devices[0]))
+        cls.ctx = None
+        cls.queue = None
+
+    def setUp(self):
+        if ocl is None:
+            return
+        self.shape = 4096
+        self.data = numpy.random.random(self.shape).astype(numpy.float32)
+        self.d_array_img = pyopencl.array.to_device(self.queue, self.data)
+        self.d_array_5 = pyopencl.array.zeros_like(self.d_array_img) - 5
+        self.program = pyopencl.Program(self.ctx, get_opencl_code("addition")).build()
+
+    def tearDown(self):
+        self.img = self.data = None
+        self.d_array_img = self.d_array_5 = self.program = None
+
+    @unittest.skipUnless(ocl, "pyopencl is missing")
+    def test_add(self):
+        """
+        tests the addition  kernel
+        """
+        maxi = int(round(numpy.log2(self.shape)))
+        for i in range(maxi):
+            d_array_result = pyopencl.array.empty_like(self.d_array_img)
+            wg = 1 << i
+            try:
+                evt = self.program.addition(self.queue, (self.shape,), (wg,),
+                       self.d_array_img.data, self.d_array_5.data, d_array_result.data, numpy.int32(self.shape))
+                evt.wait()
+            except Exception as error:
+                max_valid_wg = self.program.addition.get_work_group_info(pyopencl.kernel_work_group_info.WORK_GROUP_SIZE, self.ctx.devices[0])
+                msg = "Error %s on WG=%s: %s" % (error, wg, max_valid_wg)
+                self.assertLess(max_valid_wg, wg, msg)
+                break
+            else:
+                res = d_array_result.get()
+                good = numpy.allclose(res, self.data - 5)
+                if good and wg>self.max_valid_wg:
+                    self.__class__.max_valid_wg = wg
+                self.assert_(good, "calculation is correct for WG=%s" % wg)
+
+    @unittest.skipUnless(ocl, "pyopencl is missing")
+    def test_measurement(self):
+        """
+        tests that all devices are working properly ...
+        """
+        for platform in ocl.platforms:
+            for did, device in enumerate(platform.devices):
+                meas = _measure_workgroup_size((platform.id, device.id))
+                self.assertEqual(meas, device.max_work_group_size,
+                                 "Workgroup size for %s/%s: %s == %s" % (platform, device, meas, device.max_work_group_size))
+
+
+def suite():
+    testSuite = unittest.TestSuite()
+    testSuite.addTest(TestAddition("test_add"))
+    testSuite.addTest(TestAddition("test_measurement"))
+    return testSuite
+
+
+if __name__ == '__main__':
+    unittest.main(defaultTest="suite")
diff --git a/silx/opencl/test/test_medfilt.py b/silx/opencl/test/test_medfilt.py
new file mode 100644
index 0000000..f4e4cc8
--- /dev/null
+++ b/silx/opencl/test/test_medfilt.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+#    Project: Median filter of images + OpenCL
+#             https://github.com/silx-kit/silx
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+"""
+Simple test of the median filter
+"""
+
+from __future__ import division, print_function
+
+__authors__ = ["Jérôme Kieffer"]
+__contact__ = "jerome.kieffer@esrf.eu"
+__license__ = "MIT"
+__copyright__ = "2013-2017 European Synchrotron Radiation Facility, Grenoble, France"
+__date__ = "15/03/2017"
+
+
+import sys
+import time
+import logging
+import numpy
+import unittest
+from collections import namedtuple
+try:
+    import mako
+except ImportError:
+    mako = None
+from ..common import ocl
+if ocl:
+    import pyopencl
+    import pyopencl.array
+    from .. import medfilt
+
+logger = logging.getLogger(__name__)
+
+Result = namedtuple("Result", ["size", "error", "sp_time", "oc_time"])
+
+try:
+    from scipy.misc import ascent
+except:
+    def ascent():
+        """Dummy image from random data"""
+        return numpy.random.random((512, 512))
+try:
+    from scipy.ndimage import filters
+    median_filter = filters.median_filter
+    HAS_SCIPY = True
+except:
+    HAS_SCIPY = False
+    from silx.math import medfilt2d as median_filter
+
+@unittest.skipUnless(ocl and mako, "PyOpenCl is missing")
+class TestMedianFilter(unittest.TestCase):
+
+    def setUp(self):
+        if ocl is None:
+            return
+        self.data = ascent().astype(numpy.float32)
+        self.medianfilter = medfilt.MedianFilter2D(self.data.shape, devicetype="gpu")
+
+    def tearDown(self):
+        self.data = None
+        self.medianfilter = None
+
+    def measure(self, size):
+        "Common measurement of accuracy and timings"
+        t0 = time.time()
+        if HAS_SCIPY:
+            ref = median_filter(self.data, size, mode="nearest")
+        else:
+            ref = median_filter(self.data, size)
+        t1 = time.time()
+        try:
+            got = self.medianfilter.medfilt2d(self.data, size)
+        except RuntimeError as msg:
+            logger.error(msg)
+            return
+        t2 = time.time()
+        delta = abs(got - ref).max()
+        return Result(size, delta, t1 - t0, t2 - t1)
+
+    @unittest.skipUnless(ocl and mako, "pyopencl is missing")
+    def test_medfilt(self):
+        """
+        tests the median filter kernel
+        """
+        r = self.measure(size=11)
+        if r is None:
+            logger.info("test_medfilt: size: %s: skipped")
+        else:
+            logger.info("test_medfilt: size: %s error %s, t_ref: %.3fs, t_ocl: %.3fs" % r)
+            self.assert_(r.error == 0, 'Results are correct')
+
+    def benchmark(self, limit=36):
+        "Run some benchmarking"
+        try:
+            import PyQt5
+            from ...gui.matplotlib import pylab
+            from ...gui.utils import update_fig
+        except:
+            pylab = None
+
+            def update_fig(*ag, **kwarg):
+                pass
+
+        fig = pylab.figure()
+        fig.suptitle("Median filter of an image 512x512")
+        sp = fig.add_subplot(1, 1, 1)
+        sp.set_title(self.medianfilter.ctx.devices[0].name)
+        sp.set_xlabel("Window width & height")
+        sp.set_ylabel("Execution time (s)")
+        sp.set_xlim(2, limit + 1)
+        sp.set_ylim(0, 4)
+        data_size = []
+        data_scipy = []
+        data_opencl = []
+        plot_sp = sp.plot(data_size, data_scipy, "-or", label="scipy")[0]
+        plot_opencl = sp.plot(data_size, data_opencl, "-ob", label="opencl")[0]
+        sp.legend(loc=2)
+        fig.show()
+        update_fig(fig)
+        for s in range(3, limit, 2):
+            r = self.measure(s)
+            print(r)
+            if r.error == 0:
+                data_size.append(s)
+                data_scipy.append(r.sp_time)
+                data_opencl.append(r.oc_time)
+                plot_sp.set_data(data_size, data_scipy)
+                plot_opencl.set_data(data_size, data_opencl)
+                update_fig(fig)
+        fig.show()
+        if sys.version_info[0] < 3:
+            raw_input()
+        else:
+            input()
+
+
+def suite():
+    testSuite = unittest.TestSuite()
+    testSuite.addTest(TestMedianFilter("test_medfilt"))
+    return testSuite
+
+
+def benchmark():
+    testSuite = unittest.TestSuite()
+    testSuite.addTest(TestMedianFilter("benchmark"))
+    return testSuite
+
+
+if __name__ == '__main__':
+    unittest.main(defaultTest="suite")
diff --git a/silx/opencl/utils.py b/silx/opencl/utils.py
new file mode 100644
index 0000000..58becb0
--- /dev/null
+++ b/silx/opencl/utils.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+#    Project: Sift implementation in Python + OpenCL
+#             https://github.com/silx-kit/silx
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+
+from __future__ import division
+
+__authors__ = ["Jérôme Kieffer", "Pierre Paleo"]
+__contact__ = "jerome.kieffer@esrf.eu"
+__license__ = "MIT"
+__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
+__date__ = "15/03/2017"
+__status__ = "Production" 
+
+import os
+import numpy
+from ..resources import resource_filename
+from math import log, ceil
+
+
+def calc_size(shape, blocksize):
+    """
+    Calculate the optimal size for a kernel according to the workgroup size
+    """
+    if "__len__" in dir(blocksize):
+        return tuple((int(i) + int(j) - 1) & ~(int(j) - 1) for i, j in zip(shape, blocksize))
+    else:
+        return tuple((int(i) + int(blocksize) - 1) & ~(int(blocksize) - 1) for i in shape)
+
+
+def nextpower(n):
+    """Calculate the power of two
+
+    :param n: an integer, for example 100
+    :return: another integer, 100-> 128
+    """
+    return 1 << int(ceil(log(n, 2)))
+
+
+def sizeof(shape, dtype="uint8"):
+    """
+    Calculate the number of bytes needed to allocate for a given structure
+
+    :param shape: size or tuple of sizes
+    :param dtype: data type
+    """
+    itemsize = numpy.dtype(dtype).itemsize
+    cnt = 1
+    if "__len__" in dir(shape):
+        for dim in shape:
+            cnt *= dim
+    else:
+        cnt = int(shape)
+    return cnt * itemsize
+
+
+def get_cl_file(filename):
+    """get the full path of a openCL file
+
+    :return: the full path of the openCL source file
+    """
+    if not filename.endswith(".cl"):
+        filename += ".cl"
+    return resource_filename(os.path.join("opencl", filename))
+
+
+def read_cl_file(filename):
+    """
+    :param filename: read an OpenCL file and apply a preprocessor
+    :return: preprocessed source code
+    """
+    with open(get_cl_file(filename), "r") as f:
+        # Dummy preprocessor which removes the #include
+        lines = [i for i in f.readlines() if not i.startswith("#include ")]
+    return "".join(lines)
+
+get_opencl_code = read_cl_file
+
+
+def concatenate_cl_kernel(filenames):
+    """Concatenates all the kernel from the list of files
+
+    :param filenames: filenames containing the kernels
+    :type filenames: list of str which can be filename of kernel as a string.
+    :return: a string with all kernels concatenated
+
+    this method concatenates all the kernel from the list
+    """
+    return  os.linesep.join(read_cl_file(fn) for fn in filenames)