16 files changed, 1471 insertions, 82 deletions
diff --git a/silx/opencl/backprojection.py b/silx/opencl/backprojection.py
index 15a03b9..ca7244e 100644
--- a/silx/opencl/backprojection.py
+++ b/silx/opencl/backprojection.py
@@ -29,7 +29,7 @@ from __future__ import absolute_import, print_function, with_statement, division
 
 __authors__ = ["A. Mirone, P. Paleo"]
 __license__ = "MIT"
-__date__ = "05/10/2017"
+__date__ = "19/01/2018"
 
 import logging
 import numpy
@@ -196,7 +196,7 @@ class Backprojection(OpenclProcessing):
         self.local_mem = 256 * 3 * _sizeof(numpy.float32)  # constant for all image sizes
         OpenclProcessing.compile_kernels(self, self.kernel_files)
         # check that workgroup can actually be (16, 16)
-        self.check_workgroup_size("backproj_cpu_kernel")
+        self.compiletime_workgroup_size = self.kernels.max_workgroup_size("backproj_cpu_kernel")
         # Workgroup and ndrange sizes are always the same
         self.wg = (16, 16)
         self.ndrange = (
diff --git a/silx/opencl/codec/__init__.py b/silx/opencl/codec/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/silx/opencl/codec/__init__.py
diff --git a/silx/opencl/codec/byte_offset.py b/silx/opencl/codec/byte_offset.py
new file mode 100644
index 0000000..565b0c5
--- /dev/null
+++ b/silx/opencl/codec/byte_offset.py
@@ -0,0 +1,439 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+#    Project: Sift implementation in Python + OpenCL
+#             https://github.com/silx-kit/silx
+#
+#    Copyright (C) 2013-2018  European Synchrotron Radiation Facility, Grenoble, France
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+"""
+This module provides a class for CBF byte offset compression/decompression.
+"""
+
+from __future__ import division, print_function, with_statement
+
+__authors__ = ["Jérôme Kieffer"]
+__contact__ = "jerome.kieffer@esrf.eu"
+__license__ = "MIT"
+__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
+__date__ = "24/10/2017"
+__status__ = "production"
+
+
+import functools
+import os
+import numpy
+from ..common import ocl, pyopencl
+from ..processing import BufferDescription, EventDescription, OpenclProcessing
+
+import logging
+logger = logging.getLogger(__name__)
+
+if pyopencl:
+    import pyopencl.version
+    if pyopencl.version.VERSION < (2016, 0):
+        from pyopencl.scan import GenericScanKernel, GenericDebugScanKernel
+    else:
+        from pyopencl.algorithm import GenericScanKernel
+        from pyopencl.scan import GenericDebugScanKernel
+else:
+    logger.warning("No PyOpenCL, no byte-offset, please see fabio")
+
+
+class ByteOffset(OpenclProcessing):
+    """Perform the byte offset compression/decompression on the GPU
+
+        See :class:`OpenclProcessing` for optional arguments description.
+
+        :param int raw_size:
+            Size of the raw stream for decompression.
+            It can be (slightly) larger than the array.
+        :param int dec_size:
+            Size of the decompression output array
+            (mandatory for decompression)
+        """
+
+    def __init__(self, raw_size=None, dec_size=None,
+                 ctx=None, devicetype="all",
+                 platformid=None, deviceid=None,
+                 block_size=None, profile=False):
+        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
+                                  platformid=platformid, deviceid=deviceid,
+                                  block_size=block_size, profile=profile)
+        if self.block_size is None:
+            self.block_size = self.device.max_work_group_size
+        wg = self.block_size
+
+        buffers = [BufferDescription("counter", 1, numpy.int32, None)]
+
+        if raw_size is None:
+            self.raw_size = -1
+            self.padded_raw_size = -1
+        else:
+            self.raw_size = int(raw_size)
+            self.padded_raw_size = int((self.raw_size + wg - 1) & ~(wg - 1))
+            buffers += [
+                BufferDescription("raw", self.padded_raw_size, numpy.int8, None),
+                BufferDescription("mask", self.padded_raw_size, numpy.int32, None),
+                BufferDescription("values", self.padded_raw_size, numpy.int32, None),
+                BufferDescription("exceptions", self.padded_raw_size, numpy.int32, None)
+            ]
+
+        if dec_size is None:
+            self.dec_size = None
+        else:
+            self.dec_size = numpy.int32(dec_size)
+            buffers += [
+                BufferDescription("data_float", self.dec_size, numpy.float32, None),
+                BufferDescription("data_int", self.dec_size, numpy.int32, None)
+            ]
+
+        self.allocate_buffers(buffers, use_array=True)
+
+        self.compile_kernels([os.path.join("codec", "byte_offset")])
+        self.kernels.__setattr__("scan", self._init_double_scan())
+        self.kernels.__setattr__("compression_scan",
+                                 self._init_compression_scan())
+
+    def _init_double_scan(self):
+        """"generates a double scan on indexes and values in one operation"""
+        arguments = "__global int *value", "__global int *index"
+        int2 = pyopencl.tools.get_or_register_dtype("int2")
+        input_expr = "index[i]>0 ? (int2)(0, 0) : (int2)(value[i], 1)"
+        scan_expr = "a+b"
+        neutral = "(int2)(0,0)"
+        output_statement = "value[i] = item.s0; index[i+1] = item.s1;"
+
+        if self.block_size > 256:
+            knl = GenericScanKernel(self.ctx,
+                                    dtype=int2,
+                                    arguments=arguments,
+                                    input_expr=input_expr,
+                                    scan_expr=scan_expr,
+                                    neutral=neutral,
+                                    output_statement=output_statement)
+        else:  # MacOS on CPU
+            knl = GenericDebugScanKernel(self.ctx,
+                                         dtype=int2,
+                                         arguments=arguments,
+                                         input_expr=input_expr,
+                                         scan_expr=scan_expr,
+                                         neutral=neutral,
+                                         output_statement=output_statement)
+        return knl
+
+    def decode(self, raw, as_float=False, out=None):
+        """This function actually performs the decompression by calling the kernels
+
+        :param numpy.ndarray raw: The compressed data as a 1D numpy array of char.
+        :param bool as_float: True to decompress as float32,
+                              False (default) to decompress as int32
+        :param pyopencl.array out: pyopencl array in which to place the result.
+        :return: The decompressed image as an pyopencl array.
+        :rtype: pyopencl.array
+        """
+        assert self.dec_size is not None, \
+            "dec_size is a mandatory ByteOffset init argument for decompression"
+
+        events = []
+        with self.sem:
+            len_raw = numpy.int32(len(raw))
+            if len_raw > self.padded_raw_size:
+                wg = self.block_size
+                self.raw_size = int(len(raw))
+                self.padded_raw_size = (self.raw_size + wg - 1) & ~(wg - 1)
+                logger.info("increase raw buffer size to %s", self.padded_raw_size)
+                buffers = {
+                           "raw": pyopencl.array.empty(self.queue, self.padded_raw_size, dtype=numpy.int8),
+                           "mask": pyopencl.array.empty(self.queue, self.padded_raw_size, dtype=numpy.int32),
+                           "exceptions": pyopencl.array.empty(self.queue, self.padded_raw_size, dtype=numpy.int32),
+                           "values": pyopencl.array.empty(self.queue, self.padded_raw_size, dtype=numpy.int32),
+                          }
+                self.cl_mem.update(buffers)
+            else:
+                wg = self.block_size
+
+            evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["raw"].data,
+                                        raw,
+                                        is_blocking=False)
+            events.append(EventDescription("copy raw H -> D", evt))
+            evt = self.kernels.fill_int_mem(self.queue, (self.padded_raw_size,), (wg,),
+                                            self.cl_mem["mask"].data,
+                                            numpy.int32(self.padded_raw_size),
+                                            numpy.int32(0),
+                                            numpy.int32(0))
+            events.append(EventDescription("memset mask", evt))
+            evt = self.kernels.fill_int_mem(self.queue, (1,), (1,),
+                                            self.cl_mem["counter"].data,
+                                            numpy.int32(1),
+                                            numpy.int32(0),
+                                            numpy.int32(0))
+            events.append(EventDescription("memset counter", evt))
+            evt = self.kernels.mark_exceptions(self.queue, (self.padded_raw_size,), (wg,),
+                                               self.cl_mem["raw"].data,
+                                               len_raw,
+                                               numpy.int32(self.raw_size),
+                                               self.cl_mem["mask"].data,
+                                               self.cl_mem["values"].data,
+                                               self.cl_mem["counter"].data,
+                                               self.cl_mem["exceptions"].data)
+            events.append(EventDescription("mark exceptions", evt))
+            nb_exceptions = numpy.empty(1, dtype=numpy.int32)
+            evt = pyopencl.enqueue_copy(self.queue, nb_exceptions, self.cl_mem["counter"].data,
+                                        is_blocking=False)
+            events.append(EventDescription("copy counter D -> H", evt))
+            evt.wait()
+            nbexc = int(nb_exceptions[0])
+            if nbexc == 0:
+                logger.info("nbexc %i", nbexc)
+            else:
+                evt = self.kernels.treat_exceptions(self.queue, (nbexc,), (1,),
+                                                    self.cl_mem["raw"].data,
+                                                    len_raw,
+                                                    self.cl_mem["mask"].data,
+                                                    self.cl_mem["exceptions"].data,
+                                                    self.cl_mem["values"].data
+                                                    )
+                events.append(EventDescription("treat_exceptions", evt))
+
+            #self.cl_mem["copy_values"] = self.cl_mem["values"].copy()
+            #self.cl_mem["copy_mask"] = self.cl_mem["mask"].copy()
+            evt = self.kernels.scan(self.cl_mem["values"],
+                                    self.cl_mem["mask"],
+                                    queue=self.queue,
+                                    size=int(len_raw),
+                                    wait_for=(evt,))
+            events.append(EventDescription("double scan", evt))
+            #evt.wait()
+            if out is not None:
+                if out.dtype == numpy.float32:
+                    copy_results = self.kernels.copy_result_float
+                else:
+                    copy_results = self.kernels.copy_result_int
+            else:
+                if as_float:
+                    out = self.cl_mem["data_float"]
+                    copy_results = self.kernels.copy_result_float
+                else:
+                    out = self.cl_mem["data_int"]
+                    copy_results = self.kernels.copy_result_int
+            evt = copy_results(self.queue, (self.padded_raw_size,), (wg,),
+                               self.cl_mem["values"].data,
+                               self.cl_mem["mask"].data,
+                               len_raw,
+                               self.dec_size,
+                               out.data
+                               )
+            events.append(EventDescription("copy_results", evt))
+            #evt.wait()
+            if self.profile:
+                self.events += events
+        return out
+
+    __call__ = decode
+
+    def _init_compression_scan(self):
+        """Initialize CBF compression scan kernels"""
+        preamble = """
+        int compressed_size(int diff) {
+            int abs_diff = abs(diff);
+
+            if (abs_diff < 128) {
+                return 1;
+            }
+            else if (abs_diff < 32768) {
+                return 3;
+            }
+            else {
+                return 7;
+            }
+        }
+
+        void write(const int index,
+                   const int diff,
+                   global char *output) {
+            int abs_diff = abs(diff);
+
+            if (abs_diff < 128) {
+                output[index] = (char) diff;
+            }
+            else if (abs_diff < 32768) {
+                output[index] = -128;
+                output[index + 1] = (char) (diff >> 0);
+                output[index + 2] = (char) (diff >> 8);
+            }
+            else {
+                output[index] = -128;
+                output[index + 1] = 0;
+                output[index + 2] = -128;
+                output[index + 3] = (char) (diff >> 0);
+                output[index + 4] = (char) (diff >> 8);
+                output[index + 5] = (char) (diff >> 16);
+                output[index + 6] = (char) (diff >> 24);
+            }
+        }
+        """
+        arguments = "__global const int *data, __global char *compressed, __global int *size"
+        input_expr = "compressed_size((i == 0) ? data[0] : (data[i] - data[i - 1]))"
+        scan_expr = "a+b"
+        neutral = "0"
+        output_statement = """
+        if (prev_item == 0) { // 1st thread store compressed data size
+            size[0] = last_item;
+        }
+        write(prev_item, (i == 0) ? data[0] : (data[i] - data[i - 1]), compressed);
+        """
+
+        if self.block_size >= 64:
+            knl = GenericScanKernel(self.ctx,
+                                    dtype=numpy.int32,
+                                    preamble=preamble,
+                                    arguments=arguments,
+                                    input_expr=input_expr,
+                                    scan_expr=scan_expr,
+                                    neutral=neutral,
+                                    output_statement=output_statement)
+        else:  # MacOS on CPU
+            knl = GenericDebugScanKernel(self.ctx,
+                                         dtype=numpy.int32,
+                                         preamble=preamble,
+                                         arguments=arguments,
+                                         input_expr=input_expr,
+                                         scan_expr=scan_expr,
+                                         neutral=neutral,
+                                         output_statement=output_statement)
+        return knl
+
+    def encode(self, data, out=None):
+        """Compress data to CBF.
+
+        :param data: The data to compress as a numpy array
+                     (or a pyopencl Array) of int32.
+        :type data: Union[numpy.ndarray, pyopencl.array.Array]
+        :param pyopencl.array out:
+            pyopencl array of int8 in which to store the result.
+            The array should be large enough to store the compressed data.
+        :return: The compressed data as a pyopencl array.
+                 If out is provided, this array shares the backing buffer,
+                 but has the exact size of the compressed data and the queue
+                 of the ByteOffset instance.
+        :rtype: pyopencl.array
+        :raises ValueError: if out array is not large enough
+        """
+
+        events = []
+        with self.sem:
+            if isinstance(data, pyopencl.array.Array):
+                d_data = data  # Uses provided array
+
+            else:  # Copy data to device
+                data = numpy.ascontiguousarray(data, dtype=numpy.int32).ravel()
+
+                # Make sure data array exists and is large enough
+                if ("data_input" not in self.cl_mem or
+                        self.cl_mem["data_input"].size < data.size):
+                    logger.info("increase data input buffer size to %s", data.size)
+                    self.cl_mem.update({
+                        "data_input": pyopencl.array.empty(self.queue,
+                                                           data.size,
+                                                           dtype=numpy.int32)})
+                d_data = self.cl_mem["data_input"]
+
+                evt = pyopencl.enqueue_copy(
+                    self.queue, d_data.data, data, is_blocking=False)
+                events.append(EventDescription("copy data H -> D", evt))
+
+            # Make sure compressed array exists and is large enough
+            compressed_size = d_data.size * 7
+            if ("compressed" not in self.cl_mem or
+                    self.cl_mem["compressed"].size < compressed_size):
+                logger.info("increase compressed buffer size to %s", compressed_size)
+                self.cl_mem.update({
+                    "compressed": pyopencl.array.empty(self.queue,
+                                                       compressed_size,
+                                                       dtype=numpy.int8)})
+            d_compressed = self.cl_mem["compressed"]
+            d_size = self.cl_mem["counter"]  # Shared with decompression
+
+            evt = self.kernels.compression_scan(d_data, d_compressed, d_size)
+            events.append(EventDescription("compression scan", evt))
+            byte_count = int(d_size.get()[0])
+
+            if out is None:
+                # Create out array from a sub-region of the compressed buffer
+                out = pyopencl.array.Array(
+                    self.queue,
+                    shape=(byte_count,),
+                    dtype=numpy.int8,
+                    allocator=functools.partial(
+                        d_compressed.base_data.get_sub_region,
+                        d_compressed.offset))
+
+            elif out.size < byte_count:
+                raise ValueError(
+                    "Provided output buffer is not large enough: "
+                    "requires %d bytes, got %d" % (byte_count, out.size))
+
+            else:  # out.size >= byte_count
+                # Create an array with a sub-region of out and this class queue
+                out = pyopencl.array.Array(
+                    self.queue,
+                    shape=(byte_count,),
+                    dtype=numpy.int8,
+                    allocator=functools.partial(out.base_data.get_sub_region,
+                                                out.offset))
+
+                evt = pyopencl.enqueue_copy_buffer(
+                    self.queue, d_compressed.data, out.data, byte_count=byte_count)
+                events.append(
+                    EventDescription("copy D -> D: internal -> out", evt))
+
+            if self.profile:
+                self.events += events
+
+        return out
+
+    def encode_to_bytes(self, data):
+        """Compresses data to CBF and returns compressed data as bytes.
+
+        Usage:
+
+        Provided an image (`image`) stored as a numpy array of int32,
+        first, create a byte offset compression/decompression object:
+
+        >>> from silx.opencl.codec.byte_offset import ByteOffset
+        >>> byte_offset_codec = ByteOffset()
+
+        Then, compress an image into bytes:
+
+        >>> compressed = byte_offset_codec.encode_to_bytes(image)
+
+        :param data: The data to compress as a numpy array
+                     (or a pyopencl Array) of int32.
+        :type data: Union[numpy.ndarray, pyopencl.array.Array]
+        :return: The compressed data as bytes.
+        :rtype: bytes
+        """
+        compressed_array = self.encode(data)
+        return compressed_array.get().tostring()
diff --git a/silx/opencl/codec/setup.py b/silx/opencl/codec/setup.py
new file mode 100644
index 0000000..4a5c1e5
--- /dev/null
+++ b/silx/opencl/codec/setup.py
@@ -0,0 +1,43 @@
+# coding: utf-8
+#
+# Copyright (C) 2016-2017 European Synchrotron Radiation Facility
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+
+from __future__ import division
+
+__contact__ = "jerome.kieffer@esrf.eu"
+__license__ = "MIT"
+__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
+__authors__ = ["J. Kieffer"]
+__date__ = "13/10/2017"
+
+from numpy.distutils.misc_util import Configuration
+
+
+def configuration(parent_package='', top_path=None):
+    config = Configuration('codec', parent_package, top_path)
+    config.add_subpackage('test')
+    return config
+
+
+if __name__ == "__main__":
+    from numpy.distutils.core import setup
+    setup(configuration=configuration)
diff --git a/silx/opencl/codec/test/__init__.py b/silx/opencl/codec/test/__init__.py
new file mode 100644
index 0000000..ec76dd3
--- /dev/null
+++ b/silx/opencl/codec/test/__init__.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+#
+#    Project: silx
+#             https://github.com/silx-kit/silx
+#
+#    Copyright (C) 2013-2017  European Synchrotron Radiation Facility, Grenoble, France
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+__authors__ = ["J. Kieffer"]
+__license__ = "MIT"
+__date__ = "13/10/2017"
+
+import unittest
+from . import test_byte_offset
+
+
+def suite():
+    testSuite = unittest.TestSuite()
+    testSuite.addTest(test_byte_offset.suite())
+
+    return testSuite
diff --git a/silx/opencl/codec/test/test_byte_offset.py b/silx/opencl/codec/test/test_byte_offset.py
new file mode 100644
index 0000000..2bfa1d3
--- /dev/null
+++ b/silx/opencl/codec/test/test_byte_offset.py
@@ -0,0 +1,317 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+#    Project: Byte-offset decompression in OpenCL
+#             https://github.com/silx-kit/silx
+#
+#    Copyright (C) 2013-2018  European Synchrotron Radiation Facility,
+#                             Grenoble, France
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+"""
+Test suite for byte-offset decompression 
+"""
+
+from __future__ import division, print_function
+
+__authors__ = ["Jérôme Kieffer"]
+__contact__ = "jerome.kieffer@esrf.eu"
+__license__ = "MIT"
+__copyright__ = "2013 European Synchrotron Radiation Facility, Grenoble, France"
+__date__ = "10/11/2017"
+
+import sys
+import time
+import logging
+import numpy
+from silx.opencl.common import ocl, pyopencl
+from silx.opencl.codec import byte_offset
+try:
+    import fabio
+except ImportError:
+    fabio = None
+import unittest
+logger = logging.getLogger(__name__)
+
+
+@unittest.skipUnless(ocl and fabio and pyopencl,
+                     "PyOpenCl or fabio is missing")
+class TestByteOffset(unittest.TestCase):
+
+    @staticmethod
+    def _create_test_data(shape, nexcept, lam=200):
+        """Create test (image, compressed stream) pair.
+
+        :param shape: Shape of test image
+        :param int nexcept: Number of exceptions in the image
+        :param lam: Expectation of interval argument for numpy.random.poisson
+        :return: (reference image array, compressed stream)
+        """
+        size = numpy.prod(shape)
+        ref = numpy.random.poisson(lam, numpy.prod(shape))
+        exception_loc = numpy.random.randint(0, size, size=nexcept)
+        exception_value = numpy.random.randint(0, 1000000, size=nexcept)
+        ref[exception_loc] = exception_value
+        ref.shape = shape
+
+        raw = fabio.compression.compByteOffset(ref)
+        return ref, raw
+
+    def test_decompress(self):
+        """
+        tests the byte offset decompression on GPU
+        """
+        ref, raw = self._create_test_data(shape=(91, 97), nexcept=229)
+        #ref, raw = self._create_test_data(shape=(7, 9), nexcept=0)
+        
+        size = numpy.prod(ref.shape)
+
+        try:
+            bo = byte_offset.ByteOffset(raw_size=len(raw), dec_size=size, profile=True)
+        except (RuntimeError, pyopencl.RuntimeError) as err:
+            logger.warning(err)
+            if sys.platform == "darwin":
+                raise unittest.SkipTest("Byte-offset decompression is known to be buggy on MacOS-CPU")
+            else:
+                raise err
+        print(bo.block_size)
+
+        t0 = time.time()
+        res_cy = fabio.compression.decByteOffset(raw)
+        t1 = time.time()
+        res_cl = bo.decode(raw)
+        t2 = time.time()
+        delta_cy = abs(ref.ravel() - res_cy).max()
+        delta_cl = abs(ref.ravel() - res_cl.get()).max()
+
+        logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.",
+                     1000.0 * (t1 - t0),
+                     1000.0 * (t2 - t1))
+        bo.log_profile()
+        #print(ref)
+        #print(res_cl.get())
+        self.assertEqual(delta_cy, 0, "Checks fabio works")
+        self.assertEqual(delta_cl, 0, "Checks opencl works")
+
+    def test_many_decompress(self, ntest=10):
+        """
+        tests the byte offset decompression on GPU, many images to ensure there 
+        is not leaking in memory 
+        """
+        shape = (991, 997)
+        size = numpy.prod(shape)
+        ref, raw = self._create_test_data(shape=shape, nexcept=0, lam=100)
+
+        try:
+            bo = byte_offset.ByteOffset(len(raw), size, profile=False)
+        except (RuntimeError, pyopencl.RuntimeError) as err:
+            logger.warning(err)
+            if sys.platform == "darwin":
+                raise unittest.SkipTest("Byte-offset decompression is known to be buggy on MacOS-CPU")
+            else:
+                raise err
+        t0 = time.time()
+        res_cy = fabio.compression.decByteOffset(raw)
+        t1 = time.time()
+        res_cl = bo(raw)
+        t2 = time.time()
+        delta_cy = abs(ref.ravel() - res_cy).max()
+        delta_cl = abs(ref.ravel() - res_cl.get()).max()
+        self.assertEqual(delta_cy, 0, "Checks fabio works")
+        self.assertEqual(delta_cl, 0, "Checks opencl works")
+        logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.",
+                     1000.0 * (t1 - t0),
+                     1000.0 * (t2 - t1))
+
+        for i in range(ntest):
+            ref, raw = self._create_test_data(shape=shape, nexcept=2729, lam=200)
+
+            t0 = time.time()
+            res_cy = fabio.compression.decByteOffset(raw)
+            t1 = time.time()
+            res_cl = bo(raw)
+            t2 = time.time()
+            delta_cy = abs(ref.ravel() - res_cy).max()
+            delta_cl = abs(ref.ravel() - res_cl.get()).max()
+            self.assertEqual(delta_cy, 0, "Checks fabio works #%i" % i)
+            self.assertEqual(delta_cl, 0, "Checks opencl works #%i" % i)
+
+            logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.",
+                         1000.0 * (t1 - t0),
+                         1000.0 * (t2 - t1))
+
+    def test_encode(self):
+        """Test byte offset compression"""
+        ref, raw = self._create_test_data(shape=(2713, 2719), nexcept=2729)
+
+        try:
+            bo = byte_offset.ByteOffset(len(raw), ref.size, profile=True)
+        except (RuntimeError, pyopencl.RuntimeError) as err:
+            logger.warning(err)
+            raise err
+
+        t0 = time.time()
+        compressed_array = bo.encode(ref)
+        t1 = time.time()
+
+        compressed_stream = compressed_array.get().tostring()
+        self.assertEqual(raw, compressed_stream)
+
+        logger.debug("Global execution time: OpenCL: %.3fms.",
+                     1000.0 * (t1 - t0))
+        bo.log_profile()
+
+    def test_encode_to_array(self):
+        """Test byte offset compression while providing an out array"""
+
+        ref, raw = self._create_test_data(shape=(2713, 2719), nexcept=2729)
+
+        try:
+            bo = byte_offset.ByteOffset(profile=True)
+        except (RuntimeError, pyopencl.RuntimeError) as err:
+            logger.warning(err)
+            raise err
+        # Test with out buffer too small
+        out = pyopencl.array.empty(bo.queue, (10,), numpy.int8)
+        with self.assertRaises(ValueError):
+            bo.encode(ref, out)
+
+        # Test with out buffer too big
+        out = pyopencl.array.empty(bo.queue, (len(raw) + 10,), numpy.int8)
+
+        compressed_array = bo.encode(ref, out)
+
+        # Get size from returned array
+        compressed_size = compressed_array.size
+        self.assertEqual(compressed_size, len(raw))
+
+        # Get data from out array, read it from bo object queue
+        out_bo_queue = out.with_queue(bo.queue)
+        compressed_stream = out_bo_queue.get().tostring()[:compressed_size]
+        self.assertEqual(raw, compressed_stream)
+
+    def test_encode_to_bytes(self):
+        """Test byte offset compression to bytes"""
+        ref, raw = self._create_test_data(shape=(2713, 2719), nexcept=2729)
+
+        try:
+            bo = byte_offset.ByteOffset(profile=True)
+        except (RuntimeError, pyopencl.RuntimeError) as err:
+            logger.warning(err)
+            raise err
+
+        t0 = time.time()
+        res_fabio = fabio.compression.compByteOffset(ref)
+        t1 = time.time()
+        compressed_stream = bo.encode_to_bytes(ref)
+        t2 = time.time()
+
+        self.assertEqual(raw, compressed_stream)
+
+        logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.",
+                     1000.0 * (t1 - t0),
+                     1000.0 * (t2 - t1))
+        bo.log_profile()
+
+    def test_encode_to_bytes_from_array(self):
+        """Test byte offset compression to bytes from a pyopencl array.
+        """
+        ref, raw = self._create_test_data(shape=(2713, 2719), nexcept=2729)
+
+        try:
+            bo = byte_offset.ByteOffset(profile=True)
+        except (RuntimeError, pyopencl.RuntimeError) as err:
+            logger.warning(err)
+            raise err
+
+        d_ref = pyopencl.array.to_device(
+            bo.queue, ref.astype(numpy.int32).ravel())
+
+        t0 = time.time()
+        res_fabio = fabio.compression.compByteOffset(ref)
+        t1 = time.time()
+        compressed_stream = bo.encode_to_bytes(d_ref)
+        t2 = time.time()
+
+        self.assertEqual(raw, compressed_stream)
+
+        logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.",
+                     1000.0 * (t1 - t0),
+                     1000.0 * (t2 - t1))
+        bo.log_profile()
+
+    def test_many_encode(self, ntest=10):
+        """Test byte offset compression with many image"""
+        shape = (991, 997)
+        ref, raw = self._create_test_data(shape=shape, nexcept=0, lam=100)
+
+        try:
+            bo = byte_offset.ByteOffset(profile=False)
+        except (RuntimeError, pyopencl.RuntimeError) as err:
+            logger.warning(err)
+            raise err
+
+        bo_durations = []
+
+        t0 = time.time()
+        res_fabio = fabio.compression.compByteOffset(ref)
+        t1 = time.time()
+        compressed_stream = bo.encode_to_bytes(ref)
+        t2 = time.time()
+        bo_durations.append(1000.0 * (t2 - t1))
+
+        self.assertEqual(raw, compressed_stream)
+        logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.",
+                     1000.0 * (t1 - t0),
+                     1000.0 * (t2 - t1))
+
+        for i in range(ntest):
+            ref, raw = self._create_test_data(shape=shape, nexcept=2729, lam=200)
+
+            t0 = time.time()
+            res_fabio = fabio.compression.compByteOffset(ref)
+            t1 = time.time()
+            compressed_stream = bo.encode_to_bytes(ref)
+            t2 = time.time()
+            bo_durations.append(1000.0 * (t2 - t1))
+
+            self.assertEqual(raw, compressed_stream)
+            logger.debug("Global execution time: fabio %.3fms, OpenCL: %.3fms.",
+                         1000.0 * (t1 - t0),
+                         1000.0 * (t2 - t1))
+
+        logger.debug("OpenCL execution time: Mean: %fms, Min: %fms, Max: %fms",
+                     numpy.mean(bo_durations),
+                     numpy.min(bo_durations),
+                     numpy.max(bo_durations))
+
+
+def suite():
+    test_suite = unittest.TestSuite()
+    test_suite.addTest(TestByteOffset("test_decompress"))
+    test_suite.addTest(TestByteOffset("test_many_decompress"))
+    test_suite.addTest(TestByteOffset("test_encode"))
+    test_suite.addTest(TestByteOffset("test_encode_to_array"))
+    test_suite.addTest(TestByteOffset("test_encode_to_bytes"))
+    test_suite.addTest(TestByteOffset("test_encode_to_bytes_from_array"))
+    test_suite.addTest(TestByteOffset("test_many_encode"))
+    return test_suite
diff --git a/silx/opencl/common.py b/silx/opencl/common.py
index 5661530..07159e2 100644
--- a/silx/opencl/common.py
+++ b/silx/opencl/common.py
@@ -34,7 +34,7 @@ __author__ = "Jerome Kieffer"
 __contact__ = "Jerome.Kieffer@ESRF.eu"
 __license__ = "MIT"
 __copyright__ = "2012-2017 European Synchrotron Radiation Facility, Grenoble, France"
-__date__ = "05/10/2017"
+__date__ = "16/10/2017"
 __status__ = "stable"
 __all__ = ["ocl", "pyopencl", "mf", "release_cl_buffers", "allocate_cl_buffers",
            "measure_workgroup_size", "kernel_workgroup_size"]
@@ -407,7 +407,7 @@ class OpenCL(object):
         return None
 
     def create_context(self, devicetype="ALL", useFp64=False, platformid=None,
-                       deviceid=None, cached=True):
+                       deviceid=None, cached=True, memory=None):
         """
         Choose a device and initiate a context.
 
@@ -421,6 +421,7 @@ class OpenCL(object):
         :param platformid: integer
         :param deviceid: integer
         :param cached: True if we want to cache the context
+        :param memory: minimum amount of memory of the device
         :return: OpenCL context on the selected device
         """
         if (platformid is not None) and (deviceid is not None):
diff --git a/silx/opencl/image.py b/silx/opencl/image.py
new file mode 100644
index 0000000..65e2d5e
--- /dev/null
+++ b/silx/opencl/image.py
@@ -0,0 +1,387 @@
+# -*- coding: utf-8 -*-
+#
+#    Project: silx
+#             https://github.com/silx-kit/silx
+#
+#    Copyright (C) 2012-2017 European Synchrotron Radiation Facility, Grenoble, France
+#
+#    Principal author:       Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
+#
+#  Permission is hereby granted, free of charge, to any person obtaining a copy
+#  of this software and associated documentation files (the "Software"), to deal
+#  in the Software without restriction, including without limitation the rights
+#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#  copies of the Software, and to permit persons to whom the Software is
+#  furnished to do so, subject to the following conditions:
+#  .
+#  The above copyright notice and this permission notice shall be included in
+#  all copies or substantial portions of the Software.
+#  .
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#  THE SOFTWARE.
+
+"""A general purpose library for manipulating 2D images in 1 or 3 colors 
+
+"""
+from __future__ import absolute_import, print_function, with_statement, division
+
+
+__author__ = "Jerome Kieffer"
+__license__ = "MIT"
+__date__ = "12/02/2018"
+__copyright__ = "2012-2017, ESRF, Grenoble"
+__contact__ = "jerome.kieffer@esrf.fr"
+
+import os
+import logging
+import numpy
+from collections import OrderedDict
+from math import floor, ceil, sqrt, log
+
+from .common import pyopencl, kernel_workgroup_size
+from .processing import EventDescription, OpenclProcessing, BufferDescription
+
+if pyopencl:
+    mf = pyopencl.mem_flags
+logger = logging.getLogger(__name__)
+
+
+class ImageProcessing(OpenclProcessing):
+
+    kernel_files = ["cast", "map", "max_min", "histogram"]
+
+    converter = {numpy.dtype(numpy.uint8): "u8_to_float",
+                 numpy.dtype(numpy.int8): "s8_to_float",
+                 numpy.dtype(numpy.uint16): "u16_to_float",
+                 numpy.dtype(numpy.int16): "s16_to_float",
+                 numpy.dtype(numpy.uint32): "u32_to_float",
+                 numpy.dtype(numpy.int32): "s32_to_float",
+                 }
+
+    def __init__(self, shape=None, ncolors=1, template=None,
+                 ctx=None, devicetype="all", platformid=None, deviceid=None,
+                 block_size=None, memory=None, profile=False):
+        """Constructor of the ImageProcessing class
+
+        :param ctx: actual working context, left to None for automatic
+                    initialization from device type or platformid/deviceid
+        :param devicetype: type of device, can be "CPU", "GPU", "ACC" or "ALL"
+        :param platformid: integer with the platform_identifier, as given by clinfo
+        :param deviceid: Integer with the device identifier, as given by clinfo
+        :param block_size: preferred workgroup size, may vary depending on the
+                            out come of the compilation
+        :param memory: minimum memory available on device
+        :param profile: switch on profiling to be able to profile at the kernel
+                         level, store profiling elements (makes code slightly slower)
+        """
+        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
+                                  platformid=platformid, deviceid=deviceid,
+                                  block_size=block_size, memory=memory, profile=profile)
+        if template is not None:
+            shape = template.shape
+            if len(shape) > 2:
+                self.ncolors = shape[2]
+                self.shape = shape[:2]
+            else:
+                self.ncolors = 1
+                self.shape = shape
+        else:
+            self.ncolors = ncolors
+            self.shape = shape
+            assert shape is not None
+        self.buffer_shape = self.shape if self.ncolors == 1 else self.shape + (self.ncolors,)
+        kernel_files = [os.path.join("image", i) for i in self.kernel_files]
+        self.compile_kernels(kernel_files,
+                             compile_options="-DNB_COLOR=%i" % self.ncolors)
+        if self.ncolors == 1:
+            img_shape = self.shape
+        else:
+            img_shape = self.shape + (self.ncolors,)
+
+        buffers = [BufferDescription("image0_d", img_shape, numpy.float32, None),
+                   BufferDescription("image1_d", img_shape, numpy.float32, None),
+                   BufferDescription("image2_d", img_shape, numpy.float32, None),
+                   BufferDescription("max_min_d", 2, numpy.float32, None),
+                   BufferDescription("cnt_d", 1, numpy.int32, None), ]
+        # Temporary buffer for max-min reduction
+        self.wg_red = kernel_workgroup_size(self.program, self.kernels.max_min_reduction_stage1)
+        if self.wg_red > 1:
+            self.wg_red = min(self.wg_red,
+                              numpy.int32(1 << int(floor(log(sqrt(numpy.prod(self.shape)), 2)))))
+            tmp = BufferDescription("tmp_max_min_d", 2 * self.wg_red, numpy.float32, None)
+            buffers.append(tmp)
+        self.allocate_buffers(buffers, use_array=True)
+        self.cl_mem["cnt_d"].fill(0)
+
+    def __repr__(self):
+        return "ImageProcessing for shape=%s, %i colors initalized on %s" % \
+            (self.shape, self.ncolors, self.ctx.devices[0].name)
+
+    def _get_in_out_buffers(self, img=None, copy=True, out=None,
+                            out_dtype=None, out_size=None):
+        """Internal method used to select the proper buffers before processing.
+
+        :param img: expects a numpy array or a pyopencl.array of dim 2 or 3
+        :param copy: set to False to directly re-use a pyopencl array
+        :param out: provide an output buffer to store the result
+        :param out_dtype: enforce the type of the output buffer (optional)
+        :param out_size: enforce the size of the output buffer (optional)
+        :return: input_buffer, output_buffer
+        
+        Nota: this is not locked.
+        """
+        events = []
+        if out is not None and isinstance(out, pyopencl.array.Array):
+            if (out_size or out_dtype) is not None:
+                if out_size is not None:
+                    assert out.size > out_size
+                if out_dtype is not None:
+                    assert out_dtype == out.dtype
+            else:  # assume it is same size and type as weoking buffer
+                assert out.shape == self.buffer_shape
+                assert out.dtype == numpy.float32
+            out.finish()
+            output_array = out
+        else:
+            if out_dtype != numpy.float32 and out_size:
+                name = "%s_%s_d" % (numpy.dtype(out_dtype), out_size)
+                if name not in self.cl_mem:
+                    output_array = self.cl_mem[name] = pyopencl.array.empty(self.queue, (out_size,), out_dtype)
+                else:
+                    output_array = self.cl_mem[name]
+            else:
+                output_array = self.cl_mem["image2_d"]
+
+        if img is None:
+            input_array = self.cl_mem["image1_d"]
+        if isinstance(img, pyopencl.array.Array):
+            if copy:
+                evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["image1_d"].data, img.data)
+                input_array = self.cl_mem["image1_d"]
+                events.append(EventDescription("copy D->D", evt))
+            else:
+                img.finish()
+                input_array = img
+                evt = None
+        else:
+            # assume this is numpy
+            if img.dtype.itemsize > 4:
+                logger.warning("Casting to float32 on CPU")
+                evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["image1_d"].data, numpy.ascontiguousarray(img, numpy.float32))
+                input_array = self.cl_mem["image1_d"]
+                events.append(EventDescription("cast+copy H->D", evt))
+            else:
+                evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["image1_d"].data, numpy.ascontiguousarray(img))
+                input_array = self.cl_mem["image1_d"]
+                events.append(EventDescription("copy H->D", evt))
+        if self.profile:
+            self.events += events
+        return input_array, output_array
+
+    def to_float(self, img, copy=True, out=None):
+        """ Takes any array and convert it to a float array for ease of processing.
+        
+        :param img: expects a numpy array or a pyopencl.array of dim 2 or 3
+        :param copy: set to False to directly re-use a pyopencl array
+        :param out: provide an output buffer to store the result
+        """
+        assert img.shape == self.buffer_shape
+
+        events = []
+        with self.sem:
+            input_array, output_array = self._get_in_out_buffers(img, copy, out)
+            if (img.dtype.itemsize > 4) or (img.dtype == numpy.float32):
+                # copy device -> device, already there as float32
+                ev = pyopencl.enqueue_copy(self.queue, output_array.data, input_array.data)
+                events.append(EventDescription("copy D->D", ev))
+            else:
+                # Cast to float:
+                name = self.converter[img.dtype]
+                kernel = self.kernels.get_kernel(name)
+                ev = kernel(self.queue, (self.shape[1], self.shape[0]), None,
+                            input_array.data, output_array.data,
+                            numpy.int32(self.shape[1]), numpy.int32(self.shape[0])
+                            )
+                events.append(EventDescription("cast %s" % name, ev))
+
+        if self.profile:
+            self.events += events
+        if out is None:
+            res = output_array.get()
+            return res
+        else:
+            output_array.finish()
+            return output_array
+
+    def normalize(self, img, mini=0.0, maxi=1.0, copy=True, out=None):
+        """Scale the intensity of the image so that the minimum is 0 and the
+        maximum is 1.0 (or any value suggested).
+        
+        :param img: numpy array or pyopencl array of dim 2 or 3 and of type float
+        :param mini: Expected minimum value
+        :param maxi: expected maxiumum value
+        :param copy: set to False to use directly the input buffer
+        :param out: provides an output buffer. prevents a copy D->H
+        
+        This uses a min/max reduction in two stages plus a map operation  
+        """
+        assert img.shape == self.buffer_shape
+        events = []
+        with self.sem:
+            input_array, output_array = self._get_in_out_buffers(img, copy, out)
+            size = numpy.int32(numpy.prod(self.shape))
+            if self.wg_red == 1:
+                #  Probably on MacOS CPU WG==1 --> serial code.
+                kernel = self.kernels.get_kernel("max_min_serial")
+                evt = kernel(self.queue, (1,), (1,),
+                             input_array.data,
+                             size,
+                             self.cl_mem["max_min_d"].data)
+                ed = EventDescription("max_min_serial", evt)
+                events.append(ed)
+            else:
+                stage1 = self.kernels.max_min_reduction_stage1
+                stage2 = self.kernels.max_min_reduction_stage2
+                local_mem = pyopencl.LocalMemory(int(self.wg_red * 8))
+                k1 = stage1(self.queue, (int(self.wg_red ** 2),), (int(self.wg_red),),
+                            input_array.data,
+                            self.cl_mem["tmp_max_min_d"].data,
+                            size,
+                            local_mem)
+                k2 = stage2(self.queue, (int(self.wg_red),), (int(self.wg_red),),
+                            self.cl_mem["tmp_max_min_d"].data,
+                            self.cl_mem["max_min_d"].data,
+                            local_mem)
+
+                events += [EventDescription("max_min_stage1", k1),
+                           EventDescription("max_min_stage2", k2)]
+
+            evt = self.kernels.normalize_image(self.queue, (self.shape[1], self.shape[0]), None,
+                                               input_array.data, output_array.data,
+                                               numpy.int32(self.shape[1]), numpy.int32(self.shape[0]),
+                                               self.cl_mem["max_min_d"].data,
+                                               numpy.float32(mini), numpy.float32(maxi))
+            events.append(EventDescription("normalize", evt))
+        if self.profile:
+            self.events += events
+
+        if out is None:
+            res = output_array.get()
+            return res
+        else:
+            output_array.finish()
+            return output_array
+
+    def histogram(self, img=None, nbins=255, range=None,
+                  log_scale=False, copy=True, out=None):
+        """Compute the histogram of a set of data.
+        
+        :param img: input image. If None, use the one already on the device
+        :param nbins: number of bins
+        :param range: the lower and upper range of the bins.  If not provided, 
+                    range is simply ``(a.min(), a.max())``.  Values outside the 
+                    range are ignored. The first element of the range must be 
+                    less than or equal to the second.
+        :param log_scale: perform the binning in lograrithmic scale. 
+                         Open to extension
+        :param copy: unset to directly use the input buffer without copy
+        :param out: use a provided array for offering the result 
+        :return: histogram (size=nbins), edges (size=nbins+1)
+        API similar to numpy  
+        """
+        assert img.shape == self.buffer_shape
+
+        input_array = self.to_float(img, copy=copy, out=self.cl_mem["image0_d"])
+        events = []
+        with self.sem:
+            input_array, output_array = self._get_in_out_buffers(input_array, copy=False,
+                                                                 out=out,
+                                                                 out_dtype=numpy.int32,
+                                                                 out_size=nbins)
+
+            if range is None:
+                # measure actually the bounds
+                size = numpy.int32(numpy.prod(self.shape))
+                if self.wg_red == 1:
+                    #  Probably on MacOS CPU WG==1 --> serial code.
+                    kernel = self.kernels.get_kernel("max_min_serial")
+
+                    evt = kernel(self.queue, (1,), (1,),
+                                 input_array.data,
+                                 size,
+                                 self.cl_mem["max_min_d"].data)
+                    events.append(EventDescription("max_min_serial", evt))
+                else:
+                    stage1 = self.kernels.max_min_reduction_stage1
+                    stage2 = self.kernels.max_min_reduction_stage2
+                    local_mem = pyopencl.LocalMemory(int(self.wg_red * 2 * numpy.dtype("float32").itemsize))
+                    k1 = stage1(self.queue, (int(self.wg_red ** 2),), (int(self.wg_red),),
+                                input_array.data,
+                                self.cl_mem["tmp_max_min_d"].data,
+                                size,
+                                local_mem)
+                    k2 = stage2(self.queue, (int(self.wg_red),), (int(self.wg_red),),
+                                self.cl_mem["tmp_max_min_d"].data,
+                                self.cl_mem["max_min_d"].data,
+                                local_mem)
+
+                    events += [EventDescription("max_min_stage1", k1),
+                               EventDescription("max_min_stage2", k2)]
+                maxi, mini = self.cl_mem["max_min_d"].get()
+            else:
+                mini = numpy.float32(min(range))
+                maxi = numpy.float32(max(range))
+            device = self.ctx.devices[0]
+            nb_engines = device.max_compute_units
+            tmp_size = nb_engines * nbins
+            name = "tmp_int32_%s_d" % (tmp_size)
+            if name not in self.cl_mem:
+                tmp_array = self.cl_mem[name] = pyopencl.array.empty(self.queue, (tmp_size,), numpy.int32)
+            else:
+                tmp_array = self.cl_mem[name]
+
+            edge_name = "tmp_float32_%s_d" % (nbins + 1)
+            if edge_name not in self.cl_mem:
+                edges_array = self.cl_mem[edge_name] = pyopencl.array.empty(self.queue, (nbins + 1,), numpy.float32)
+            else:
+                edges_array = self.cl_mem[edge_name]
+
+            shared = pyopencl.LocalMemory(numpy.dtype(numpy.int32).itemsize * nbins)
+
+            # Handle log-scale
+            if log_scale:
+                map_operation = numpy.int32(1)
+            else:
+                map_operation = numpy.int32(0)
+            kernel = self.kernels.get_kernel("histogram")
+            wg = min(device.max_work_group_size,
+                     1 << (int(ceil(log(nbins, 2)))),
+                     self.kernels.max_workgroup_size(kernel))
+            evt = kernel(self.queue, (wg * nb_engines,), (wg,),
+                         input_array.data,
+                         numpy.int32(input_array.size),
+                         mini,
+                         maxi,
+                         map_operation,
+                         output_array.data,
+                         edges_array.data,
+                         numpy.int32(nbins),
+                         tmp_array.data,
+                         self.cl_mem["cnt_d"].data,
+                         shared)
+            events.append(EventDescription("histogram", evt))
+
+        if self.profile:
+            self.events += events
+
+        if out is None:
+            res = output_array.get()
+            return res, edges_array.get()
+        else:
+            output_array.finish()
+            return output_array, edges_array
diff --git a/silx/opencl/processing.py b/silx/opencl/processing.py
index 1997a55..250582d 100644
--- a/silx/opencl/processing.py
+++ b/silx/opencl/processing.py
@@ -4,7 +4,7 @@
 #    Project: S I L X project
 #             https://github.com/silx-kit/silx
 #
-#    Copyright (C) 2012-2017 European Synchrotron Radiation Facility, Grenoble, France
+#    Copyright (C) 2012-2018 European Synchrotron Radiation Facility, Grenoble, France
 #
 #    Principal author:       Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
 #
@@ -31,7 +31,7 @@
 #
 
 """
-Common OpenCL abstract base classes for different processing
+Common OpenCL abstract base classe for different processing
 """
 
 from __future__ import absolute_import, print_function, division
@@ -41,7 +41,7 @@ __author__ = "Jerome Kieffer"
 __contact__ = "Jerome.Kieffer@ESRF.eu"
 __license__ = "MIT"
 __copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
-__date__ = "03/10/2017"
+__date__ = "27/02/2018"
 __status__ = "stable"
 
 
@@ -69,17 +69,29 @@ class KernelContainer(object):
 
         :param program: the OpenCL program as generated by PyOpenCL
         """
+        self._program = program
         for kernel in program.all_kernels():
             self.__setattr__(kernel.function_name, kernel)
 
     def get_kernels(self):
         "return the dictionary with all kernels"
-        return self.__dict__.copy()
+        return dict(item for item in self.__dict__.items()
+                    if not item[0].startswith("_"))
 
     def get_kernel(self, name):
         "get a kernel from its name"
+        logger.debug("KernelContainer.get_kernel(%s)", name)
         return self.__dict__.get(name)
 
+    def max_workgroup_size(self, kernel_name):
+        "Retrieve the compile time max_workgroup_size for a given kernel"
+        if isinstance(kernel_name, pyopencl.Kernel):
+            kernel = kernel_name
+        else:
+            kernel = self.get_kernel(kernel_name)
+
+        return kernel_workgroup_size(self._program, kernel)
+
 
 class OpenclProcessing(object):
     """Abstract class for different types of OpenCL processing.
@@ -97,7 +109,7 @@ class OpenclProcessing(object):
     kernel_files = []
 
     def __init__(self, ctx=None, devicetype="all", platformid=None, deviceid=None,
-                 block_size=None, profile=False):
+                 block_size=None, memory=None, profile=False):
         """Constructor of the abstract OpenCL processing class
 
         :param ctx: actual working context, left to None for automatic
@@ -107,6 +119,7 @@ class OpenclProcessing(object):
         :param deviceid: Integer with the device identifier, as given by clinfo
         :param block_size: preferred workgroup size, may vary depending on the
                             out come of the compilation
+        :param memory: minimum memory available on device
         :param profile: switch on profiling to be able to profile at the kernel
                          level, store profiling elements (makes code slightly slower)
         """
@@ -116,10 +129,13 @@ class OpenclProcessing(object):
         self.cl_mem = {}  # dict with all buffer allocated
         self.cl_program = None  # The actual OpenCL program
         self.cl_kernel_args = {}  # dict with all kernel arguments
+        self.queue = None
         if ctx:
             self.ctx = ctx
         else:
-            self.ctx = ocl.create_context(devicetype=devicetype, platformid=platformid, deviceid=deviceid)
+            self.ctx = ocl.create_context(devicetype=devicetype,
+                                          platformid=platformid, deviceid=deviceid,
+                                          memory=memory)
         device_name = self.ctx.devices[0].name.strip()
         platform_name = self.ctx.devices[0].platform.name.strip()
         platform = ocl.get_platform(platform_name)
@@ -134,19 +150,23 @@ class OpenclProcessing(object):
     def __del__(self):
         """Destructor: release all buffers and programs
         """
+        self.reset_log()
         self.free_kernels()
         self.free_buffers()
         self.queue = None
+        self.device = None
         self.ctx = None
         gc.collect()
 
-    def allocate_buffers(self, buffers=None):
+    def allocate_buffers(self, buffers=None, use_array=False):
         """
         Allocate OpenCL buffers required for a specific configuration
 
         :param buffers: a list of BufferDescriptions, leave to None for
                         paramatrized buffers.
-
+        :param use_array: allocate memory as pyopencl.array.Array
+                            instead of pyopencl.Buffer
+        
         Note that an OpenCL context also requires some memory, as well
         as Event and other OpenCL functionalities which cannot and are
         not taken into account here.  The memory required by a context
@@ -166,7 +186,7 @@ class OpenclProcessing(object):
             # check if enough memory is available on the device
             ualloc = 0
             for buf in buffers:
-                ualloc += numpy.dtype(buf.dtype).itemsize * buf.size
+                ualloc += numpy.dtype(buf.dtype).itemsize * numpy.prod(buf.size)
             logger.info("%.3fMB are needed on device: %s,  which has %.3fMB",
                         ualloc / 1.0e6, self.device, self.device.memory / 1.0e6)
 
@@ -177,9 +197,13 @@ class OpenclProcessing(object):
 
             # do the allocation
             try:
-                for buf in buffers:
-                    size = numpy.dtype(buf.dtype).itemsize * buf.size
-                    mem[buf.name] = pyopencl.Buffer(self.ctx, buf.flags, int(size))
+                if use_array:
+                    for buf in buffers:
+                        mem[buf.name] = pyopencl.array.empty(self.queue, buf.size, buf.dtype)
+                else:
+                    for buf in buffers:
+                        size = numpy.dtype(buf.dtype).itemsize * numpy.prod(buf.size)
+                        mem[buf.name] = pyopencl.Buffer(self.ctx, buf.flags, int(size))
             except pyopencl.MemoryError as error:
                 release_cl_buffers(mem)
                 raise MemoryError(error)
@@ -199,8 +223,8 @@ class OpenclProcessing(object):
         self.cl_mem.update(mem)
 
     def check_workgroup_size(self, kernel_name):
-        kernel = self.kernels.get_kernel(kernel_name)
-        self.compiletime_workgroup_size = kernel_workgroup_size(self.program, kernel)
+        "Calculate the maximum workgroup size from given kernel after compilation"
+        return self.kernels.max_workgroup_size(kernel_name)
 
     def free_buffers(self):
         """free all device.memory allocated on the device
@@ -282,6 +306,13 @@ class OpenclProcessing(object):
         logger.info(os.linesep.join(out))
         return out
 
+    def reset_log(self):
+        """
+        Resets the profiling timers
+        """
+        with self.sem:
+            self.events = []
+
 # This should be implemented by concrete class
 #     def __copy__(self):
 #         """Shallow copy of the object
diff --git a/silx/opencl/projection.py b/silx/opencl/projection.py
index 0ebe9bc..0505d80 100644
--- a/silx/opencl/projection.py
+++ b/silx/opencl/projection.py
@@ -29,7 +29,7 @@ from __future__ import absolute_import, print_function, with_statement, division
 
 __authors__ = ["A. Mirone, P. Paleo"]
 __license__ = "MIT"
-__date__ = "26/06/2017"
+__date__ = "28/02/2018"
 
 import logging
 import numpy as np
@@ -52,6 +52,7 @@ class Projection(OpenclProcessing):
     OpenCL
     """
     kernel_files = ["proj.cl", "array_utils.cl"]
+    logger.warning("Forward Projecter is untested and unsuported for now")
 
     def __init__(self, slice_shape, angles, axis_position=None,
                  detector_width=None, normalize=False, ctx=None,
@@ -111,10 +112,10 @@ class Projection(OpenclProcessing):
                                       endpoint=False).astype(dtype=np.float32)
         else:
             self.nprojs = len(self.angles)
-        self.offset_x = -np.float32((self.shape[1]-1)/2. - self.axis_pos) # TODO: custom
-        self.offset_y = -np.float32((self.shape[0]-1)/2. - self.axis_pos) # TODO: custom
+        self.offset_x = -np.float32((self.shape[1] - 1) / 2. - self.axis_pos)  # TODO: custom
+        self.offset_y = -np.float32((self.shape[0] - 1) / 2. - self.axis_pos)  # TODO: custom
         # Reset axis_pos once offset are computed
-        self.axis_pos0 = np.float((self.shape[1]-1)/2.)
+        self.axis_pos0 = np.float((self.shape[1] - 1) / 2.)
 
         # Workgroup, ndrange and shared size
         self.dimgrid_x = _idivup(self.dwidth, 16)
@@ -125,7 +126,7 @@ class Projection(OpenclProcessing):
         self.wg = (16, 16)
         self.ndrange = (
             int(self.dimgrid_x) * self.wg[0],  # int(): pyopencl <= 2015.1
-            int(self.dimgrid_y) * self.wg[1]   # int(): pyopencl <= 2015.1
+            int(self.dimgrid_y) * self.wg[1]  # int(): pyopencl <= 2015.1
         )
 
         self.is_cpu = False
@@ -146,7 +147,7 @@ class Projection(OpenclProcessing):
                                                    self.nprojs, np.float32)
             }
         )
-        self._tmp_extended_img = np.zeros((self.shape[0]+2, self.shape[1]+2),
+        self._tmp_extended_img = np.zeros((self.shape[0] + 2, self.shape[1] + 2),
                                           dtype=np.float32)
         if self.is_cpu:
             self.allocate_slice()
@@ -158,41 +159,41 @@ class Projection(OpenclProcessing):
         if self.is_cpu:
             self.cl_mem["d_slice"].fill(0.)
             # enqueue_fill_buffer has issues if opencl 1.2 is not present
-            #~ pyopencl.enqueue_fill_buffer(
-                #~ self.queue,
-                #~ self.cl_mem["d_slice"],
-                #~ np.float32(0),
-                #~ 0,
-                #~ self._tmp_extended_img.size * _sizeof(np.float32)
-            #~ )
+            # ~ pyopencl.enqueue_fill_buffer(
+                # ~ self.queue,
+                # ~ self.cl_mem["d_slice"],
+                # ~ np.float32(0),
+                # ~ 0,
+                # ~ self._tmp_extended_img.size * _sizeof(np.float32)
+            # ~ )
         # Precomputations
         self.compute_angles()
         self.proj_precomputations()
         self.cl_mem["d_axis_corrections"].fill(0.)
         # enqueue_fill_buffer has issues if opencl 1.2 is not present
-        #~ pyopencl.enqueue_fill_buffer(
-                                    #~ self.queue,
-                                    #~ self.cl_mem["d_axis_corrections"],
-                                    #~ np.float32(0),
-                                    #~ 0,
-                                    #~ self.nprojs*_sizeof(np.float32)
-                                    #~ )
+        # ~ pyopencl.enqueue_fill_buffer(
+                                    # ~ self.queue,
+                                    # ~ self.cl_mem["d_axis_corrections"],
+                                    # ~ np.float32(0),
+                                    # ~ 0,
+                                    # ~ self.nprojs*_sizeof(np.float32)
+                                    # ~ )
         # Shorthands
         self._d_sino = self.cl_mem["_d_sino"]
 
         OpenclProcessing.compile_kernels(self, self.kernel_files)
         # check that workgroup can actually be (16, 16)
-        self.check_workgroup_size("forward_kernel_cpu")
+        self.compiletime_workgroup_size = self.kernels.max_workgroup_size("forward_kernel_cpu")
 
     def compute_angles(self):
-        angles2 = np.zeros(self._dimrecy, dtype=np.float32) # dimrecy != num_projs
+        angles2 = np.zeros(self._dimrecy, dtype=np.float32)  # dimrecy != num_projs
         angles2[:self.nprojs] = np.copy(self.angles)
-        angles2[self.nprojs:] = angles2[self.nprojs-1]
+        angles2[self.nprojs:] = angles2[self.nprojs - 1]
         self.angles2 = angles2
         pyopencl.enqueue_copy(self.queue, self.cl_mem["d_angles"], angles2)
 
     def allocate_slice(self):
-            self.add_to_cl_mem({"d_slice": parray.zeros(self.queue, (self.shape[1]+2, self.shape[1]+2), np.float32)})
+            self.add_to_cl_mem({"d_slice": parray.zeros(self.queue, (self.shape[1] + 2, self.shape[1] + 2), np.float32)})
 
     def allocate_textures(self):
         self.d_image_tex = pyopencl.Image(
@@ -211,13 +212,13 @@ class Projection(OpenclProcessing):
         if self.is_cpu:
             # TODO: create NoneEvent
             return self.transfer_to_slice(image2)
-            #~ return pyopencl.enqueue_copy(
-                        #~ self.queue,
-                        #~ self.cl_mem["d_slice"].data,
-                        #~ image2,
-                        #~ origin=(1, 1),
-                        #~ region=image.shape[::-1]
-                        #~ )
+            # ~ return pyopencl.enqueue_copy(
+                        # ~ self.queue,
+                        # ~ self.cl_mem["d_slice"].data,
+                        # ~ image2,
+                        # ~ origin=(1, 1),
+                        # ~ region=image.shape[::-1]
+                        # ~ )
         else:
             return pyopencl.enqueue_copy(
                        self.queue,
@@ -238,11 +239,11 @@ class Projection(OpenclProcessing):
                 d_image,
                 offset=0,
                 origin=(1, 1),
-                region=(int(self.shape[1]), int(self.shape[0]))#self.shape[::-1] # pyopencl <= 2015.2
+                region=(int(self.shape[1]), int(self.shape[0]))  # self.shape[::-1] # pyopencl <= 2015.2
             )
 
     def transfer_to_slice(self, image):
-        image2 = np.zeros((image.shape[0]+2, image.shape[1]+2), dtype=np.float32)
+        image2 = np.zeros((image.shape[0] + 2, image.shape[1] + 2), dtype=np.float32)
         image2[1:-1, 1:-1] = image.astype(np.float32)
         self.cl_mem["d_slice"].set(image2)
 
@@ -272,14 +273,14 @@ class Projection(OpenclProcessing):
         strideLine[0][case1] = 0
         strideLine[1][case1] = 1
 
-        beginPos[0][case2] = dimslice-1
-        beginPos[1][case2] = dimslice-1
+        beginPos[0][case2] = dimslice - 1
+        beginPos[1][case2] = dimslice - 1
         strideJoseph[0][case2] = -1
         strideJoseph[1][case2] = 0
         strideLine[0][case2] = 0
         strideLine[1][case2] = -1
 
-        beginPos[0][case3] = dimslice-1
+        beginPos[0][case3] = dimslice - 1
         beginPos[1][case3] = 0
         strideJoseph[0][case3] = 0
         strideJoseph[1][case3] = 1
@@ -287,16 +288,16 @@ class Projection(OpenclProcessing):
         strideLine[1][case3] = 0
 
         beginPos[0][case4] = 0
-        beginPos[1][case4] = dimslice-1
+        beginPos[1][case4] = dimslice - 1
         strideJoseph[0][case4] = 0
         strideJoseph[1][case4] = -1
         strideLine[0][case4] = 1
         strideLine[1][case4] = 0
 
         # For debug purpose
-        #~ self.beginPos = beginPos
-        #~ self.strideJoseph = strideJoseph
-        #~ self.strideLine = strideLine
+        # ~ self.beginPos = beginPos
+        # ~ self.strideJoseph = strideJoseph
+        # ~ self.strideLine = strideLine
         #
 
         pyopencl.enqueue_copy(self.queue, self.cl_mem["d_beginPos"], beginPos)
@@ -307,7 +308,7 @@ class Projection(OpenclProcessing):
         return pyopencl.LocalMemory(self.local_mem)  # constant for all image sizes
 
     def cpy2d_to_sino(self, dst):
-        ndrange = (int(self.dwidth), int(self.nprojs)) # pyopencl < 2015.2
+        ndrange = (int(self.dwidth), int(self.nprojs))  # pyopencl < 2015.2
         sino_shape_ocl = np.int32(ndrange)
         wg = None
         kernel_args = (
@@ -325,13 +326,13 @@ class Projection(OpenclProcessing):
         """
         copy a Nx * Ny slice to self.d_slice which is (Nx+2)*(Ny+2)
         """
-        ndrange = (int(self.shape[1]), int(self.shape[0])) #self.shape[::-1] # pyopencl < 2015.2
+        ndrange = (int(self.shape[1]), int(self.shape[0]))  # self.shape[::-1] # pyopencl < 2015.2
         wg = None
         slice_shape_ocl = np.int32(ndrange)
         kernel_args = (
             self.cl_mem["d_slice"].data,
             src,
-            np.int32(self.shape[1]+2),
+            np.int32(self.shape[1] + 2),
             np.int32(self.shape[1]),
             np.int32((1, 1)),
             np.int32((0, 0)),
@@ -413,7 +414,7 @@ class Projection(OpenclProcessing):
         # /with self.sem
         if self.profile:
             self.events += events
-        #~ res = self._ex_sino
+        # ~ res = self._ex_sino
         return res
 
     __call__ = projection
diff --git a/silx/opencl/setup.py b/silx/opencl/setup.py
index a2ae244..10fb1be 100644
--- a/silx/opencl/setup.py
+++ b/silx/opencl/setup.py
@@ -27,7 +27,7 @@ __contact__ = "jerome.kieffer@esrf.eu"
 __license__ = "MIT"
 __copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
 __authors__ = ["J. Kieffer"]
-__date__ = "16/08/2017"
+__date__ = "16/10/2017"
 
 import os.path
 from numpy.distutils.misc_util import Configuration
@@ -38,6 +38,7 @@ def configuration(parent_package='', top_path=None):
     path = os.path.dirname(os.path.abspath(__file__))
     if os.path.exists(os.path.join(path, 'sift')):
         config.add_subpackage('sift')
+    config.add_subpackage('codec')
     config.add_subpackage('test')
     return config
 
diff --git a/silx/opencl/test/__init__.py b/silx/opencl/test/__init__.py
index f6aadcd..2fe88ea 100644
--- a/silx/opencl/test/__init__.py
+++ b/silx/opencl/test/__init__.py
@@ -24,7 +24,7 @@
 
 __authors__ = ["J. Kieffer"]
 __license__ = "MIT"
-__date__ = "01/09/2017"
+__date__ = "17/10/2017"
 
 import os
 import unittest
@@ -34,6 +34,8 @@ from . import test_backprojection
 from . import test_projection
 from . import test_linalg
 from . import test_array_utils
+from ..codec import test as test_codec
+from . import test_image
 
 def suite():
     test_suite = unittest.TestSuite()
@@ -43,7 +45,8 @@ def suite():
     test_suite.addTests(test_projection.suite())
     test_suite.addTests(test_linalg.suite())
     test_suite.addTests(test_array_utils.suite())
-
+    test_suite.addTests(test_codec.suite())
+    test_suite.addTests(test_image.suite())
     # Allow to remove sift from the project
     test_base_dir = os.path.dirname(__file__)
     sift_dir = os.path.join(test_base_dir, "..", "sift")
diff --git a/silx/opencl/test/test_addition.py b/silx/opencl/test/test_addition.py
index 89e49be..cde86e2 100644
--- a/silx/opencl/test/test_addition.py
+++ b/silx/opencl/test/test_addition.py
@@ -35,7 +35,7 @@ __authors__ = ["Henri Payno, Jérôme Kieffer"]
 __contact__ = "jerome.kieffer@esrf.eu"
 __license__ = "MIT"
 __copyright__ = "2013 European Synchrotron Radiation Facility, Grenoble, France"
-__date__ = "15/03/2017"
+__date__ = "12/02/2018"
 
 import logging
 import numpy
@@ -108,7 +108,7 @@ class TestAddition(unittest.TestCase):
             else:
                 res = d_array_result.get()
                 good = numpy.allclose(res, self.data - 5)
-                if good and wg>self.max_valid_wg:
+                if good and wg > self.max_valid_wg:
                     self.__class__.max_valid_wg = wg
                 self.assert_(good, "calculation is correct for WG=%s" % wg)
 
@@ -127,7 +127,7 @@ class TestAddition(unittest.TestCase):
 def suite():
     testSuite = unittest.TestSuite()
     testSuite.addTest(TestAddition("test_add"))
-    testSuite.addTest(TestAddition("test_measurement"))
+    # testSuite.addTest(TestAddition("test_measurement"))
     return testSuite
 
 
diff --git a/silx/opencl/test/test_backprojection.py b/silx/opencl/test/test_backprojection.py
index 342bd2f..70ce2ae 100644
--- a/silx/opencl/test/test_backprojection.py
+++ b/silx/opencl/test/test_backprojection.py
@@ -30,7 +30,7 @@ from __future__ import division, print_function
 __authors__ = ["Pierre paleo"]
 __license__ = "MIT"
 __copyright__ = "2013-2017 European Synchrotron Radiation Facility, Grenoble, France"
-__date__ = "05/10/2017"
+__date__ = "19/01/2018"
 
 
 import time
@@ -89,7 +89,7 @@ class TestFBP(unittest.TestCase):
             # ~ self.skipTest("Backprojection is not implemented on CPU for OS X yet")
         self.getfiles()
         self.fbp = backprojection.Backprojection(self.sino.shape, profile=True)
-        if self.fbp.compiletime_workgroup_size < 16:
+        if self.fbp.compiletime_workgroup_size < 16 * 16:
             self.skipTest("Current implementation of OpenCL backprojection is not supported on this platform yet")
 
     def tearDown(self):
diff --git a/silx/opencl/test/test_image.py b/silx/opencl/test/test_image.py
new file mode 100644
index 0000000..d73a854
--- /dev/null
+++ b/silx/opencl/test/test_image.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+#    Project: image manipulation in  OpenCL
+#             https://github.com/silx-kit/silx
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+"""
+Simple test of image manipulation
+"""
+
+from __future__ import division, print_function
+
+__authors__ = ["Jérôme Kieffer"]
+__contact__ = "jerome.kieffer@esrf.eu"
+__license__ = "MIT"
+__copyright__ = "2017 European Synchrotron Radiation Facility, Grenoble, France"
+__date__ = "13/02/2018"
+
+import logging
+import numpy
+
+import unittest
+from ..common import ocl, _measure_workgroup_size
+if ocl:
+    import pyopencl
+    import pyopencl.array
+from ...test.utils import utilstest
+from ..image import ImageProcessing
+logger = logging.getLogger(__name__)
+try:
+    from PIL import Image
+except ImportError:
+    Image = None
+
+
+@unittest.skipUnless(ocl and Image, "PyOpenCl/Image is missing")
+class TestImage(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        super(TestImage, cls).setUpClass()
+        if ocl:
+            cls.ctx = ocl.create_context()
+            cls.lena = utilstest.getfile("lena.png")
+            cls.data = numpy.asarray(Image.open(cls.lena))
+            cls.ip = ImageProcessing(ctx=cls.ctx, template=cls.data, profile=True)
+
+    @classmethod
+    def tearDownClass(cls):
+        super(TestImage, cls).tearDownClass()
+        cls.ctx = None
+        cls.lena = None
+        cls.data = None
+        if logger.level <= logging.INFO:
+            logger.warning("\n".join(cls.ip.log_profile()))
+        cls.ip = None
+
+    def setUp(self):
+        if ocl is None:
+            return
+        self.data = numpy.asarray(Image.open(self.lena))
+
+    def tearDown(self):
+        self.img = self.data = None
+
+    @unittest.skipUnless(ocl, "pyopencl is missing")
+    def test_cast(self):
+        """
+        tests the cast kernel
+        """
+        res = self.ip.to_float(self.data)
+        self.assertEqual(res.shape, self.data.shape, "shape")
+        self.assertEqual(res.dtype, numpy.float32, "dtype")
+        self.assertEqual(abs(res - self.data).max(), 0, "content")
+
+    @unittest.skipUnless(ocl, "pyopencl is missing")
+    def test_normalize(self):
+        """
+        tests that all devices are working properly ...
+        """
+        tmp = pyopencl.array.empty(self.ip.ctx, self.data.shape, "float32")
+        res = self.ip.to_float(self.data, out=tmp)
+        res2 = self.ip.normalize(tmp, -100, 100, copy=False)
+        norm = (self.data.astype(numpy.float32) - self.data.min()) / (self.data.max() - self.data.min())
+        ref2 = 200 * norm - 100
+        self.assertLess(abs(res2 - ref2).max(), 3e-5, "content")
+
+    @unittest.skipUnless(ocl, "pyopencl is missing")
+    def test_histogram(self):
+        """
+        Test on a greyscaled image ... of Lena :)
+        """
+        lena_bw = (0.2126 * self.data[:, :, 0] +
+                   0.7152 * self.data[:, :, 1] +
+                   0.0722 * self.data[:, :, 2]).astype("int32")
+        ref = numpy.histogram(lena_bw, 255)
+        ip = ImageProcessing(ctx=self.ctx, template=lena_bw, profile=True)
+        res = ip.histogram(lena_bw, 255)
+        ip.log_profile()
+        delta = (ref[0] - res[0])
+        deltap = (ref[1] - res[1])
+        self.assertEqual(delta.sum(), 0, "errors are self-compensated")
+        self.assertLessEqual(abs(delta).max(), 1, "errors are small")
+        self.assertLessEqual(abs(deltap).max(), 3e-5, "errors on position are small: %s" % (abs(deltap).max()))
+
+
+def suite():
+    testSuite = unittest.TestSuite()
+    testSuite.addTest(TestImage("test_cast"))
+    testSuite.addTest(TestImage("test_normalize"))
+    testSuite.addTest(TestImage("test_histogram"))
+    return testSuite
+
+
+if __name__ == '__main__':
+    unittest.main(defaultTest="suite")
diff --git a/silx/opencl/test/test_projection.py b/silx/opencl/test/test_projection.py
index c9a3a1c..7631128 100644
--- a/silx/opencl/test/test_projection.py
+++ b/silx/opencl/test/test_projection.py
@@ -30,7 +30,7 @@ from __future__ import division, print_function
 __authors__ = ["Pierre paleo"]
 __license__ = "MIT"
 __copyright__ = "2013-2017 European Synchrotron Radiation Facility, Grenoble, France"
-__date__ = "14/06/2017"
+__date__ = "19/01/2018"
 
 
 import time
@@ -49,35 +49,31 @@ from silx.test.utils import utilstest
 logger = logging.getLogger(__name__)
 
 
-
 @unittest.skipUnless(ocl and mako, "PyOpenCl is missing")
 class TestProj(unittest.TestCase):
 
     def setUp(self):
         if ocl is None:
             return
-        #~ if sys.platform.startswith('darwin'):
-            #~ self.skipTest("Projection is not implemented on CPU for OS X yet")
+        # ~ if sys.platform.startswith('darwin'):
+            # ~ self.skipTest("Projection is not implemented on CPU for OS X yet")
         self.getfiles()
         n_angles = self.sino.shape[0]
         self.proj = projection.Projection(self.phantom.shape, n_angles)
-        if self.proj.compiletime_workgroup_size < 16:
+        if self.proj.compiletime_workgroup_size < 16 * 16:
             self.skipTest("Current implementation of OpenCL projection is not supported on this platform yet")
 
-
     def tearDown(self):
         self.phantom = None
         self.sino = None
         self.proj = None
 
-
     def getfiles(self):
         # load 512x512 MRI phantom
         self.phantom = np.load(utilstest.getfile("Brain512.npz"))["data"]
         # load sinogram computed with PyHST
         self.sino = np.load(utilstest.getfile("sino500_pyhst.npz"))["data"]
 
-
     def measure(self):
         "Common measurement of timings"
         t1 = time.time()
@@ -89,7 +85,6 @@ class TestProj(unittest.TestCase):
         t2 = time.time()
         return t2 - t1, result
 
-
     def compare(self, res):
         """
         Compare a result with the reference reconstruction.
@@ -100,7 +95,6 @@ class TestProj(unittest.TestCase):
         ref = self.sino
         return np.max(np.abs(res - ref))
 
-
     @unittest.skipUnless(ocl and mako, "pyopencl is missing")
     def test_proj(self):
         """
@@ -127,13 +121,11 @@ class TestProj(unittest.TestCase):
             self.assertTrue(errmax < 1.e-6, "Max error is too high")
 
 
-
 def suite():
     testSuite = unittest.TestSuite()
     testSuite.addTest(TestProj("test_proj"))
     return testSuite
 
 
-
 if __name__ == '__main__':
     unittest.main(defaultTest="suite")