diff options
Diffstat (limited to 'src/silx/resources/opencl')
21 files changed, 4964 insertions, 0 deletions
diff --git a/src/silx/resources/opencl/addition.cl b/src/silx/resources/opencl/addition.cl new file mode 100644 index 0000000..35d7996 --- /dev/null +++ b/src/silx/resources/opencl/addition.cl @@ -0,0 +1,42 @@ +/* + * Project: SIFT: An algorithm for image alignement + * + * Copyright (C) 2013-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. Kieffer (kieffer@esrf.fr) + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + + +// "Hello_world" kernel to test if OpenCL is actually working +kernel void addition(global float* a, + global float* b, + global float* res, + int N) +{ + int i = get_global_id(0); + if( i<N ){ + res[i] = a[i] + b[i]; + } +} diff --git a/src/silx/resources/opencl/array_utils.cl b/src/silx/resources/opencl/array_utils.cl new file mode 100644 index 0000000..6f78921 --- /dev/null +++ b/src/silx/resources/opencl/array_utils.cl @@ -0,0 +1,73 @@ +/** + * 2D Memcpy for float* arrays, + * replacing pyopencl "enqueue_copy" which does not work for rectangular copies. + * ALL THE SIZES/OFFSETS ARE SPECIFIED IN PIXELS, NOT IN BYTES. + * In the (x, y) convention, x is the fast index (as in CUDA). + * + * :param dst: destination array + * :param src: source array + * :param dst_width: width of the dst array + * :param src_width: width of the src array + * :param dst_offset: tuple with the offset (x, y) in the dst array + * :param src_offset: tuple with the offset (x, y) in the src array + * :param transfer_shape: shape of the transfer array in the form (x, y) + * + */ +kernel void cpy2d( + global float* dst, + global float* src, + int dst_width, + int src_width, + int2 dst_offset, + int2 src_offset, + int2 transfer_shape) +{ + int gidx = get_global_id(0), gidy = get_global_id(1); + if (gidx < transfer_shape.x && gidy < transfer_shape.y) { + dst[(dst_offset.y + gidy)*dst_width + (dst_offset.x + gidx)] = src[(src_offset.y + gidy)*src_width + (src_offset.x + gidx)]; + } +} + + +// Looks like cfloat_t and cfloat_mul are not working, yet specified in +// pyopencl documentation. 
Here we are using float2 as in all available examples +// #include <pyopencl-complex.h> +// typedef cfloat_t complex; + +static inline float2 complex_mul(float2 a, float2 b) { + float2 res = (float2) (0, 0); + res.x = a.x * b.x - a.y * b.y; + res.y = a.y * b.x + a.x * b.y; + return res; +} + +// arr2D *= arr1D (line by line, i.e along fast dim) +kernel void inplace_complex_mul_2Dby1D( + global float2* arr2D, + global float2* arr1D, + int width, + int height) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if ((x >= width) || (y >= height)) return; + int i = y*width + x; + arr2D[i] = complex_mul(arr2D[i], arr1D[x]); +} + + +// arr3D *= arr1D (along fast dim) +kernel void inplace_complex_mul_3Dby1D( + global float2* arr3D, + global float2* arr1D, + int width, + int height, + int depth) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + if ((x >= width) || (y >= height) || (z >= depth)) return; + int i = (z*height + y)*width + x; + arr3D[i] = complex_mul(arr3D[i], arr1D[x]); +} diff --git a/src/silx/resources/opencl/backproj.cl b/src/silx/resources/opencl/backproj.cl new file mode 100644 index 0000000..da15131 --- /dev/null +++ b/src/silx/resources/opencl/backproj.cl @@ -0,0 +1,232 @@ +/* + * Project: silx: filtered backprojection + * + * Copyright (C) 2016-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: A. Mirone + * P. 
Paleo + * + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + + +/*******************************************************************************/ +/************************ GPU VERSION (with textures) **************************/ +/*******************************************************************************/ + +#ifndef DONT_USE_TEXTURES +kernel void backproj_kernel( + int num_proj, + int num_bins, + float axis_position, + global float *d_SLICE, + read_only image2d_t d_sino, + float gpu_offset_x, + float gpu_offset_y, + global float * d_cos_s, // precalculated cos(theta[i]) + global float* d_sin_s, // precalculated sin(theta[i]) + global float* d_axis_s, // array of axis positions (n_projs) + local float* shared2) // 768B of local mem +{ + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; + const int tidx = get_local_id(0); //threadIdx.x; + const int bidx = get_group_id(0); //blockIdx.x; + const int tidy = get_local_id(1); //threadIdx.y; + const int bidy = get_group_id(1); //blockIdx.y; + + local float sh_cos[256]; + local float sh_sin[256]; + local float sh_axis[256]; + + float pcos, psin; + float h0, h1, h2, h3; + const float apos_off_x= gpu_offset_x - axis_position ; + const float apos_off_y= gpu_offset_y - axis_position ; + float acorr05; + float res0 = 0, res1 = 0, res2 = 0, res3 = 0; + + const float bx00 = (32 * bidx + 2 * tidx + 0 + apos_off_x ) ; + const float by00 = (32 * bidy + 2 * tidy + 0 + apos_off_y ) ; + + int read=0; + for(int proj=0; proj<num_proj; proj++) { + if(proj>=read) { + barrier(CLK_LOCAL_MEM_FENCE); + int ip = tidy*16+tidx; + if( read+ip < num_proj) { + sh_cos [ip] = d_cos_s[read+ip] ; + sh_sin [ip] = d_sin_s[read+ip] ; + sh_axis[ip] = d_axis_s[read+ip] ; + } + read=read+256; // 256=16*16 block size + barrier(CLK_LOCAL_MEM_FENCE); + } + pcos = sh_cos[256-read + proj] ; + psin = sh_sin[256-read + proj] ; + + acorr05 = sh_axis[256 - read + proj] ; + + h0 = (acorr05 + bx00*pcos - by00*psin); + h1 = (acorr05 + (bx00+0)*pcos - (by00+1)*psin); + 
h2 = (acorr05 + (bx00+1)*pcos - (by00+0)*psin); + h3 = (acorr05 + (bx00+1)*pcos - (by00+1)*psin); + + if(h0>=0 && h0<num_bins) res0 += read_imagef(d_sino, sampler, (float2) (h0 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h0 +0.5f,proj +0.5f); + if(h1>=0 && h1<num_bins) res1 += read_imagef(d_sino, sampler, (float2) (h1 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h1 +0.5f,proj +0.5f); + if(h2>=0 && h2<num_bins) res2 += read_imagef(d_sino, sampler, (float2) (h2 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h2 +0.5f,proj +0.5f); + if(h3>=0 && h3<num_bins) res3 += read_imagef(d_sino, sampler, (float2) (h3 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h3 +0.5f,proj +0.5f); + } + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 0] = res0; + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 0] = res1; + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 1] = res2; + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 1] = res3; +} +#endif + + + + +/*******************************************************************************/ +/********************* CPU VERSION (without textures) **************************/ +/*******************************************************************************/ + + +#define CLIP_MAX(x, N) (fmin(fmax(x, 0.0f), (N - 1.0f))) + +#define FLOORCEIL_x(x) {\ + xm = (int) floor(x);\ + xp = (int) ceil(x);\ +} + +#define ADJACENT_PIXELS_VALS(arr, Nx, y, xm, xp) ((float2) (arr[y*Nx+xm], arr[y*Nx+xp])) + +//Simple linear interpolator for working on the GPU +static float linear_interpolation(float2 vals, + float x, + int xm, + int xp) +{ + if (xm == xp) + return vals.s0; + else + return (vals.s0 * (xp - x)) + (vals.s1 * (x - xm)); +} + +/** + * + * Same kernel as backproj_kernel, but targets the CPU (no texture) + * +**/ +kernel void backproj_cpu_kernel( + int num_proj, + int num_bins, + float axis_position, + global float *d_SLICE, + global float* d_sino, + float gpu_offset_x, + float gpu_offset_y, 
+ global float * d_cos_s, // precalculated cos(theta[i]) + global float * d_sin_s, // precalculated sin(theta[i]) + global float * d_axis_s, // array of axis positions (n_projs) + local float* shared2) // 768B of local mem +{ + const int tidx = get_local_id(0); //threadIdx.x; + const int bidx = get_group_id(0); //blockIdx.x; + const int tidy = get_local_id(1); //threadIdx.y; + const int bidy = get_group_id(1); //blockIdx.y; + + local float sh_cos[256]; + local float sh_sin[256]; + local float sh_axis[256]; + + float pcos, psin; + float h0, h1, h2, h3; + const float apos_off_x= gpu_offset_x - axis_position ; + const float apos_off_y= gpu_offset_y - axis_position ; + float acorr05; + float res0 = 0, res1 = 0, res2 = 0, res3 = 0; + + const float bx00 = (32 * bidx + 2 * tidx + 0 + apos_off_x ) ; + const float by00 = (32 * bidy + 2 * tidy + 0 + apos_off_y ) ; + + int read=0; + for(int proj=0; proj<num_proj; proj++) { + if(proj>=read) { + barrier(CLK_LOCAL_MEM_FENCE); + int ip = tidy*16+tidx; + if( read+ip < num_proj) { + sh_cos [ip] = d_cos_s[read+ip] ; + sh_sin [ip] = d_sin_s[read+ip] ; + sh_axis[ip] = d_axis_s[read+ip] ; + } + read=read+256; // 256=16*16 block size + barrier(CLK_LOCAL_MEM_FENCE); + } + pcos = sh_cos[256-read + proj] ; + psin = sh_sin[256-read + proj] ; + + acorr05 = sh_axis[256 - read + proj] ; + + h0 = (acorr05 + bx00*pcos - by00*psin); + h1 = (acorr05 + (bx00+0)*pcos - (by00+1)*psin); + h2 = (acorr05 + (bx00+1)*pcos - (by00+0)*psin); + h3 = (acorr05 + (bx00+1)*pcos - (by00+1)*psin); + + + float x; + int ym, xm, xp; + ym = proj; + float2 vals; + + if(h0>=0 && h0<num_bins) { + x = CLIP_MAX(h0, num_bins); + FLOORCEIL_x(x); + vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp); + res0 += linear_interpolation(vals, x, xm, xp); + } + if(h1>=0 && h1<num_bins) { + x = CLIP_MAX(h1, num_bins); + FLOORCEIL_x(x); + vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp); + res1 += linear_interpolation(vals, x, xm, xp); + } + if(h2>=0 && h2<num_bins) { + 
x = CLIP_MAX(h2, num_bins); + FLOORCEIL_x(x); + vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp); + res2 += linear_interpolation(vals, x, xm, xp); + } + if(h3>=0 && h3<num_bins) { + x = CLIP_MAX(h3, num_bins); + FLOORCEIL_x(x); + vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp); + res3 += linear_interpolation(vals, x, xm, xp); + } + } + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 0] = res0; + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 0] = res1; + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 1] = res2; + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 1] = res3; +} + diff --git a/src/silx/resources/opencl/backproj_helper.cl b/src/silx/resources/opencl/backproj_helper.cl new file mode 100644 index 0000000..b1590f8 --- /dev/null +++ b/src/silx/resources/opencl/backproj_helper.cl @@ -0,0 +1,68 @@ +/* + * Project: silx: backprojection helper functions + * + * Copyright (C) 2016-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: P. Paleo + * J. Kieffer (kieffer@esrf.fr) + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + + +kernel void mult( global float2* d_sino, + global float2* d_filter, + int num_bins, + int num_projs) +{ + int gid0 = get_global_id(0); + int gid1 = get_global_id(1); + if (gid0 < num_bins && gid1 < num_projs) + { + // d_sino[gid1*num_bins+gid0] *= d_filter[gid0]; + d_sino[gid1*num_bins+gid0].x *= d_filter[gid0].x; + d_sino[gid1*num_bins+gid0].y *= d_filter[gid0].x; + } +} + +// copy only the real part of the valid data to the real array +kernel void cpy2d_c2r( + global float* d_sino, + global float2* d_sino_complex, + int num_bins, + int num_projs, + int fft_size) +{ + int gid0 = get_global_id(0); + int gid1 = get_global_id(1); + if (gid0 < num_bins && gid1 < num_projs) { + d_sino[gid1*num_bins+gid0] = d_sino_complex[gid1*fft_size+gid0].x; + } +} + + + + + + + diff --git a/src/silx/resources/opencl/bitonic.cl b/src/silx/resources/opencl/bitonic.cl new file mode 100644 index 0000000..4096ce8 --- /dev/null +++ b/src/silx/resources/opencl/bitonic.cl @@ -0,0 +1,569 @@ +/*############################################################################ +# Sort elements within a vector by Matthew Scarpino, +# Taken from his book "OpenCL in Action", +# November 2011 ISBN 9781617290176 +# Original license for the code: "public domain" +# +# Originally this code is public domain. The MIT license has been added +# by J. Kieffer (jerome.kieffer@esrf.eu) to provide a disclaimer. +# J. Kieffer does not claim authorship of this code developed by . 
+# +# Copyright (c) 2011 Matthew Scarpino +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+# +#############################################################################*/ + +#define VECTOR_SORT_BOOK(input, dir) \ + comp = abs(input > shuffle(input, mask2)) ^ dir; \ + input = shuffle(input, comp * 2 + add2); \ + comp = abs(input > shuffle(input, mask1)) ^ dir; \ + input = shuffle(input, comp + add1); \ + + +#define VECTOR_SWAP_BOOK(in1, in2, dir) \ + input1 = in1; input2 = in2; \ + comp = (abs(input1 > input2) ^ dir) * 4 + add3; \ + in1 = shuffle2(input1, input2, comp); \ + in2 = shuffle2(input2, input1, comp); \ + + +// The _FILE extension correspond to the formula found in the "OpenCL in Action" supplementary files +#define VECTOR_SORT_FILE(input, dir)\ + comp = (input < shuffle(input, mask2)) ^ dir;\ + input = shuffle(input, as_uint4(comp * 2 + add2));\ + comp = (input < shuffle(input, mask1)) ^ dir;\ + input = shuffle(input, as_uint4(comp + add1));\ + + +#define VECTOR_SWAP_FILE(input1, input2, dir)\ + temp = input1;\ + comp = ((input1 < input2) ^ dir) * 4 + add3;\ + input1 = shuffle2(input1, input2, as_uint4(comp));\ + input2 = shuffle2(input2, temp, as_uint4(comp));\ + + + +// Functions to be called from an actual kernel. 
+ +static float8 my_sort_file(uint local_id, uint group_id, uint local_size, + float8 input, local float4 *l_data) +{ + float4 input1, input2, temp; + float8 output; + + int dir; + uint id, size, stride; + int4 comp; + + uint4 mask1 = (uint4)(1, 0, 3, 2); + uint4 mask2 = (uint4)(2, 3, 0, 1); + uint4 mask3 = (uint4)(3, 2, 1, 0); + + int4 add1 = (int4)(1, 1, 3, 3); + int4 add2 = (int4)(2, 3, 2, 3); + int4 add3 = (int4)(1, 2, 2, 3); + + // retrieve input data + input1 = (float4)(input.s0, input.s1, input.s2, input.s3); + input2 = (float4)(input.s4, input.s5, input.s6, input.s7); + + // Find global address + id = local_id * 2; + + /* Sort input 1 - ascending */ + comp = input1 < shuffle(input1, mask1); + input1 = shuffle(input1, as_uint4(comp + add1)); + comp = input1 < shuffle(input1, mask2); + input1 = shuffle(input1, as_uint4(comp * 2 + add2)); + comp = input1 < shuffle(input1, mask3); + input1 = shuffle(input1, as_uint4(comp + add3)); + + /* Sort input 2 - descending */ + comp = input2 > shuffle(input2, mask1); + input2 = shuffle(input2, as_uint4(comp + add1)); + comp = input2 > shuffle(input2, mask2); + input2 = shuffle(input2, as_uint4(comp * 2 + add2)); + comp = input2 > shuffle(input2, mask3); + input2 = shuffle(input2, as_uint4(comp + add3)); + + /* Swap corresponding elements of input 1 and 2 */ + add3 = (int4)(4, 5, 6, 7); + dir = - (int) (local_id % 2); + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + + /* Sort data and store in local memory */ + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + barrier(CLK_LOCAL_MEM_FENCE); + + /* Create bitonic set */ + for(size = 2; size < local_size; size <<= 1) { + dir = - (int) (local_id/size & 1) ; + + for(stride = size; stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = local_id + (local_id/stride)*stride; + 
VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) + } + + barrier(CLK_LOCAL_MEM_FENCE); + id = local_id * 2; + input1 = l_data[id]; + input2 = l_data[id+1]; + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + barrier(CLK_LOCAL_MEM_FENCE); + } + + /* Perform bitonic merge */ + dir = - (int) (group_id % 2); + for(stride = local_size; stride > 1; stride >>= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + id = local_id + (local_id/stride)*stride; + VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + /* Perform final sort */ + id = local_id * 2; + input1 = l_data[id]; + input2 = l_data[id+1]; + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + + // setup output and return it + output = (float8)(input1, input2); + return output; +} + +static float8 my_sort_book(uint local_id, uint group_id, uint local_size, + float8 input, local float4 *l_data) +{ + float4 input1, input2, temp; + float8 output; + uint4 comp, swap, mask1, mask2, add1, add2, add3; + uint id, dir, size, stride; + mask1 = (uint4)(1, 0, 3, 2); + swap = (uint4)(0, 0, 1, 1); + add1 = (uint4)(0, 0, 2, 2); + mask2 = (uint4)(2, 3, 0, 1); + add2 = (uint4)(0, 1, 0, 1); + add3 = (uint4)(0, 1, 2, 3); + + // retrieve input data + input1 = (float4)(input.s0, input.s1, input.s2, input.s3); + input2 = (float4)(input.s4, input.s5, input.s6, input.s7); + + // Find global address + id = local_id * 2; + + //Sort first vector + + comp = abs(input1 > shuffle(input1, mask1)); + input1 = shuffle(input1, comp ^ swap + add1); + comp = abs(input1 > shuffle(input1, mask2)); + input1 = 
shuffle(input1, comp * 2 + add2); + comp = abs(input1 > shuffle(input1, mask1)); + input1 = shuffle(input1, comp + add1); + + //Sort second vector + comp = abs(input2 < shuffle(input2, mask1)); + input2 = shuffle(input2, comp ^ swap + add1); + comp = abs(input2 < shuffle(input2, mask2)); + input2 = shuffle(input2, comp * 2 + add2); + comp = abs(input2 < shuffle(input2, mask1)); + input2 = shuffle(input2, comp + add1); + + // Swap elements + dir = local_id % 2; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + + // Perform upper stages + for(size = 2; size < local_size; size <<= 1) + { + dir = local_id/size & 1; + + //Perform lower stages + for(stride = size; stride > 1; stride >>= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + id = local_id + (local_id/stride)*stride; + VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + id = local_id * 2; + input1 = l_data[id]; + input2 = l_data[id+1]; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + } + dir = group_id % 2; + + // Perform bitonic merge + for(stride = local_size; stride > 1; stride >>= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + id = local_id + (local_id/stride)*stride; + VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + id = local_id * 2; + input1 = l_data[id]; input2 = l_data[id+1]; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + + // setup 
output and return it + output = (float8)(input1, input2); + return output; +} + + + +////////////// +// Kernels +////////////// + +// Perform the sort on the whole array +// dim0: wg=number_of_element/8 + +kernel void bsort_all(global float4 *g_data, + local float4 *l_data) +{ + float4 input1, input2; + float8 input, output; + uint id, global_start; + // Find global address + id = get_local_id(0) * 2; + global_start = get_group_id(0) * get_local_size(0) * 2 + id; + + input1 = g_data[global_start]; + input2 = g_data[global_start+1]; + input = (float8)(input1, input2); + output = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0), + input, l_data); + input1 = (float4) (output.s0, output.s1, output.s2, output.s3); + input2 = (float4) (output.s4, output.s5, output.s6, output.s7); + g_data[global_start] = input1; + g_data[global_start+1] = input2; +} + + +// Perform the sort along the horizontal axis of a 2D image +// dim0 = y: wg=1 +// dim1 = x: wg=number_of_element/8 +kernel void bsort_horizontal(global float *g_data, + local float4 *l_data) +{ + float8 input, output; + uint id, global_start, offset; + + // Find global address + offset = get_global_size(1)*get_global_id(0)*8; + id = get_local_id(1) * 8; + global_start = offset + get_group_id(1) * get_local_size(1) * 8 + id; + + input = (float8)(g_data[global_start ], + g_data[global_start + 1], + g_data[global_start + 2], + g_data[global_start + 3], + g_data[global_start + 4], + g_data[global_start + 5], + g_data[global_start + 6], + g_data[global_start + 7]); + + output = my_sort_file(get_local_id(1), get_group_id(1), get_local_size(1), + input, l_data); + + g_data[global_start ] = output.s0; + g_data[global_start + 1] = output.s1; + g_data[global_start + 2] = output.s2; + g_data[global_start + 3] = output.s3; + g_data[global_start + 4] = output.s4; + g_data[global_start + 5] = output.s5; + g_data[global_start + 6] = output.s6; + g_data[global_start + 7] = output.s7; +} + + +// Perform the sort along 
the vertical axis +// dim0 = y: wg=number_of_element/8 +// dim1 = x: wg=1 +// check if transposing +bsort_horizontal is not more efficient ? + +kernel void bsort_vertical(global float *g_data, + local float4 *l_data) +{ + // we need to read 8 float position along the vertical axis + float8 input, output; + uint id, global_start, padding; + + // Find global address + padding = get_global_size(1); + id = get_local_id(0) * 8 * padding + get_global_id(1); + global_start = get_group_id(0) * get_local_size(0) * 8 * padding + id; + + input = (float8)(g_data[global_start ], + g_data[global_start + padding ], + g_data[global_start + 2*padding], + g_data[global_start + 3*padding], + g_data[global_start + 4*padding], + g_data[global_start + 5*padding], + g_data[global_start + 6*padding], + g_data[global_start + 7*padding]); + + output = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0), + input, l_data); + g_data[global_start ] = output.s0; + g_data[global_start + padding ] = output.s1; + g_data[global_start + 2*padding ] = output.s2; + g_data[global_start + 3*padding ] = output.s3; + g_data[global_start + 4*padding ] = output.s4; + g_data[global_start + 5*padding ] = output.s5; + g_data[global_start + 6*padding ] = output.s6; + g_data[global_start + 7*padding ] = output.s7; +} + + +//Tested working reference kernel frm the book. 
This only works under Linux +kernel void bsort_book(global float4 *g_data, + local float4 *l_data) { + float4 input1, input2, temp; + uint4 comp, swap, mask1, mask2, add1, add2, add3; + uint id, dir, global_start, size, stride; + mask1 = (uint4)(1, 0, 3, 2); + swap = (uint4)(0, 0, 1, 1); + add1 = (uint4)(0, 0, 2, 2); + mask2 = (uint4)(2, 3, 0, 1); + add2 = (uint4)(0, 1, 0, 1); + add3 = (uint4)(0, 1, 2, 3); + + // Find global address + id = get_local_id(0) * 2; + global_start = get_group_id(0) * get_local_size(0) * 2 + id; + + //Sort first vector + input1 = g_data[global_start]; + input2 = g_data[global_start+1]; + comp = abs(input1 > shuffle(input1, mask1)); + input1 = shuffle(input1, comp ^ swap + add1); + comp = abs(input1 > shuffle(input1, mask2)); + input1 = shuffle(input1, comp * 2 + add2); + comp = abs(input1 > shuffle(input1, mask1)); + input1 = shuffle(input1, comp + add1); + + //Sort second vector + comp = abs(input2 < shuffle(input2, mask1)); + input2 = shuffle(input2, comp ^ swap + add1); + comp = abs(input2 < shuffle(input2, mask2)); + input2 = shuffle(input2, comp * 2 + add2); + comp = abs(input2 < shuffle(input2, mask1)); + input2 = shuffle(input2, comp + add1); + + // Swap elements + dir = get_local_id(0) % 2; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + + // Perform upper stages + for(size = 2; size < get_local_size(0); size <<= 1) { + dir = get_local_id(0)/size & 1; + + //Perform lower stages + for(stride = size; stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = get_local_id(0) + + (get_local_id(0)/stride)*stride; + VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + id = get_local_id(0) * 2; + input1 = l_data[id]; + input2 = l_data[id+1]; + temp = input1; + comp = 
(abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + } + dir = get_group_id(0) % 2; + // Perform bitonic merge + for(stride = get_local_size(0); stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = get_local_id(0) + + (get_local_id(0)/stride)*stride; + VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + id = get_local_id(0) * 2; + input1 = l_data[id]; input2 = l_data[id+1]; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + g_data[global_start] = input1; + g_data[global_start+1] = input2; + } + +//Tested working reference kernel from the addition files. This only works under any operating system +/* Perform initial sort */ +kernel void bsort_file(global float4 *g_data, local float4 *l_data) { + + int dir; + uint id, global_start, size, stride; + float4 input1, input2, temp; + int4 comp; + + uint4 mask1 = (uint4)(1, 0, 3, 2); + uint4 mask2 = (uint4)(2, 3, 0, 1); + uint4 mask3 = (uint4)(3, 2, 1, 0); + + int4 add1 = (int4)(1, 1, 3, 3); + int4 add2 = (int4)(2, 3, 2, 3); + int4 add3 = (int4)(1, 2, 2, 3); + + id = get_local_id(0) * 2; + global_start = get_group_id(0) * get_local_size(0) * 2 + id; + + input1 = g_data[global_start]; + input2 = g_data[global_start+1]; + + /* Sort input 1 - ascending */ + comp = input1 < shuffle(input1, mask1); + input1 = shuffle(input1, as_uint4(comp + add1)); + comp = input1 < shuffle(input1, mask2); + input1 = shuffle(input1, as_uint4(comp * 2 + add2)); + comp = input1 < shuffle(input1, mask3); + input1 = shuffle(input1, as_uint4(comp + add3)); + + /* Sort input 2 - descending */ + comp = input2 > shuffle(input2, mask1); + input2 = 
shuffle(input2, as_uint4(comp + add1)); + comp = input2 > shuffle(input2, mask2); + input2 = shuffle(input2, as_uint4(comp * 2 + add2)); + comp = input2 > shuffle(input2, mask3); + input2 = shuffle(input2, as_uint4(comp + add3)); + + /* Swap corresponding elements of input 1 and 2 */ + add3 = (int4)(4, 5, 6, 7); + dir = - (int)(get_local_id(0) % 2); + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + + /* Sort data and store in local memory */ + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + + /* Create bitonic set */ + for(size = 2; size < get_local_size(0); size <<= 1) { + dir = - (int)(get_local_id(0)/size & 1); + + for(stride = size; stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = get_local_id(0) + (get_local_id(0)/stride)*stride; + VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) + } + + barrier(CLK_LOCAL_MEM_FENCE); + id = get_local_id(0) * 2; + input1 = l_data[id]; input2 = l_data[id+1]; + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + } + + /* Perform bitonic merge */ + dir = - (int)(get_group_id(0) % 2); + for(stride = get_local_size(0); stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = get_local_id(0) + (get_local_id(0)/stride)*stride; + VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + /* Perform final sort */ + id = get_local_id(0) * 2; + input1 = l_data[id]; input2 = l_data[id+1]; + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + 
VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + g_data[global_start] = input1; + g_data[global_start+1] = input2; +} + diff --git a/src/silx/resources/opencl/codec/bitshuffle_lz4.cl b/src/silx/resources/opencl/codec/bitshuffle_lz4.cl new file mode 100644 index 0000000..71f617a --- /dev/null +++ b/src/silx/resources/opencl/codec/bitshuffle_lz4.cl @@ -0,0 +1,625 @@ +/* + * Project: SILX: Bitshuffle LZ4 decompressor + * + * Copyright (C) 2022 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. Kieffer (kieffer@esrf.fr) + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* To decompress bitshuffle-LZ4 data in parallel on GPU one needs to: + * - Find all begining of blocks, this is performed by the ... kernel. + * - Decompress each block by one workgroup. 
+ * - bitshuffle the data from one workgroup + */ + +#ifndef LZ4_BLOCK_SIZE +# define LZ4_BLOCK_SIZE 8192 +#endif +#define LZ4_BLOCK_EXTRA 400 +#ifdef __ENDIAN_LITTLE__ +#define SWAP_BE 1 +#define SWAP_LE 0 +#else +#define SWAP_BE 0 +#define SWAP_LE 1 +#endif + + +#define int8_t char +#define uint8_t uchar +#define int16_t short +#define uint16_t ushort +#define int32_t int +#define uint32_t uint +#define int64_t long +#define uint64_t ulong + +#define position_t uint +#define token_t uchar2 + +//Some function used as part of bitshuffle: + +inline token_t decode_token(uint8_t value){ + return (token_t)(value >> 4, // literals + value & 0x0f); // matches +} + +inline bool has_liter_over(token_t token) +{ + return token.s0 >= 15; +} + +inline bool has_match_over(token_t token) +{ + return token.s1 >= 15; +} + +//parse overflow, return the number of overflow and the new position +inline uint2 read_overflow(local uint8_t* buffer, + position_t buffer_size, + position_t idx){ + position_t num = 0; + uint8_t next = 0xff; + while (next == 0xff && idx < buffer_size){ + next = buffer[idx]; + idx += 1; + num += next; + } + return (uint2)(num, idx); +} + +inline void copy_no_overlap(local uint8_t* dest, + const position_t dest_position, + local uint8_t* source, + const position_t src_position, + const position_t length){ + for (position_t i=get_local_id(0); i<length; i+=get_local_size(0)) { + dest[dest_position+i] = source[src_position+i]; + } +} + +inline void copy_repeat(local uint8_t* dest, + const position_t dest_position, + local uint8_t* source, + const position_t src_position, + const position_t dist, + const position_t length){ + + // if there is overlap, it means we repeat, so we just + // need to organize our copy around that + for (position_t i=get_local_id(0); i<length; i+=get_local_size(0)) { + dest[dest_position+i] = source[src_position + i%dist]; + } +} + +inline void copy_collab(local uint8_t* dest, + const position_t dest_position, + local uint8_t* source, + 
const position_t src_position, + const position_t dist, + const position_t length){ + //Generic copy function + if (dist < length) { + copy_repeat(dest, dest_position, source, src_position, dist, length); + } + else { + copy_no_overlap(dest, dest_position, source, src_position, length); + } +} + +// Function to read larger integers at various position. Endianness is addressed as well with the swap flag +uint64_t load64_at(global uint8_t *src, + const uint64_t position, + const bool swap){ + uchar8 vector; + if (swap){ + vector = (uchar8)(src[position+7],src[position+6], + src[position+5],src[position+4], + src[position+3],src[position+2], + src[position+1],src[position+0]); + } + else{ + vector = (uchar8)(src[position+0],src[position+1], + src[position+2],src[position+3], + src[position+4],src[position+5], + src[position+6],src[position+7]); + } + return as_ulong(vector); +} + +uint32_t load32_at(global uint8_t *src, + const uint64_t position, + const bool swap){ + uchar4 vector; + if (swap){ + vector = (uchar4)( + src[position+3],src[position+2], + src[position+1],src[position+0]); + } + else{ + vector = (uchar4)(src[position+0],src[position+1], + src[position+2],src[position+3]); + } + return as_uint(vector); +} + +uint16_t load16_at(local uint8_t *src, + const uint64_t position, + const bool swap){ + uchar2 vector; + if (swap){ + vector = (uchar2)(src[position+1],src[position+0]); + } + else{ + vector = (uchar2)(src[position+0],src[position+1]); + } + return as_ushort(vector); +} + +//Calculate the begining and the end of the block corresponding to the block=gid +inline void _lz4_unblock(global uint8_t *src, + const uint64_t size, + local uint64_t *block_position){ + uint32_t gid = get_group_id(0); + uint32_t lid = get_local_id(0); + if (lid == 0){ + uint64_t block_start=16; + uint32_t block_size = load32_at(src, 12, SWAP_BE); + uint64_t block_end = block_start + block_size; + + for (uint32_t block_idx=0; block_idx<gid; block_idx++){ + // printf("gid %u idx %u 
%lu-%lu\n",gid, block_idx,block_start,block_end); + block_start = block_end + 4; + if (block_start>=size){ + printf("Read beyond end of source buffer at gid %u %lu>%lu\n",gid, block_start, size); + block_start = 0; + block_end = 0; + break; + } + block_size = load32_at(src, block_end, SWAP_BE); + block_end = block_start + block_size; + } + block_position[0] = block_start; + block_position[1] = block_end; +// if (gid>get_num_groups(0)-10) printf("Success finish unblock gid %u block: %lu - %lu\n",gid,block_start,block_end); + } + barrier(CLK_LOCAL_MEM_FENCE); +} + + +//Decompress one block in shared memory +inline uint32_t lz4_decompress_local_block( local uint8_t* local_cmp, + local uint8_t* local_dec, + const uint32_t cmp_buffer_size, + const uint32_t dec_buffer_size){ + + uint32_t gid = get_group_id(0); // One block is decompressed by one workgroup + uint32_t lid = get_local_id(0); // This is the thread position in the group... + uint32_t wg = get_local_size(0); // workgroup size + + position_t dec_idx = 0; + position_t cmp_idx = 0; + while (cmp_idx < cmp_buffer_size) { + // read header byte + token_t tok = decode_token(local_cmp[cmp_idx]); + // if (lid==0) printf("gid %u at idx %u/%u. Token is litterials: %u; matches: %u\n", gid, cmp_idx, cmp_buffer_size,tok.s0, tok.s1); + + cmp_idx+=1; + + // read the length of the literals + position_t num_literals = tok.s0; + if (has_liter_over(tok)) { + uint2 tmp = read_overflow(local_cmp, + cmp_buffer_size, + cmp_idx); + num_literals += tmp.s0; + cmp_idx = tmp.s1; + } + const position_t start_literal = cmp_idx; + + // copy the literals to the dst stream in parallel + // if (lid==0) printf("gid %u: copy literals from %u to %u <%u (len %u)\n", gid, cmp_idx,num_literals+cmp_idx,cmp_buffer_size,num_literals); + copy_no_overlap(local_dec, dec_idx, local_cmp, cmp_idx, num_literals); + cmp_idx += num_literals; + dec_idx += num_literals; + + // Note that the last sequence stops right after literals field. 
+ // There are specific parsing rules to respect to be compatible with the + // reference decoder : 1) The last 5 bytes are always literals 2) The last + // match cannot start within the last 12 bytes Consequently, a file with + // less then 13 bytes can only be represented as literals These rules are in + // place to benefit speed and ensure buffer limits are never crossed. + if (cmp_idx < cmp_buffer_size) { + + // read the offset + uint16_t offset = load16_at(local_cmp, cmp_idx, SWAP_LE); + // if (lid==0) printf("gid %u: offset is %u at %u\n",gid, offset, cmp_idx); + if (offset == 0) { + //corruped block + if (lid == 0) + printf("Corrupted block #%u\n", gid); + return 0; + } + + cmp_idx += 2; + + // read the match length + position_t match = 4 + tok.s1; + if (has_match_over(tok)) { + uint2 tmp = read_overflow(local_cmp, + cmp_buffer_size, + cmp_idx); + match += tmp.s0; + cmp_idx = tmp.s1; + } + + //syncronize threads before reading shared memory + barrier(CLK_LOCAL_MEM_FENCE); + + // copy match + copy_collab(local_dec, dec_idx, local_dec, dec_idx - offset, offset, match); + dec_idx += match; + } + } + //syncronize threads before reading shared memory + barrier(CLK_LOCAL_MEM_FENCE); + return dec_idx; +} + +//Perform the bifshuffling on 8-bits objects +inline void bitunshuffle8( local uint8_t* inp, + local uint8_t* out, + const uint32_t buffer_size){ //8k ... or less. +// uint32_t gid = get_group_id(0); + uint32_t lid = get_local_id(0); + uint32_t wg = get_local_size(0); + uint32_t u8_buffer_size = buffer_size; // /1 -> 8k + + // One thread deals with one or several output data + for (uint32_t dpos=lid; dpos<u8_buffer_size; dpos+=wg){ + uint8_t res = 0; + // read bits at several places... 
+ for (uint32_t bit=0; bit<8; bit++){ + uint32_t read_bit = bit*u8_buffer_size + dpos; + uint32_t u8_word_pos = read_bit>>3; // /8 + uint32_t u8_bit_pos = read_bit&7; // %8 + // if (lid==0) printf("dpos %u bit %u read at %u,%u\n",dpos,bit,u8_word_pos,u8_bit_pos); + res |= ((inp[u8_word_pos]>>u8_bit_pos) & 1)<<bit; + } + // if (lid==0) printf("dpos %u res %u\n",dpos,res); + out[dpos] = res; + } +} + + +//Perform the bifshuffling on 16-bits objects +inline void bitunshuffle16( local uint8_t* inp, + local uint8_t* out, + const uint32_t buffer_size){ //8k ... or less. +// uint32_t gid = get_group_id(0); + uint32_t lid = get_local_id(0); + uint32_t wg = get_local_size(0); + uint32_t u16_buffer_size = buffer_size>>1; // /2 -> 4k + + // One thread deals with one or several output data + for (uint32_t dpos=lid; dpos<u16_buffer_size; dpos+=wg){ + uint16_t res = 0; + // read bits at several places... + for (uint32_t bit=0; bit<16; bit++){ + uint32_t read_bit = bit*u16_buffer_size + dpos; + uint32_t u8_word_pos = read_bit>>3; // /8 + uint32_t u8_bit_pos = read_bit&7; // %8 + // if (lid==0) printf("dpos %u bit %u read at %u,%u\n",dpos,bit,u8_word_pos,u8_bit_pos); + res |= ((inp[u8_word_pos]>>u8_bit_pos) & 1)<<bit; + } + // if (lid==0) printf("dpos %u res %u\n",dpos,res); + uchar2 tmp = as_uchar2(res); + out[2*dpos] = tmp.s0; + out[2*dpos+1] = tmp.s1; + } +} + + +//Perform the bifshuffling on 32-bits objects +inline void bitunshuffle32( local uint8_t* inp, + local uint8_t* out, + const uint32_t buffer_size){ //8k ... or less. +// uint32_t gid = get_group_id(0); + uint32_t lid = get_local_id(0); + uint32_t wg = get_local_size(0); + uint32_t u32_buffer_size = buffer_size>>2; // /4 -> 2k + + // One thread deals with one or several output data + for (uint32_t dpos=lid; dpos<u32_buffer_size; dpos+=wg){ + uint32_t res = 0; + // read bits at several places... 
+ for (uint32_t bit=0; bit<32; bit++){ + uint32_t read_bit = bit*u32_buffer_size + dpos; + uint32_t u8_word_pos = read_bit>>3; // /8 + uint32_t u8_bit_pos = read_bit&7; // %8 + // if (lid==0) printf("dpos %u bit %u read at %u,%u\n",dpos,bit,u8_word_pos,u8_bit_pos); + res |= ((inp[u8_word_pos]>>u8_bit_pos) & 1)<<bit; + } + // if (lid==0) printf("dpos %u res %u\n",dpos,res); + uchar4 tmp = as_uchar4(res); + out[4*dpos] = tmp.s0; + out[4*dpos+1] = tmp.s1; + out[4*dpos+2] = tmp.s2; + out[4*dpos+3] = tmp.s3; + } +} + +//Perform the bifshuffling on 32-bits objects +inline void bitunshuffle64( local uint8_t* inp, + local uint8_t* out, + const uint32_t buffer_size){ //8k ... or less. +// uint32_t gid = get_group_id(0); + uint32_t lid = get_local_id(0); + uint32_t wg = get_local_size(0); + uint32_t u64_buffer_size = buffer_size>>3; // /8 -> 1k + + // One thread deals with one or several output data + for (uint32_t dpos=lid; dpos<u64_buffer_size; dpos+=wg){ + uint64_t res = 0; + // read bits at several places... 
+ for (uint32_t bit=0; bit<64; bit++){ + uint32_t read_bit = bit*u64_buffer_size + dpos; + uint32_t u8_word_pos = read_bit>>3; // /8 + uint32_t u8_bit_pos = read_bit&7; // %8 + // if (lid==0) printf("dpos %u bit %u read at %u,%u\n",dpos,bit,u8_word_pos,u8_bit_pos); + res |= ((inp[u8_word_pos]>>u8_bit_pos) & 1)<<bit; + } + // if (lid==0) printf("dpos %u res %u\n",dpos,res); + uchar8 tmp = as_uchar8(res); + out[8*dpos] = tmp.s0; + out[8*dpos+1] = tmp.s1; + out[8*dpos+2] = tmp.s2; + out[8*dpos+3] = tmp.s3; + out[8*dpos+4] = tmp.s4; + out[8*dpos+5] = tmp.s5; + out[8*dpos+6] = tmp.s6; + out[8*dpos+7] = tmp.s7; + } +} + + +/* Preprocessing kernel which performs: +- Memset arrays +- read block position stored in block_position array + +Param: +- src: input buffer in global memory +- size: input buffer size +- block_position: output buffer in local memory containing the index of the begining of each block +- max_blocks: allocated memory for block_position array (output) +- nb_blocks: output buffer with the actual number of blocks in src (output). + +Return: Nothing, this is a kernel + +Hint on workgroup size: little kernel ... wg=1, 1 wg is enough. +*/ + +kernel void lz4_unblock(global uint8_t *src, + const uint64_t size, + global uint64_t *block_start, + const uint32_t max_blocks, + global uint32_t *nb_blocks){ + + uint64_t total_nbytes = load64_at(src,0,SWAP_BE); + uint32_t block_nbytes = load32_at(src,8,SWAP_BE); + + uint32_t block_idx = 0; + uint64_t pos = 12; + uint32_t block_size; + + while ((pos+4<size) && (block_idx<max_blocks)){ + block_size = load32_at(src, pos, SWAP_BE); + block_start[block_idx] = pos + 4; + block_idx +=1; + pos += 4 + block_size; + } + nb_blocks[0] = block_idx; +} + +// decompress a frame blockwise. +// Needs the block position to be known in advance (block_start) calculated from lz4_unblock. +// one workgroup treats on block. 
+ +kernel void bslz4_decompress_block( global uint8_t* comp_src, + global uint8_t* dec_dest, + global uint64_t* block_start, + global uint32_t *nb_blocks, + const uint8_t item_size){ + + uint32_t gid = get_group_id(0); // One block is decompressed by one workgroup + uint32_t lid = get_local_id(0); // This is the thread position in the group... + uint32_t wg = get_local_size(0); // workgroup size + + //guard if the number of wg scheduled is too large + if (gid >=nb_blocks[0]) return; + + // No need to guard, the number of blocks can be calculated in advance. + uint64_t start_read = block_start[gid]; + if (start_read<12) return; + + local uint8_t local_cmp[LZ4_BLOCK_SIZE+LZ4_BLOCK_EXTRA]; + local uint8_t local_dec[LZ4_BLOCK_SIZE]; + + uint32_t cmp_buffer_size = load32_at(comp_src, start_read-4, SWAP_BE); + uint64_t end_read = start_read + cmp_buffer_size; + // Copy locally the compressed buffer and memset the destination buffer + for (uint32_t i=lid; i<cmp_buffer_size; i+=wg){ + uint64_t read_pos = start_read + i; + if (read_pos<end_read) + local_cmp[i] = comp_src[read_pos]; + else + local_cmp[i] = 0; + } + for (uint32_t i=lid+cmp_buffer_size; i<LZ4_BLOCK_SIZE+LZ4_BLOCK_EXTRA; i+=wg){ + local_cmp[i] = 0; + } + for (uint32_t i=lid; i<LZ4_BLOCK_SIZE; i+=wg){ + local_dec[i] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + //All the work is performed here: + uint32_t dec_size = lz4_decompress_local_block( local_cmp, local_dec, cmp_buffer_size, LZ4_BLOCK_SIZE); + + barrier(CLK_LOCAL_MEM_FENCE); + local uint8_t* local_buffer; + + //Perform bit-unshuffle + if (item_size == 1){ +// if ((gid==0) && (lid==0)) printf("bitunshuffle8"); + bitunshuffle8(local_dec, local_cmp, dec_size); + local_buffer=local_cmp; + } + else if (item_size == 2){ +// if ((gid==0) && (lid==0)) printf("bitunshuffle16"); + bitunshuffle16(local_dec, local_cmp, dec_size); + local_buffer=local_cmp; + } + else if (item_size == 4){ +// if ((gid==0) && (lid==0)) printf("bitunshuffle32"); + 
bitunshuffle32(local_dec, local_cmp, dec_size); + local_buffer=local_cmp; + } + else if (item_size == 8){ +// if ((gid==0) && (lid==0)) printf("bitunshuffle64"); + bitunshuffle64(local_dec, local_cmp, dec_size); + local_buffer=local_cmp; + } + else { + local_buffer = local_dec; + } + + + //Finally copy the destination data from local to global memory: + uint64_t start_write = LZ4_BLOCK_SIZE*gid; + barrier(CLK_LOCAL_MEM_FENCE); + for (uint32_t i=lid; i<dec_size; i+=wg){ + dec_dest[start_write + i] = local_buffer[i]; + } + + if (gid+1==get_num_groups(0)){ + uint64_t total_nbytes = load64_at(comp_src,0,SWAP_BE); + uint64_t end_write = dec_size + start_write; + int32_t remaining = total_nbytes - end_write; +// if (lid==0) printf("gid %u is last block has %u elements. Writing ends at %u/%lu, copy remaining %d\n",gid, dec_size, end_write, total_nbytes, remaining); + if ((remaining>0) && (remaining<item_size*8)){ + for (uint32_t i=lid; i<remaining; i++){ + dec_dest[end_write + i] = comp_src[end_read+i]; + } + } + } + +} + +// decompress a frame blockwise. +// block-start are searched by one thread from each workgroup ... not very efficient +// one workgroup treats on block. + +kernel void bslz4_decompress_frame( + global uint8_t* comp_src, + const uint64_t src_size, + global uint8_t* dec_dest, + const uint8_t item_size){ + + uint32_t gid = get_group_id(0); // One block is decompressed by one workgroup + uint32_t lid = get_local_id(0); // This is the thread position in the group... 
+ uint32_t wg = get_local_size(0); // workgroup size + + local uint8_t local_cmp[LZ4_BLOCK_SIZE+LZ4_BLOCK_EXTRA]; + local uint8_t local_dec[LZ4_BLOCK_SIZE]; + local uint64_t block[2]; // will contain begining and end of the current block + + uint64_t start_read, end_read; + uint32_t cmp_buffer_size; + _lz4_unblock(comp_src, src_size, block); + start_read = block[0]; + end_read = block[1]; + cmp_buffer_size = end_read - start_read; + if (cmp_buffer_size == 0){ + if (lid == 0) printf("gid=%u: Empty buffer\n", gid); + return; + } + + // Copy locally the compressed buffer and memset the destination buffer + for (uint32_t i=lid; i<cmp_buffer_size; i+=wg){ + uint64_t read_pos = start_read + i; + if (read_pos<end_read) + local_cmp[i] = comp_src[read_pos]; + else + local_cmp[i] = 0; + } + for (uint32_t i=lid+cmp_buffer_size; i<LZ4_BLOCK_SIZE+LZ4_BLOCK_EXTRA; i+=wg){ + local_cmp[i] = 0; + } + for (uint32_t i=lid; i<LZ4_BLOCK_SIZE; i+=wg){ + local_dec[i] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + //All the work is performed here: + uint32_t dec_size; + dec_size = lz4_decompress_local_block( local_cmp, local_dec, cmp_buffer_size, LZ4_BLOCK_SIZE); + + barrier(CLK_LOCAL_MEM_FENCE); + local uint8_t* local_buffer; + + //Perform bit-unshuffle + if (item_size == 1){ +// if ((gid==0) && (lid==0)) printf("bitunshuffle8"); + bitunshuffle8(local_dec, local_cmp, dec_size); + local_buffer=local_cmp; + } + else if (item_size == 2){ +// if ((gid==0) && (lid==0)) printf("bitunshuffle16"); + bitunshuffle16(local_dec, local_cmp, dec_size); + local_buffer=local_cmp; + } + else if (item_size == 4){ +// if ((gid==0) && (lid==0)) printf("bitunshuffle32"); + bitunshuffle32(local_dec, local_cmp, dec_size); + local_buffer=local_cmp; + } + else if (item_size == 8){ +// if ((gid==0) && (lid==0)) printf("bitunshuffle64"); + bitunshuffle64(local_dec, local_cmp, dec_size); + local_buffer=local_cmp; + } + else { + local_buffer = local_dec; + } + + //Finally copy the destination data from local to global 
memory: + uint64_t start_write = LZ4_BLOCK_SIZE*gid; + barrier(CLK_LOCAL_MEM_FENCE); + for (uint32_t i=lid; i<dec_size; i+=wg){ + dec_dest[start_write + i] = local_buffer[i]; + } + +} diff --git a/src/silx/resources/opencl/codec/byte_offset.cl b/src/silx/resources/opencl/codec/byte_offset.cl new file mode 100644 index 0000000..56a24c4 --- /dev/null +++ b/src/silx/resources/opencl/codec/byte_offset.cl @@ -0,0 +1,235 @@ +/* + * Project: SILX: A data analysis tool-kit + * + * Copyright (C) 2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. Kieffer (kieffer@esrf.fr) + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* To decompress CBF byte-offset compressed in parallel on GPU one needs to: + * - Set all values in mask and exception counter to zero. + * - Mark regions with exceptions and set values without exception. 
+ * This generates the values (zeros for exceptions), the exception mask, + * counts the number of exception region and provides a start position for + * each exception. + * - Treat exceptions. For this, one thread in a workgoup treats a complete + * masked region in a serial fashion. All regions are treated in parallel. + * Values written at this stage are marked in the mask with -1. + * - Double scan: inclusive cum sum for values, exclusive cum sum to generate + * indices in output array. Values with mask = 1 are considered as 0. + * - Compact and copy output by removing duplicated values in exceptions. + */ + +kernel void mark_exceptions(global char* raw, + int size, + int full_size, + global int* mask, + global int* values, + global int* cnt, + global int* exc) +{ + int gid; + gid = get_global_id(0); + if (gid<size) + { + int value, position; + value = raw[gid]; + if (value == -128) + { + int maxi; + values[gid] = 0; + position = atomic_inc(cnt); + exc[position] = gid; + maxi = size - 1; + mask[gid] = 1; + mask[min(maxi, gid+1)] = 1; + mask[min(maxi, gid+2)] = 1; + + if (((int) raw[min(gid+1, maxi)] == 0) && + ((int) raw[min(gid+2, maxi)] == -128)) + { + mask[min(maxi, gid+3)] = 1; + mask[min(maxi, gid+4)] = 1; + mask[min(maxi, gid+5)] = 1; + mask[min(maxi, gid+6)] = 1; + } + } + else + { // treat simple data + + values[gid] = value; + } + } + else if (gid<full_size) + { + mask[gid]=1; + values[gid] = 0; + } +} + +//run with WG=1, as may as exceptions +kernel void treat_exceptions(global char* raw, //raw compressed stream + int size, //size of the raw compressed stream + global int* mask, //tells if the value is masked + global int* exc, //array storing the position of the start of exception zones + global int* values)// stores decompressed values. 
+{ + int gid = get_global_id(0); + int inp_pos = exc[gid]; + if ((inp_pos<=0) || ((int)mask[inp_pos - 1] == 0)) + { + int value, is_masked, next_value, inc; + is_masked = (mask[inp_pos] != 0); + while ((is_masked) && (inp_pos<size)) + { + value = (int) raw[inp_pos]; + if (value == -128) + { // this correspond to 16 bits exception + uchar low_byte = raw[inp_pos+1]; + char high_byte = raw[inp_pos+2] ; + next_value = high_byte<<8 | low_byte; + if (next_value == -32768) + { // this correspond to 32 bits exception + uchar low_byte1 = raw[inp_pos+3], + low_byte2 = raw[inp_pos+4], + low_byte3 = raw[inp_pos+5]; + char high_byte4 = raw[inp_pos+6] ; + value = high_byte4<<24 | low_byte3<<16 | low_byte2<<8 | low_byte1; + inc = 7; + } + else + { + value = next_value; + inc = 3; + } + } + else + { + inc = 1; + } + values[inp_pos] = value; + mask[inp_pos] = -1; // mark the processed data as valid in the mask + inp_pos += inc; + is_masked = (mask[inp_pos] != 0); + } + } +} + +// copy the values of the elements to definitive position +kernel void copy_result_int(global int* values, + global int* indexes, + int in_size, + int out_size, + global int* output + ) +{ + int gid = get_global_id(0); + if (gid < in_size) + { + int current = max(indexes[gid], 0), + next = (gid >= (in_size - 1)) ? in_size + 1 : indexes[gid + 1]; + //we keep always the last element + if ((current <= out_size) && (current < next)) + { + output[current] = values[gid]; + } + } +} + +// copy the values of the elements to definitive position +kernel void copy_result_float(global int* values, + global int* indexes, + int in_size, + int out_size, + global float* output + ) +{ + int gid = get_global_id(0); + if (gid<in_size) + { + int current = max(indexes[gid], 0), + next = (gid >= (in_size - 1)) ? 
in_size + 1 : indexes[gid + 1]; + if ((current < out_size) && (current < next)) + { + output[current] = (float) values[gid]; + } + } +} + + +// combined memset for all arrays used for Byte Offset decompression +kernel void byte_offset_memset(global char* raw, + global int* mask, + global int* index, + global int* result, + int full_size, + int actual_size + ) +{ + int gid = get_global_id(0); + if (gid < full_size) + { + raw[gid] = 0; + index[gid] = 0; + result[gid] = 0; + if (gid<actual_size) + { + mask[gid] = 0; + } + else + { + mask[gid] = 1; + } + + } +} + + +//Simple memset kernel for char arrays +kernel void fill_char_mem(global char* ary, + int size, + char pattern, + int start_at) +{ + int gid = get_global_id(0); + if ((gid >= start_at) && (gid < size)) + { + ary[gid] = pattern; + } +} + +//Simple memset kernel for int arrays +kernel void fill_int_mem(global int* ary, + int size, + int pattern, + int start_at) +{ + int gid = get_global_id(0); + if ((gid >= start_at) && (gid < size)) + { + ary[gid] = pattern; + } +} + diff --git a/src/silx/resources/opencl/convolution.cl b/src/silx/resources/opencl/convolution.cl new file mode 100644 index 0000000..629b8fc --- /dev/null +++ b/src/silx/resources/opencl/convolution.cl @@ -0,0 +1,312 @@ +#define MAX_CONST_SIZE 16384 + +/******************************************************************************/ +/**************************** Macros ******************************************/ +/******************************************************************************/ + +// Get the center index of the filter, +// and the "half-Left" and "half-Right" lengths. +// In the case of an even-sized filter, the center is shifted to the left. 
// NOTE(review): expands at the call site; int variables c, hL and hR must be
// declared by the caller. For an even-sized filter the center is biased left.
#define GET_CENTER_HL(hlen){\
    if (hlen & 1) {\
        c = hlen/2;\
        hL = c;\
        hR = c;\
    }\
    else {\
        c = hlen/2 - 1;\
        hL = c;\
        hR = c+1;\
    }\
}\

// Boundary handling modes
#define CONV_MODE_REFLECT 0 // cba|abcd|dcb
#define CONV_MODE_NEAREST 1 // aaa|abcd|ddd
#define CONV_MODE_WRAP 2 // bcd|abcd|abc
#define CONV_MODE_CONSTANT 3 // 000|abcd|000
#ifndef USED_CONV_MODE
    #define USED_CONV_MODE CONV_MODE_NEAREST
#endif

// Per-axis index computation for each boundary mode. Each macro declares
// idx_{x,y,z} and reads gid{x,y,z}, c, j{x,y,z} and N{x,y,z} from the
// expansion site.
#define CONV_PERIODIC_IDX_X int idx_x = gidx - c + jx; if (idx_x < 0) idx_x += Nx; if (idx_x >= Nx) idx_x -= Nx;
#define CONV_PERIODIC_IDX_Y int idx_y = gidy - c + jy; if (idx_y < 0) idx_y += Ny; if (idx_y >= Ny) idx_y -= Ny;
#define CONV_PERIODIC_IDX_Z int idx_z = gidz - c + jz; if (idx_z < 0) idx_z += Nz; if (idx_z >= Nz) idx_z -= Nz;

#define CONV_NEAREST_IDX_X int idx_x = clamp((int) (gidx - c + jx), 0, Nx-1);
#define CONV_NEAREST_IDX_Y int idx_y = clamp((int) (gidy - c + jy), 0, Ny-1);
#define CONV_NEAREST_IDX_Z int idx_z = clamp((int) (gidz - c + jz), 0, Nz-1);

#define CONV_REFLECT_IDX_X int idx_x = gidx - c + jx; if (idx_x < 0) idx_x = -idx_x-1; if (idx_x >= Nx) idx_x = Nx-(idx_x-(Nx-1));
#define CONV_REFLECT_IDX_Y int idx_y = gidy - c + jy; if (idx_y < 0) idx_y = -idx_y-1; if (idx_y >= Ny) idx_y = Ny-(idx_y-(Ny-1));
#define CONV_REFLECT_IDX_Z int idx_z = gidz - c + jz; if (idx_z < 0) idx_z = -idx_z-1; if (idx_z >= Nz) idx_z = Nz-(idx_z-(Nz-1));

// Select the boundary handling once, at compile time
#if USED_CONV_MODE == CONV_MODE_REFLECT
    #define CONV_IDX_X CONV_REFLECT_IDX_X
    #define CONV_IDX_Y CONV_REFLECT_IDX_Y
    #define CONV_IDX_Z CONV_REFLECT_IDX_Z
#elif USED_CONV_MODE == CONV_MODE_NEAREST
    #define CONV_IDX_X CONV_NEAREST_IDX_X
    #define CONV_IDX_Y CONV_NEAREST_IDX_Y
    #define CONV_IDX_Z CONV_NEAREST_IDX_Z
#elif USED_CONV_MODE == CONV_MODE_WRAP
    #define CONV_IDX_X CONV_PERIODIC_IDX_X
    #define CONV_IDX_Y CONV_PERIODIC_IDX_Y
    #define CONV_IDX_Z CONV_PERIODIC_IDX_Z
#elif USED_CONV_MODE == CONV_MODE_CONSTANT
    #error "constant not implemented yet"
#else
    #error "Unknown convolution mode"
#endif


// Image access patterns: arrays are flat C-order (z, y, x), x the fast axis
#define READ_IMAGE_1D_X input[(gidz*Ny + gidy)*Nx + idx_x]
#define READ_IMAGE_1D_Y input[(gidz*Ny + idx_y)*Nx + gidx]
#define READ_IMAGE_1D_Z input[(idx_z*Ny + gidy)*Nx + gidx]

#define READ_IMAGE_2D_XY input[(gidz*Ny + idx_y)*Nx + idx_x]
#define READ_IMAGE_2D_XZ input[(idx_z*Ny + gidy)*Nx + idx_x]
#define READ_IMAGE_2D_YZ input[(idx_z*Ny + idx_y)*Nx + gidx]

#define READ_IMAGE_3D_XYZ input[(idx_z*Ny + idx_y)*Nx + idx_x]


/******************************************************************************/
/**************************** 1D Convolution **********************************/
/******************************************************************************/


// Convolution with 1D kernel along axis "X" (fast dimension)
// Works for batched 1D on 2D and batched 2D on 3D, along axis "X".
// One work-item computes one output element.
__kernel void convol_1D_X(
    const __global float * input,
    __global float * output,
    __global float * filter,
    int L, // filter size
    int Nx, // input/output number of columns
    int Ny, // input/output number of rows
    int Nz // input/output depth
)
{
    uint gidx = get_global_id(0);
    uint gidy = get_global_id(1);
    uint gidz = get_global_id(2);
    // work-items launched beyond the image (grid rounding) do nothing
    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;

    int c, hL, hR;
    GET_CENTER_HL(L);
    float sum = 0.0f;

    for (int jx = 0; jx <= hR+hL; jx++) {
        CONV_IDX_X; // Get index "x"
        // filter traversed in reverse order: convolution, not correlation
        sum += READ_IMAGE_1D_X * filter[L-1 - jx];
    }
    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
}


// Convolution with 1D kernel along axis "Y"
// Works for batched 1D on 2D and batched 2D on 3D, along axis "Y".
// One work-item computes one output element.
__kernel void convol_1D_Y(
    const __global float * input,
    __global float * output,
    __global float * filter,
    int L, // filter size
    int Nx, // input/output number of columns
    int Ny, // input/output number of rows
    int Nz // input/output depth
)
{
    uint gidx = get_global_id(0);
    uint gidy = get_global_id(1);
    uint gidz = get_global_id(2);
    // work-items launched beyond the image (grid rounding) do nothing
    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;

    int c, hL, hR;
    GET_CENTER_HL(L);
    float sum = 0.0f;

    for (int jy = 0; jy <= hR+hL; jy++) {
        CONV_IDX_Y; // Get index "y" (boundary handling per USED_CONV_MODE)
        // filter traversed in reverse order: convolution, not correlation
        sum += READ_IMAGE_1D_Y * filter[L-1 - jy];
    }
    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
}


// Convolution with 1D kernel along axis "Z"
// Works for batched 1D on 2D and batched 2D on 3D, along axis "Z".
__kernel void convol_1D_Z(
    const __global float * input,
    __global float * output,
    __global float * filter,
    int L, // filter size
    int Nx, // input/output number of columns
    int Ny, // input/output number of rows
    int Nz // input/output depth
)
{
    uint gidx = get_global_id(0);
    uint gidy = get_global_id(1);
    uint gidz = get_global_id(2);
    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;

    int c, hL, hR;
    GET_CENTER_HL(L);
    float sum = 0.0f;

    for (int jz = 0; jz <= hR+hL; jz++) {
        CONV_IDX_Z; // Get index "z" (boundary handling per USED_CONV_MODE)
        sum += READ_IMAGE_1D_Z * filter[L-1 - jz];
    }
    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
}


/******************************************************************************/
/**************************** 2D Convolution **********************************/
/******************************************************************************/

// Convolution with 2D kernel
// Works for batched 2D on 3D.
+__kernel void convol_2D_XY(
+    const __global float * input,
+    __global float * output,
+    __global float * filter,
+    int Lx, // filter number of columns,
+    int Ly, // filter number of rows,
+    int Nx, // input/output number of columns
+    int Ny, // input/output number of rows
+    int Nz  // input/output depth
+)
+{
+    uint gidx = get_global_id(0);
+    uint gidy = get_global_id(1);
+    uint gidz = get_global_id(2);
+    // Guard: grid may be padded beyond the volume extent.
+    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+    int c, hL, hR;
+    // NOTE(review): centering (c, hL, hR) is derived from Lx only but drives BOTH
+    // the jx and jy loops below — this is only correct for square filters (Ly == Lx).
+    // Confirm that callers enforce square filters.
+    GET_CENTER_HL(Lx);
+    float sum = 0.0f;
+
+    for (int jy = 0; jy <= hR+hL; jy++) {
+        CONV_IDX_Y; // Get index "y"
+        for (int jx = 0; jx <= hR+hL; jx++) {
+            CONV_IDX_X; // Get index "x"
+            // Flipped filter indices in both dimensions: true 2D convolution.
+            sum += READ_IMAGE_2D_XY * filter[(Ly-1-jy)*Lx + (Lx-1 - jx)];
+        }
+    }
+    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+
+
+// Convolution with 2D kernel
+// Works for batched 2D on 3D.
+__kernel void convol_2D_XZ(
+    const __global float * input,
+    __global float * output,
+    __global float * filter,
+    int Lx, // filter number of columns,
+    int Lz, // filter number of rows,
+    int Nx, // input/output number of columns
+    int Ny, // input/output number of rows
+    int Nz  // input/output depth
+)
+{
+    uint gidx = get_global_id(0);
+    uint gidy = get_global_id(1);
+    uint gidz = get_global_id(2);
+    // Guard: grid may be padded beyond the volume extent.
+    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+    int c, hL, hR;
+    // NOTE(review): centering from Lx drives both loops; only correct when Lz == Lx
+    // (square filter) — confirm callers enforce this.
+    GET_CENTER_HL(Lx);
+    float sum = 0.0f;
+
+    for (int jz = 0; jz <= hR+hL; jz++) {
+        CONV_IDX_Z; // Get index "z"
+        for (int jx = 0; jx <= hR+hL; jx++) {
+            CONV_IDX_X; // Get index "x"
+            // Flipped filter indices: true 2D convolution in the (x, z) plane.
+            sum += READ_IMAGE_2D_XZ * filter[(Lz-1-jz)*Lx + (Lx-1 - jx)];
+        }
+    }
+    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+
+
+// Convolution with 2D kernel
+// Works for batched 2D on 3D.
+__kernel void convol_2D_YZ(
+    const __global float * input,
+    __global float * output,
+    __global float * filter,
+    int Ly, // filter number of columns (fast filter axis here is Y),
+    int Lz, // filter number of rows,
+    int Nx, // input/output number of columns
+    int Ny, // input/output number of rows
+    int Nz  // input/output depth
+)
+{
+    uint gidx = get_global_id(0);
+    uint gidy = get_global_id(1);
+    uint gidz = get_global_id(2);
+    // Guard: grid may be padded beyond the volume extent.
+    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+    int c, hL, hR;
+    // NOTE(review): centering from Ly drives both loops; only correct when Lz == Ly
+    // (square filter) — confirm callers enforce this.
+    GET_CENTER_HL(Ly);
+    float sum = 0.0f;
+
+    for (int jz = 0; jz <= hR+hL; jz++) {
+        CONV_IDX_Z; // Get index "z"
+        for (int jy = 0; jy <= hR+hL; jy++) {
+            CONV_IDX_Y; // Get index "y"
+            // Flipped filter indices: true 2D convolution in the (y, z) plane.
+            sum += READ_IMAGE_2D_YZ * filter[(Lz-1-jz)*Ly + (Ly-1 - jy)];
+        }
+    }
+    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+
+
+
+/******************************************************************************/
+/**************************** 3D Convolution **********************************/
+/******************************************************************************/
+
+// Convolution with 3D kernel
+__kernel void convol_3D_XYZ(
+    const __global float * input,
+    __global float * output,
+    __global float * filter,
+    int Lx, // filter number of columns,
+    int Ly, // filter number of rows,
+    int Lz, // filter number of planes (depth)
+    int Nx, // input/output number of columns
+    int Ny, // input/output number of rows
+    int Nz  // input/output depth
+)
+{
+    uint gidx = get_global_id(0);
+    uint gidy = get_global_id(1);
+    uint gidz = get_global_id(2);
+    // Guard: grid may be padded beyond the volume extent.
+    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+    int c, hL, hR;
+    // NOTE(review): centering from Lx drives all three loops; only correct for
+    // cubic filters (Lx == Ly == Lz) — confirm callers enforce this.
+    GET_CENTER_HL(Lx);
+    float sum = 0.0f;
+
+    for (int jz = 0; jz <= hR+hL; jz++) {
+        CONV_IDX_Z; // Get index "z"
+        for (int jy = 0; jy <= hR+hL; jy++) {
+            CONV_IDX_Y; // Get index "y"
+            for (int jx = 0; jx <= hR+hL; jx++) {
+                CONV_IDX_X; // Get index "x"
+                // Flipped filter indices in all dimensions: true 3D convolution.
+                sum += READ_IMAGE_3D_XYZ * filter[((Lz-1-jz)*Ly + (Ly-1-jy))*Lx + (Lx-1 - jx)];
+            }
+        }
+    }
+    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+
diff --git a/src/silx/resources/opencl/convolution_textures.cl b/src/silx/resources/opencl/convolution_textures.cl new file mode 100644 index 0000000..517a67c --- /dev/null +++ b/src/silx/resources/opencl/convolution_textures.cl @@ -0,0 +1,374 @@ +/******************************************************************************/ +/**************************** Macros ******************************************/ +/******************************************************************************/ + +// Error handling +#ifndef IMAGE_DIMS + #error "IMAGE_DIMS must be defined" +#endif +#ifndef FILTER_DIMS + #error "FILTER_DIMS must be defined" +#endif +#if FILTER_DIMS > IMAGE_DIMS + #error "Filter cannot have more dimensions than image" +#endif + +// Boundary handling modes +#define CONV_MODE_REFLECT 0 // CLK_ADDRESS_MIRRORED_REPEAT : cba|abcd|dcb +#define CONV_MODE_NEAREST 1 // CLK_ADDRESS_CLAMP_TO_EDGE : aaa|abcd|ddd +#define CONV_MODE_WRAP 2 // CLK_ADDRESS_REPEAT : bcd|abcd|abc +#define CONV_MODE_CONSTANT 3 // CLK_ADDRESS_CLAMP : 000|abcd|000 +#ifndef USED_CONV_MODE + #define USED_CONV_MODE CONV_MODE_NEAREST +#endif +#if USED_CONV_MODE == CONV_MODE_REFLECT + #define CLK_BOUNDARY CLK_ADDRESS_MIRRORED_REPEAT + #define CLK_COORDS CLK_NORMALIZED_COORDS_TRUE + #define USE_NORM_COORDS +#elif USED_CONV_MODE == CONV_MODE_NEAREST + #define CLK_BOUNDARY CLK_ADDRESS_CLAMP_TO_EDGE + #define CLK_COORDS CLK_NORMALIZED_COORDS_FALSE +#elif USED_CONV_MODE == CONV_MODE_WRAP + #define CLK_BOUNDARY CLK_ADDRESS_REPEAT + #define CLK_COORDS CLK_NORMALIZED_COORDS_TRUE + #define USE_NORM_COORDS +#elif USED_CONV_MODE == CONV_MODE_CONSTANT + #define CLK_BOUNDARY CLK_ADDRESS_CLAMP + #define CLK_COORDS CLK_NORMALIZED_COORDS_FALSE +#else + #error "Unknown convolution mode" +#endif + + + +// Convolution index for filter +#define FILTER_INDEX(j) (Lx - 1 - j) + +// Filter access patterns +#define READ_FILTER_1D(j) read_imagef(filter, (int2) (FILTER_INDEX(j), 0)).x; +#define READ_FILTER_2D(jx, jy) 
read_imagef(filter, (int2) (FILTER_INDEX(jx), FILTER_INDEX(jy))).x; +#define READ_FILTER_3D(jx, jy, jz) read_imagef(filter, (int4) (FILTER_INDEX(jx), FILTER_INDEX(jy), FILTER_INDEX(jz), 0)).x; + + +// Convolution index for image +#ifdef USE_NORM_COORDS + #define IMAGE_INDEX_X (gidx*1.0f +0.5f - c + jx)/Nx + #define IMAGE_INDEX_Y (gidy*1.0f +0.5f - c + jy)/Ny + #define IMAGE_INDEX_Z (gidz*1.0f +0.5f - c + jz)/Nz + #define RET_TYPE_1 float + #define RET_TYPE_2 float2 + #define RET_TYPE_4 float4 + #define C_ZERO 0.5f + #define GIDX (gidx*1.0f + 0.5f)/Nx + #define GIDY (gidy*1.0f + 0.5f)/Ny + #define GIDZ (gidz*1.0f + 0.5f)/Nz +#else + #define IMAGE_INDEX_X (gidx - c + jx) + #define IMAGE_INDEX_Y (gidy - c + jy) + #define IMAGE_INDEX_Z (gidz - c + jz) + #define RET_TYPE_1 int + #define RET_TYPE_2 int2 + #define RET_TYPE_4 int4 + #define C_ZERO 0 + #define GIDX gidx + #define GIDY gidy + #define GIDZ gidz +#endif + +static const sampler_t sampler = CLK_COORDS | CLK_BOUNDARY | CLK_FILTER_NEAREST; + +// Image access patterns +#define READ_IMAGE_1D read_imagef(input, sampler, (RET_TYPE_2) (IMAGE_INDEX_X, C_ZERO)).x + +#define READ_IMAGE_2D_X read_imagef(input, sampler, (RET_TYPE_2) (IMAGE_INDEX_X , GIDY)).x +#define READ_IMAGE_2D_Y read_imagef(input, sampler, (RET_TYPE_2) (GIDX, IMAGE_INDEX_Y)).x +#define READ_IMAGE_2D_XY read_imagef(input, sampler, (RET_TYPE_2) (IMAGE_INDEX_X, IMAGE_INDEX_Y)).x + +#define READ_IMAGE_3D_X read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, GIDY, GIDZ, C_ZERO)).x +#define READ_IMAGE_3D_Y read_imagef(input, sampler, (RET_TYPE_4) (GIDX, IMAGE_INDEX_Y, GIDZ, C_ZERO)).x +#define READ_IMAGE_3D_Z read_imagef(input, sampler, (RET_TYPE_4) (GIDX, GIDY, IMAGE_INDEX_Z, C_ZERO)).x +#define READ_IMAGE_3D_XY read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, IMAGE_INDEX_Y, GIDZ, C_ZERO)).x +#define READ_IMAGE_3D_XZ read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, GIDY, IMAGE_INDEX_Z, C_ZERO)).x +#define READ_IMAGE_3D_YZ 
read_imagef(input, sampler, (RET_TYPE_4) (GIDX, IMAGE_INDEX_Y, IMAGE_INDEX_Z, C_ZERO)).x +#define READ_IMAGE_3D_XYZ read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, IMAGE_INDEX_Y, IMAGE_INDEX_Z, C_ZERO)).x + + +// NOTE: pyopencl and OpenCL < 1.2 do not support image1d_t +#if FILTER_DIMS == 1 + #define FILTER_TYPE image2d_t + #define READ_FILTER_VAL(j) READ_FILTER_1D(j) +#elif FILTER_DIMS == 2 + #define FILTER_TYPE image2d_t + #define READ_FILTER_VAL(jx, jy) READ_FILTER_2D(jx, jy) +#elif FILTER_DIMS == 3 + #define FILTER_TYPE image3d_t + #define READ_FILTER_VAL(jx, jy, jz) READ_FILTER_3D(jx, jy, jz) +#endif + +#if IMAGE_DIMS == 1 + #define IMAGE_TYPE image2d_t + #define READ_IMAGE_X READ_IMAGE_1D +#elif IMAGE_DIMS == 2 + #define IMAGE_TYPE image2d_t + #define READ_IMAGE_X READ_IMAGE_2D_X + #define READ_IMAGE_Y READ_IMAGE_2D_Y + #define READ_IMAGE_XY READ_IMAGE_2D_XY +#elif IMAGE_DIMS == 3 + #define IMAGE_TYPE image3d_t + #define READ_IMAGE_X READ_IMAGE_3D_X + #define READ_IMAGE_Y READ_IMAGE_3D_Y + #define READ_IMAGE_Z READ_IMAGE_3D_Z + #define READ_IMAGE_XY READ_IMAGE_3D_XY + #define READ_IMAGE_XZ READ_IMAGE_3D_XZ + #define READ_IMAGE_YZ READ_IMAGE_3D_YZ + #define READ_IMAGE_XYZ READ_IMAGE_3D_XYZ +#endif + + +// Get the center index of the filter, +// and the "half-Left" and "half-Right" lengths. +// In the case of an even-sized filter, the center is shifted to the left. +#define GET_CENTER_HL(hlen){\ + if (hlen & 1) {\ + c = hlen/2;\ + hL = c;\ + hR = c;\ + }\ + else {\ + c = hlen/2 - 1;\ + hL = c;\ + hR = c+1;\ + }\ +}\ + + + +/******************************************************************************/ +/**************************** 1D Convolution **********************************/ +/******************************************************************************/ + +#if FILTER_DIMS == 1 +// Convolution with 1D kernel along axis "X" (fast dimension) +// Works for batched 1D on 2D and batched 2D on 3D, along axis "X". 
+__kernel void convol_1D_X_tex( + read_only IMAGE_TYPE input, + __global float * output, + read_only FILTER_TYPE filter, + int Lx, // filter size + int Nx, // input/output number of columns + int Ny, // input/output number of rows + int Nz // input/output depth +) +{ + uint gidx = get_global_id(0); + uint gidy = get_global_id(1); + uint gidz = get_global_id(2); + if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; + + int c, hL, hR; + GET_CENTER_HL(Lx); + float sum = 0.0f; + + for (int jx = 0; jx <= hR+hL; jx++) { + sum += READ_IMAGE_X * READ_FILTER_VAL(jx); + } + output[(gidz*Ny + gidy)*Nx + gidx] = sum; +} + + +#if IMAGE_DIMS >= 2 +// Convolution with 1D kernel along axis "Y" +// Works for batched 1D on 2D and batched 2D on 3D, along axis "Y". +__kernel void convol_1D_Y_tex( + read_only IMAGE_TYPE input, + __global float * output, + read_only FILTER_TYPE filter, + int Lx, // filter size + int Nx, // input/output number of columns + int Ny, // input/output number of rows + int Nz // input/output depth +) +{ + uint gidx = get_global_id(0); + uint gidy = get_global_id(1); + uint gidz = get_global_id(2); + if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; + + int c, hL, hR; + GET_CENTER_HL(Lx); + float sum = 0.0f; + + for (int jy = 0; jy <= hR+hL; jy++) { + sum += READ_IMAGE_Y * READ_FILTER_VAL(jy); + } + output[(gidz*Ny + gidy)*Nx + gidx] = sum; +} +#endif + +#if IMAGE_DIMS == 3 +// Convolution with 1D kernel along axis "Z" +// Works for batched 1D on 2D and batched 2D on 3D, along axis "Z". 
+__kernel void convol_1D_Z_tex(
+    read_only IMAGE_TYPE input,
+    __global float * output,
+    read_only FILTER_TYPE filter,
+    int Lx, // filter size
+    int Nx, // input/output number of columns
+    int Ny, // input/output number of rows
+    int Nz  // input/output depth
+)
+{
+    uint gidx = get_global_id(0);
+    uint gidy = get_global_id(1);
+    uint gidz = get_global_id(2);
+    // Guard: grid may be padded beyond the volume extent.
+    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+    int c, hL, hR;
+    // Sets c (filter center index) and hL/hR (taps left/right of center).
+    GET_CENTER_HL(Lx);
+    float sum = 0.0f;
+
+    // Boundary handling is delegated to the sampler (see CLK_BOUNDARY above).
+    for (int jz = 0; jz <= hR+hL; jz++) {
+        sum += READ_IMAGE_Z * READ_FILTER_VAL(jz);
+    }
+    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+#endif
+#endif
+
+/******************************************************************************/
+/**************************** 2D Convolution **********************************/
+/******************************************************************************/
+
+#if IMAGE_DIMS >= 2 && FILTER_DIMS == 2
+// Convolution with 2D kernel
+// Works for batched 2D on 3D.
+__kernel void convol_2D_XY_tex(
+    read_only IMAGE_TYPE input,
+    __global float * output,
+    read_only FILTER_TYPE filter,
+    int Lx, // filter number of columns,
+    int Ly, // filter number of rows,
+    int Nx, // input/output number of columns
+    int Ny, // input/output number of rows
+    int Nz  // input/output depth
+)
+{
+    uint gidx = get_global_id(0);
+    uint gidy = get_global_id(1);
+    uint gidz = get_global_id(2);
+    // Guard: grid may be padded beyond the volume extent.
+    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+    int c, hL, hR;
+    // NOTE(review): centering (c, hL, hR) is derived from Lx only but drives BOTH
+    // loops below — this is only correct for square filters (Ly == Lx). Confirm
+    // that callers enforce square filters.
+    GET_CENTER_HL(Lx);
+    float sum = 0.0f;
+
+    // Boundary handling is delegated to the sampler (see CLK_BOUNDARY above).
+    for (int jy = 0; jy <= hR+hL; jy++) {
+        for (int jx = 0; jx <= hR+hL; jx++) {
+            sum += READ_IMAGE_XY * READ_FILTER_VAL(jx, jy);
+        }
+    }
+    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+#endif
+
+#if IMAGE_DIMS == 3 && FILTER_DIMS == 2
+// Convolution with 2D kernel
+// Works for batched 2D on 3D.
+__kernel void convol_2D_XZ_tex( + read_only IMAGE_TYPE input, + __global float * output, + read_only FILTER_TYPE filter, + int Lx, // filter number of columns, + int Lz, // filter number of rows, + int Nx, // input/output number of columns + int Ny, // input/output number of rows + int Nz // input/output depth +) +{ + uint gidx = get_global_id(0); + uint gidy = get_global_id(1); + uint gidz = get_global_id(2); + if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; + + int c, hL, hR; + GET_CENTER_HL(Lx); + float sum = 0.0f; + + for (int jz = 0; jz <= hR+hL; jz++) { + for (int jx = 0; jx <= hR+hL; jx++) { + sum += READ_IMAGE_XZ * READ_FILTER_VAL(jx, jz); + } + } + output[(gidz*Ny + gidy)*Nx + gidx] = sum; +} + + +// Convolution with 2D kernel +// Works for batched 2D on 3D. +__kernel void convol_2D_YZ_tex( + read_only IMAGE_TYPE input, + __global float * output, + read_only FILTER_TYPE filter, + int Lx, // filter number of columns, + int Lz, // filter number of rows, + int Nx, // input/output number of columns + int Ny, // input/output number of rows + int Nz // input/output depth +) +{ + uint gidx = get_global_id(0); + uint gidy = get_global_id(1); + uint gidz = get_global_id(2); + if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; + + int c, hL, hR; + GET_CENTER_HL(Lx); + float sum = 0.0f; + + for (int jz = 0; jz <= hR+hL; jz++) { + for (int jy = 0; jy <= hR+hL; jy++) { + sum += READ_IMAGE_YZ * READ_FILTER_VAL(jy, jz); + } + } + output[(gidz*Ny + gidy)*Nx + gidx] = sum; +} +#endif + + +/******************************************************************************/ +/**************************** 3D Convolution **********************************/ +/******************************************************************************/ + +#if IMAGE_DIMS == 3 && FILTER_DIMS == 3 +// Convolution with 3D kernel +__kernel void convol_3D_XYZ_tex( + read_only IMAGE_TYPE input, + __global float * output, + read_only FILTER_TYPE filter, + int Lx, // filter number of 
columns, + int Ly, // filter number of rows, + int Lz, // filter number of rows, + int Nx, // input/output number of columns + int Ny, // input/output number of rows + int Nz // input/output depth +) +{ + uint gidx = get_global_id(0); + uint gidy = get_global_id(1); + uint gidz = get_global_id(2); + if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; + + int c, hL, hR; + GET_CENTER_HL(Lx); + float sum = 0.0f; + + for (int jz = 0; jz <= hR+hL; jz++) { + for (int jy = 0; jy <= hR+hL; jy++) { + for (int jx = 0; jx <= hR+hL; jx++) { + sum += READ_IMAGE_XYZ * READ_FILTER_VAL(jx, jy, jz); + } + } + } + output[(gidz*Ny + gidy)*Nx + gidx] = sum; +} +#endif diff --git a/src/silx/resources/opencl/doubleword.cl b/src/silx/resources/opencl/doubleword.cl new file mode 100644 index 0000000..02a8aba --- /dev/null +++ b/src/silx/resources/opencl/doubleword.cl @@ -0,0 +1,122 @@ +/* + * OpenCL library for double word floating point calculation using compensated arithmetics + * + * The theoritical basis can be found in Valentina Popescu's PhD thesis: + * Towards fast and certified multi-precision libraries + * Reference LYSEN036 + * http://www.theses.fr/2017LYSEN036 + * All page number and equation number are refering to this document. + * + * The precision of the calculation (bounds) is provided in ULP (smallest possible mantissa) + * and come from the table 2.2 (page 68 of the thesis). + * The number of equivalent FLOP is taken from the table 2.3 (page 69 the thesis). + * Note that FLOP are not all equal: a division is much more expensive than an addition. + */ + +//This library can be expanded to double-double by redefining fp, fp2 and one to double, double2 and 1.0. +#ifdef DOUBLEDOUBLE +#define fp double +#define fp2 double2 +#define one 1.0 +#else +#define fp float +#define fp2 float2 +#define one 1.0f +#endif + +/* Nota: i386 computer use x87 registers which are larger than the 32bits precision + * which can invalidate the error compensation mechanism. 
+ * + * We use the trick to declare some variable "volatile" to enforce the actual + * precision reduction of those variables. + * This has to be used in combination with #pragma clang fp contract(on) +*/ + +#ifndef X87_VOLATILE +# define X87_VOLATILE +#endif + +//Algorithm 1, p23, theorem 1.1.12. Requires e_x > e_y, valid if |x| > |y| +inline fp2 fast_fp_plus_fp(fp x, fp y){ + #pragma clang fp contract(on) + X87_VOLATILE fp s = x + y; + X87_VOLATILE fp z = s - x; + fp e = y - z; + return (fp2)(s, e); +} + +//Algorithm 2, p24, same as fast_fp_plus_fp without the condition on e_x and e_y +inline fp2 fp_plus_fp(fp x, fp y){ + #pragma clang fp contract(on) + X87_VOLATILE fp s = x + y; + X87_VOLATILE fp xp = s - y; + X87_VOLATILE fp yp = s - xp; + X87_VOLATILE fp dx = x - xp; + X87_VOLATILE fp dy = y - yp; + return (fp2)(s, dx+dy); +} + +//Algorithm 3, p24: multiplication with a FMA +inline fp2 fp_times_fp(fp x, fp y){ + fp p = x * y; + fp e = fma(x, y, -p); + return (fp2)(p, e); +} + +//Algorithm 7, p38: Addition of a FP to a DW. 
10flop bounds:2u²+5u³ +inline fp2 dw_plus_fp(fp2 x, fp y){ + #pragma clang fp contract(on) + fp2 s = fp_plus_fp(x.s0, y); + X87_VOLATILE fp v = x.s1 + s.s1; + return fast_fp_plus_fp(s.s0, v); +} + +//Algorithm 9, p40: addition of two DW: 20flop bounds:3u²+13u³ +inline fp2 dw_plus_dw(fp2 x, fp2 y){ + fp2 s = fp_plus_fp(x.s0, y.s0); + fp2 t = fp_plus_fp(x.s1, y.s1); + fp2 v = fast_fp_plus_fp(s.s0, s.s1 + t.s0); + return fast_fp_plus_fp(v.s0, t.s1 + v.s1); +} + +//Algorithm 12, p49: Multiplication FP*DW: 6flops bounds:2u² +inline fp2 dw_times_fp(fp2 x, fp y){ + fp2 c = fp_times_fp(x.s0, y); + return fast_fp_plus_fp(c.s0, fma(x.s1, y, c.s1)); +} + +//Algorithm 14, p52: Multiplication DW*DW, 8 flops bounds:6u² +inline fp2 dw_times_dw(fp2 x, fp2 y){ + #pragma clang fp contract(on) + fp2 c = fp_times_fp(x.s0, y.s0); + X87_VOLATILE fp l = fma(x.s1, y.s0, x.s0 * y.s1); + return fast_fp_plus_fp(c.s0, c.s1 + l); +} + +//Algorithm 17, p55: Division DW / FP, 10flops bounds: 3.5u² +inline fp2 dw_div_fp(fp2 x, fp y){ + #pragma clang fp contract(on) + X87_VOLATILE fp th = x.s0 / y; + fp2 pi = fp_times_fp(th, y); + fp2 d = x - pi; + X87_VOLATILE fp delta = d.s0 + d.s1; + X87_VOLATILE fp tl = delta/y; + return fast_fp_plus_fp(th, tl); +} + +//Derived from algorithm 20, p64: Inversion 1/ DW, 22 flops +inline fp2 inv_dw(fp2 y){ + #pragma clang fp contract(on) + X87_VOLATILE fp th = one/y.s0; + X87_VOLATILE fp rh = fma(-y.s0, th, one); + X87_VOLATILE fp rl = -y.s1 * th; + fp2 e = fast_fp_plus_fp(rh, rl); + fp2 delta = dw_times_fp(e, th); + return dw_plus_fp(delta, th); +} + +//Algorithm 20, p64: Division DW / DW, 30 flops: bounds:9.8u² +inline fp2 dw_div_dw(fp2 x, fp2 y){ + return dw_times_dw(x, inv_dw(y)); +} + diff --git a/src/silx/resources/opencl/image/cast.cl b/src/silx/resources/opencl/image/cast.cl new file mode 100644 index 0000000..9e23a82 --- /dev/null +++ b/src/silx/resources/opencl/image/cast.cl @@ -0,0 +1,181 @@ +/* + * Project: SILX: Alogorithms for image processing + * + 
* Copyright (C) 2013-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. Kieffer (kieffer@esrf.fr) + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef NB_COLOR + #define NB_COLOR 1 +#endif + + +/** + * \brief Cast values of an array of uint8 into a float output array. 
+ * + * :param array_input: Pointer to global memory with the input data as unsigned8 array + * :param array_float: Pointer to global memory with the output data as float array + * :param width: Width of the image + * :param height: Height of the image + */ +kernel void u8_to_float( global unsigned char *array_input, + global float *array_float, + const int width, + const int height) +{ + //Global memory guard for padding + if ((get_global_id(0) < width) && (get_global_id(1)<height)) + { + int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1)); + for (int c=0; c<NB_COLOR; c++) + { + array_float[i + c] = (float) array_input[i + c]; + } //end loop over colors + } //end test in image +} //end kernel + +/** + * \brief Cast values of an array of uint8 into a float output array. + * + * :param array_input: Pointer to global memory with the input data as signed8 array + * :param array_float: Pointer to global memory with the output data as float array + * :param width: Width of the image + * :param height: Height of the image + */ +kernel void s8_to_float( global char *array_input, + global float *array_float, + const int width, + const int height) +{ + //Global memory guard for padding + if ((get_global_id(0) < width) && (get_global_id(1)<height)) + { + int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1)); + for (int c=0; c<NB_COLOR; c++) + { + array_float[i + c] = (float) array_input[i + c]; + } //end loop over colors + } //end test in image +} //end kernel + + +/** + * \brief cast values of an array of uint16 into a float output array. 
+ *
+ * :param array_input: global memory pointer to the source data (unsigned 16-bit)
+ * :param array_float: global memory pointer to the destination data (float)
+ * :param width: image width in pixels
+ * :param height: image height in pixels
+ */
+kernel void u16_to_float(global unsigned short *array_input,
+                         global float *array_float,
+                         const int width,
+                         const int height)
+{
+    int col = get_global_id(0);
+    int row = get_global_id(1);
+    // Threads scheduled beyond the image (grid padding) have nothing to do.
+    if ((col >= width) || (row >= height))
+        return;
+    int base = NB_COLOR * (col + width * row);
+    // Promote every color component of this pixel to float.
+    for (int channel = 0; channel < NB_COLOR; channel++)
+    {
+        array_float[base + channel] = (float) array_input[base + channel];
+    }
+}
+
+/**
+ * \brief cast values of an array of int16 into a float output array.
+ *
+ * :param array_input: global memory pointer to the source data (signed 16-bit)
+ * :param array_float: global memory pointer to the destination data (float)
+ * :param width: image width in pixels
+ * :param height: image height in pixels
+ */
+kernel void s16_to_float(global short *array_input,
+                         global float *array_float,
+                         const int width,
+                         const int height)
+{
+    int col = get_global_id(0);
+    int row = get_global_id(1);
+    if ((col >= width) || (row >= height))
+        return; // out-of-image guard (grid padding)
+    int base = NB_COLOR * (col + width * row);
+    // Promote every color component of this pixel to float.
+    for (int channel = 0; channel < NB_COLOR; channel++)
+    {
+        array_float[base + channel] = (float) array_input[base + channel];
+    }
+}
+
+/**
+ * \brief cast values of an array of uint32 into a float output array.
+ * + * :param array_input: Pointer to global memory with the input data as unsigned32 array + * :param array_float: Pointer to global memory with the output data as float array + * :param width: Width of the image + * :param height: Height of the image + */ +kernel void u32_to_float(global unsigned int *array_input, + global float *array_float, + const int width, + const int height) +{ + //Global memory guard for padding + if ((get_global_id(0) < width) && (get_global_id(1)<height)) + { + int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1)); + for (int c=0; c<NB_COLOR; c++) + { + array_float[i + c] = (float) array_input[i + c]; + } //end loop over colors + } //end test in image +}//end kernel + + +/** + * \brief convert values of an array of int32 into a float output array. + * + * :param array_input: Pointer to global memory with the data in int + * :param array_float: Pointer to global memory with the data in float + * :param width: Width of the image + * :param height: Height of the image + */ +kernel void s32_to_float(global int *array_input, + global float *array_float, + const int width, + const int height) +{ + //Global memory guard for padding + if ((get_global_id(0) < width) && (get_global_id(1)<height)) + { + int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1)); + for (int c=0; c<NB_COLOR; c++) + { + array_float[i + c] = (float) array_input[i + c]; + } //end loop over colors + } //end test in image +}//end kernel + diff --git a/src/silx/resources/opencl/image/histogram.cl b/src/silx/resources/opencl/image/histogram.cl new file mode 100644 index 0000000..7fb1485 --- /dev/null +++ b/src/silx/resources/opencl/image/histogram.cl @@ -0,0 +1,178 @@ +//CL// + +/* + * Project: SILX: Alogorithms for image processing + * + * Copyright (C) 2013-2018 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. 
Kieffer (kieffer@esrf.fr) + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* Single kernel histogram for float array. + * + * Can perform histograming in log-scale (using the arcsinh) + +Parameters: + - data: buffer with the image content in float (input) + - data_size: input + - mini: Lower bound of the first bin + - maxi: upper bouns of the last bin + - map_operation: Type of pre-processor to use. if if set to !=0, use log scale + - hist: resulting histogram (ouptut) + - hist_size: number of bins + - tmp_hist: temporary storage of size hist_size*num_groups + - processed: temporary storage of size 1 initialially set to 0 + - local_hist: local storage of size hist_size + + +Grid information: + * use the largest WG size. If it is larger than the number of bins (hist_size), + take the power of 2 immediately above + *Schedule as many WG as the device has compute engines. 
No need to schuedule more, + it is just a waist of memory + * The histogram should fit in shared (local) memory and tmp_hist can be large! + +Assumes: + hist and local_hist have size hist_size + edges has size hist_size+2 + tmp_hist has size hist_size*num_groups + processed is of size one and initially set to 0 + +*/ + + +static float preprocess(float value, int code) +{ + //This function can be modified to have more scales + if (code!=0) + { + return asinh(value); + } + else + { + return value; + } +} + +kernel void histogram(global float *data, + int data_size, + float mini, + float maxi, + int map_operation, + global int *hist, + global float *edges, + int hist_size, + global int *tmp_hist, + global int *processed, + local int *local_hist) +{ + // each thread + int lid = get_local_id(0); + int ws = get_local_size(0); + int nb_engine = get_num_groups(0); + int engine_id = get_group_id(0); + + // memset the local_hist array + for (int i=0; i<hist_size; i+=ws) + { + int j = i + lid; + if (j<hist_size) + { + local_hist[j] = 0; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Process the local data and accumulate in shared memory + int bloc_size = (int) ceil((float)data_size/(float)nb_engine); + int start = bloc_size * engine_id; + int stop = min(start + bloc_size, data_size); + float vmini = preprocess(mini, map_operation); + float vscale = (float)hist_size/(preprocess(maxi, map_operation) -vmini); + for (int i = start; i<stop; i+=ws) + { + int address = i + lid; + if (address < stop) + { + float value = data[address]; + if ((!isnan(value)) && (value>=mini) && (value<=maxi)) + { + float vvalue = (preprocess(value, map_operation)-vmini)*vscale; + int idx = clamp((int) vvalue, 0, hist_size - 1); + atomic_inc(&local_hist[idx]); + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + //Now copy result into the right place and reset the first value of the shared array + for (int i=0; i<hist_size; i+=ws) + { + int j = i + lid; + if (j<hist_size) + { + tmp_hist[hist_size * engine_id 
+ j] = local_hist[j]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + local_hist[0] = 0; + + barrier(CLK_LOCAL_MEM_FENCE); + + //Increment the system wide shared variable processed and share the result with the workgroup + if (lid == 0) + { + local_hist[0] = atomic_inc(processed); + } + barrier(CLK_LOCAL_MEM_FENCE); + + // If we are the engine last to work, perform the concatenation of all results + + if ((local_hist[0] + 1) == nb_engine) + { + for (int i=0; i<hist_size; i+=ws) + { + int j = i + lid; + int lsum = 0; + if (j<hist_size) + { + for (int k=0; k<nb_engine; k++) + { + lsum += tmp_hist[hist_size * k + j]; + } + hist[j] = lsum; + edges[j] = vmini + j/vscale; + } + } + // Finally reset the counter + if (lid == 0) + { + processed[0] = 0; + edges[hist_size] = vmini + hist_size/vscale;; + } + + } +} diff --git a/src/silx/resources/opencl/image/map.cl b/src/silx/resources/opencl/image/map.cl new file mode 100644 index 0000000..804a5a1 --- /dev/null +++ b/src/silx/resources/opencl/image/map.cl @@ -0,0 +1,85 @@ +/* + * Project: SILX: Data analysis library + * kernel for maximum and minimum calculation + * + * + * Copyright (C) 2013-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. Kieffer (kieffer@esrf.fr) + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * + */ + +#ifndef NB_COLOR + #define NB_COLOR 1 +#endif + + +/** + * \brief Linear scale of signal in an image (maybe multi-color) + * + * :param image: contains the input image + * :param output: contains the output image after scaling + * :param max_min_input: 2-floats containing the maximum and the minimum value of image + * :param minimum: float scalar with the minimum of the output + * :param maximum: float scalar with the maximum of the output + * :param width: integer, number of columns the matrices + * :param height: integer, number of lines of the matrices + * + * + */ + + +kernel void normalize_image(global float* image, + global float* output, + int width, + int height, + global float* max_min_input, + float minimum, + float maximum + ) +{ + //Global memory guard for padding + if((get_global_id(0) < width) && (get_global_id(1)<height)) + { + int idx = NB_COLOR* (get_global_id(0) + width * get_global_id(1)); + float mini_in, maxi_in, scale; + maxi_in = max_min_input[0]; + mini_in = max_min_input[1]; + if (maxi_in == mini_in) + { + scale = NAN; + } + else + { + scale = (maximum - minimum) / (maxi_in - mini_in); + } + + for (int c=0; c<NB_COLOR; c++) + { + output[idx+c] = fma(scale, image[idx+c]-mini_in, minimum); + } // end color loop + }//end if in IMAGE +}//end kernel diff --git a/src/silx/resources/opencl/image/max_min.cl b/src/silx/resources/opencl/image/max_min.cl new file mode 100644 index 0000000..246cd48 --- /dev/null +++ 
b/src/silx/resources/opencl/image/max_min.cl @@ -0,0 +1,207 @@ +/* + * Project: SILX: Data analysis library + * kernel for maximum and minimum calculation + * + * + * Copyright (C) 2013-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. Kieffer (kieffer@esrf.fr) + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * + */ + +#ifndef NB_COLOR + #define NB_COLOR 1 +#endif + + + +#define REDUCE(a, b) ((float2) (fmax(a.x, b.x), fmin(a.y, b.y))) + +static float2 read_and_map(int idx, + global float* data) +{ + idx *= NB_COLOR; + float tmp = data[idx]; + float2 res = (float2) (tmp, tmp); + if (NB_COLOR > 1) + { + for (int c=1; c<NB_COLOR; c++) + { + tmp = data[idx + c]; + res = (float2) (fmax(res.x, tmp), fmin(res.y, tmp)); + } + } + return res; +} + + +/** + * \brief max_min_global_stage1: Look for the maximum an the minimum of an array. 
stage1 + * + * optimal workgroup size: 2^n greater than sqrt(size), limited to 512 + * optimal total item size: (workgroup size)^2 + * if size >total item size: adjust seq_count. + * + * :param data: Float pointer to global memory storing the vector of data. + * :param out: Float2 pointer to global memory storing the temporary results (workgroup size) + * :param seq_count: how many blocksize each thread should read + * :param size: size of the problem (number of element in the image + * :param l_data Shared memory: 2 float per thread in workgroup + * +**/ + + +kernel void max_min_reduction_stage1( global const float *data, + global float2 *out, + int size, + local float2 *l_data)// local storage 2 float per thread +{ + int group_size = get_local_size(0); + int lid = get_local_id(0); + float2 acc; + int big_block = group_size * get_num_groups(0); + int i = lid + group_size * get_group_id(0); + + if (lid<size) + acc = read_and_map(lid, data); + else + acc = read_and_map(0, data); + + // Linear pre-reduction stage 0 + + while (i<size){ + acc = REDUCE(acc, read_and_map(i, data)); + i += big_block; + } + + // parallel reduction stage 1 + + l_data[lid] = acc; + barrier(CLK_LOCAL_MEM_FENCE); + for (int block=group_size/2; block>1; block/=2) + { + if ((lid < block) && ((lid + block)<group_size)){ + l_data[lid] = REDUCE(l_data[lid], l_data[lid + block]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (lid == 0) + { + if (group_size > 1) + { + acc = REDUCE(l_data[0], l_data[1]); + } + else + { + acc = l_data[0]; + } + out[get_group_id(0)] = acc; + } +} + + +/** + * \brief global_max_min: Look for the maximum an the minimum of an array. + * + * + * + * :param data2: Float2 pointer to global memory storing the vector of pre-reduced data (workgroup size). 
+ * :param maximum: Float pointer to global memory storing the maximum value + * :param minumum: Float pointer to global memory storing the minimum value + * :param l_data Shared memory: 2 float per thread in workgroup + * +**/ + +kernel void max_min_reduction_stage2( + global const float2 *data2, + global float2 *maxmin, + local float2 *l_data)// local storage 2 float per thread +{ + int lid = get_local_id(0); + int group_size = get_local_size(0); + float2 acc = (float2)(-1.0f, -1.0f); + if (lid<=group_size) + { + l_data[lid] = data2[lid]; + } + else + { + l_data[lid] = acc; + } + + // parallel reduction stage 2 + + + barrier(CLK_LOCAL_MEM_FENCE); + for (int block=group_size/2; block>1; block/=2) + { + if ((lid < block) && ((lid + block)<group_size)) + { + l_data[lid] = REDUCE(l_data[lid], l_data[lid + block]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + } + + if (lid == 0 ) + { + if ( group_size > 1) + { + acc = REDUCE(l_data[0], l_data[1]); + } + else + { + acc = l_data[0]; + } + maxmin[0] = acc; + } +} + +/*This is the serial version of the min_max kernel. + * + * It has to be launched with WG=1 and only 1 WG has to be launched ! + * + * :param data: Float pointer to global memory storing the vector of data. 
:param size: size of the data array
+*/ + +#ifndef X87_VOLATILE +# define X87_VOLATILE +#endif + +// calculate acc.s0+value with error compensation +// see https://en.wikipedia.org/wiki/Kahan_summation_algorithm +// unlike wikipedia, here the exact value = acc.s0 + acc.s1 (performed in higher precision) + +static inline float2 kahan_sum(float2 acc, float value) +{ + if (value) + { + float sum = acc.s0; + float err = acc.s1; + if (fabs(value) > fabs(sum)) + { + float tmp = sum; + sum = value; + value = tmp; + } + + float cor = value + err; + X87_VOLATILE float target = sum + cor; + err = cor - (target - sum); + return (float2)(target, err); + } + else + { + return acc; + } +} + +// calculate a*b with error compensation +// see https://hal.archives-ouvertes.fr/hal-01367769/document +static inline float2 comp_prod(float a, float b) +{ + X87_VOLATILE float x = a * b; + float y = fma(a, b, -x); + return (float2)(x, y); +} + +// calculate a + b with error compensation +static inline float2 compensated_sum(float2 a, float2 b) +{ + float err = a.s1 + b.s1; + float first = a.s0; + float second = b.s0; + if (fabs(second) > fabs(first)) + {//swap first and second + float tmp = first; + first = second; + second = tmp; + } + float cor = second + err; + X87_VOLATILE float target = first + cor; + err = cor - (target - first); + return (float2)(target, err); +} + +// calculate a * b with error compensation +static inline float2 compensated_mul(float2 a, float2 b) +{ + float2 tmp; + tmp = kahan_sum((float2)(a.s0*b.s0, a.s0*b.s1), a.s1*b.s0); + tmp = kahan_sum(tmp, a.s1*b.s1); + return tmp; +} + +// calculate 1/a with error compensation (Needs validation) +static inline float2 compensated_inv(float2 a) +{ + float main = a.s0; + float err = a.s1; + return (float2)(1.0f/main, -err/(main*main)); +} + +// calculate a/b with error compensation (Needs validation) +static inline float2 compensated_div(float2 a, float2 b) +{ + float ah = a.s0; + float al = a.s1; + float bh = b.s0; + float bl = b.s1; + float bl_over_bh = 
bl/bh; + return kahan_sum(kahan_sum(a, -a.s1 * bl_over_bh), -a.s0 * bl_over_bh) / bh; +} + + +// calculate a.b where a and b are float2 +static inline float2 comp_dot2(float2 a, float2 b) +{ + return compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)); +} + +// calculate a.b where a and b are float3 +static inline float2 comp_dot3(float3 a, float3 b) +{ + return compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)), comp_prod(a.s2, b.s2)); +} + +// calculate a.b where a and b are float4 +static inline float2 comp_dot4(float4 a, float4 b) +{ + return compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)), + compensated_sum(comp_prod(a.s2, b.s2), comp_prod(a.s3, b.s3))); +} + +// calculate a.b where a and b are float8 +static inline float2 comp_dot8(float8 a, float8 b) +{ + return compensated_sum( + compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)), + compensated_sum(comp_prod(a.s2, b.s2), comp_prod(a.s3, b.s3))), + compensated_sum(compensated_sum(comp_prod(a.s4, b.s4), comp_prod(a.s5, b.s5)), + compensated_sum(comp_prod(a.s6, b.s6), comp_prod(a.s7, b.s7)))); +} + +// calculate a.b where a and b are float8 +static inline float2 comp_dot16(float16 a, float16 b) +{ + return compensated_sum( + compensated_sum( + compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)), + compensated_sum(comp_prod(a.s2, b.s2), comp_prod(a.s3, b.s3))), + compensated_sum(compensated_sum(comp_prod(a.s4, b.s4), comp_prod(a.s5, b.s5)), + compensated_sum(comp_prod(a.s6, b.s6), comp_prod(a.s7, b.s7)))), + compensated_sum( + compensated_sum(compensated_sum(comp_prod(a.s8, b.s8), comp_prod(a.s9, b.s9)), + compensated_sum(comp_prod(a.sa, b.sa), comp_prod(a.sb, b.sb))), + compensated_sum(compensated_sum(comp_prod(a.sc, b.sc), comp_prod(a.sd, b.sd)), + compensated_sum(comp_prod(a.se, b.se), comp_prod(a.sf, b.sf))))); +} diff --git a/src/silx/resources/opencl/linalg.cl 
b/src/silx/resources/opencl/linalg.cl new file mode 100644 index 0000000..8710528 --- /dev/null +++ b/src/silx/resources/opencl/linalg.cl @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +/** + * + * Compute the spatial gradient of an image. + * + * slice: input image + * slice_grad: output gradient + * sizeX: number of columns of the image + * sizeY: number of rows of the image + * + **/ +kernel void kern_gradient2D( + global float* slice, + global float2* slice_grad, + int sizeX, + int sizeY) +{ + + int gidx = (int) get_global_id(0); + int gidy = (int) get_global_id(1); + + if ((gidx < sizeX) && (gidy < sizeY)) + { + // Note the direction inconstancy ! (JK 07/2018) + + float val_y = (gidx == (sizeX-1))? 0: slice[gidy*sizeX+gidx+1] - slice[gidy*sizeX+gidx]; + float val_x = (gidy == (sizeY-1))? 
0: slice[(gidy+1)*sizeX+gidx] - slice[(gidy)*sizeX+gidx]; + + slice_grad[gidy*sizeX+gidx].x = val_x; + slice_grad[gidy*sizeX+gidx].y = val_y; + } +} + +/** + * + * Compute the spatial divergence of an image gradient. + * + * slice_grad: input gradient-like image + * slice: output image + * sizeX: number of columns of the input + * sizeY: number of rows of the input + * + **/ +kernel void kern_divergence2D( + global float2* slice_grad, + global float* slice, + int sizeX, + int sizeY) +{ + int gidx = (int) get_global_id(0); + int gidy = (int) get_global_id(1); + + if (gidx < sizeX && gidy < sizeY) + { + float val_x, val_y; + val_y = (gidx == 0)? + slice_grad[(gidy)*sizeX+gidx].y : + slice_grad[(gidy)*sizeX+gidx].y - slice_grad[(gidy)*sizeX+gidx-1].y; + val_x = (gidy == 0)? + slice_grad[(gidy)*sizeX+gidx].x: + slice_grad[(gidy)*sizeX+gidx].x - slice_grad[(gidy-1)*sizeX+gidx].x; + slice[gidy*sizeX+gidx] = val_x + val_y; + } +} diff --git a/src/silx/resources/opencl/medfilt.cl b/src/silx/resources/opencl/medfilt.cl new file mode 100644 index 0000000..0680230 --- /dev/null +++ b/src/silx/resources/opencl/medfilt.cl @@ -0,0 +1,141 @@ +/* + * Project: Azimuthal regroupping OpenCL kernel for PyFAI. + * Median filter for 1D, 2D and 3D datasets, only 2D for now + * + * + * Copyright (C) 2017-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. 
Kieffer (kieffer@esrf.fr) + * Last revision: 07/02/2017 + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +/* + * Needs to be concatenated with bitonic.cl prior to compilation +*/ + +/* + * Perform the 2D median filtering of an image 2D image + * + * dim0 => wg=number_of_element in the tile /8 + * dim1 = x: wg=1 + + * + * Actually the workgoup size is a bit more complicated: + * if window size = 1,3,5,7: WG1=8 + * if window size = 9,11,13,15: WG1=32 + * if window size = 17, ...,21: WG1=64 + * + * More Generally the workgroup size must be: at least: kfs2*(kfs1+7)/8 + * Each thread treats 8 values aligned vertically, this allows (almost) + * coalesced reading between threads in one line of the tile. + * + * Later on it will be more efficient to re-use the same tile + * and slide it vertically by one line. + * The additionnal need for shared memory will be kfs2 floats and a float8 as register. 
Theoretically, it should be possible to handle up to window-size 83x83
+ l_data[threadid].s0 = input.vec.s0; + + //Offset to the bottom + input.vec = (float8)(input.vec.s1, + input.vec.s2, + input.vec.s3, + input.vec.s4, + input.vec.s5, + input.vec.s6, + input.vec.s7, + MAXFLOAT); + + barrier(CLK_LOCAL_MEM_FENCE); + + int read_from = threadid + kfs2; + if (read_from < nb_threads) + input.vec.s7 = l_data[read_from].s0; + else if (threadid < nb_threads) //we are on the last band + { + int pos_y = clamp((int)(y + 8 * band_nr + max_vec - 1 - khs1), (int) 0, (int) height-1); + input.ary[max_vec - 1] = image[pos_x + width * pos_y]; + } + + } + + //This function is defined in bitonic.cl + output.vec = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0), + input.vec, l_data); + + size_t target = (kfs1 * kfs2) / 2; + if (threadid == (target / 8)) + { + result[y * width + x] = output.ary[target % 8]; + } + + } + } +} + diff --git a/src/silx/resources/opencl/preprocess.cl b/src/silx/resources/opencl/preprocess.cl new file mode 100644 index 0000000..de35c48 --- /dev/null +++ b/src/silx/resources/opencl/preprocess.cl @@ -0,0 +1,567 @@ +/* + * Project: Azimuthal regroupping OpenCL kernel for PyFAI. + * Preprocessing program + * + * + * Copyright (C) 2012-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. Kieffer (kieffer@esrf.fr) + * Last revision: 19/01/2017 + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +/** + * \file + * + * \brief OpenCL kernels for image array casting, array mem-setting and normalizing + * + * Constant to be provided at build time: + * NIMAGE: size of the image + * + */ + +#include "for_eclipse.h" + +/** + * \brief cast values of an array of int8 into a float output array. + * + * - array_s8: Pointer to global memory with the input data as signed8 array + * - array_float: Pointer to global memory with the output data as float array + */ +__kernel void +s8_to_float(__global char *array_s8, + __global float *array_float +) +{ + int i = get_global_id(0); + //Global memory guard for padding + if (i < NIMAGE) + array_float[i] = (float)array_s8[i]; +} + + +/** + * \brief cast values of an array of uint8 into a float output array. + * + * - array_u8: Pointer to global memory with the input data as unsigned8 array + * - array_float: Pointer to global memory with the output data as float array + */ +__kernel void +u8_to_float(__global unsigned char *array_u8, + __global float *array_float +) +{ + int i = get_global_id(0); + //Global memory guard for padding + if (i < NIMAGE) + array_float[i] = (float)array_u8[i]; +} + + +/** + * \brief cast values of an array of int16 into a float output array. 
+ * + * - array_s16: Pointer to global memory with the input data as signed16 array + * - array_float: Pointer to global memory with the output data as float array + */ +__kernel void +s16_to_float(__global short *array_s16, + __global float *array_float +) +{ + int i = get_global_id(0); + //Global memory guard for padding + if (i < NIMAGE) + array_float[i] = (float)array_s16[i]; +} + + +/** + * \brief cast values of an array of uint16 into a float output array. + * + * - array_u16: Pointer to global memory with the input data as unsigned16 array + * - array_float: Pointer to global memory with the output data as float array + */ +__kernel void +u16_to_float(__global unsigned short *array_u16, + __global float *array_float +) +{ + int i = get_global_id(0); + //Global memory guard for padding + if (i < NIMAGE) + array_float[i] = (float)array_u16[i]; +} + +/** + * \brief cast values of an array of uint32 into a float output array. + * + * - array_u32: Pointer to global memory with the input data as unsigned32 array + * - array_float: Pointer to global memory with the output data as float array + */ +__kernel void +u32_to_float(__global unsigned int *array_u32, + __global float *array_float +) +{ + int i = get_global_id(0); + //Global memory guard for padding + if (i < NIMAGE) + array_float[i] = (float)array_u32[i]; +} + +/** + * \brief convert values of an array of int32 into a float output array. + * + * - array_int: Pointer to global memory with the data as unsigned32 array + * - array_float: Pointer to global memory with the data float array + */ +__kernel void +s32_to_float(__global int *array_int, + __global float *array_float +) +{ + int i = get_global_id(0); + //Global memory guard for padding + if (i < NIMAGE) + array_float[i] = (float)(array_int[i]); +} + + +/** + * Functions to be called from an actual kernel. 
+ * \brief Performs the normalization of input image by dark subtraction, + * variance is propagated to second member of the float3 + * flatfield, solid angle, polarization and absorption are stored in + * third member of the float3 returned. + * + * Invalid/Dummy pixels will all have the third-member set to 0, i.e. no weight + * + * - image Float pointer to global memory storing the input image. + * - do_dark Bool/int: shall dark-current correction be applied ? + * - dark Float pointer to global memory storing the dark image. + * - do_flat Bool/int: shall flat-field correction be applied ? + * - flat Float pointer to global memory storing the flat image. + * - do_solidangle Bool/int: shall flat-field correction be applied ? + * - solidangle Float pointer to global memory storing the solid angle of each pixel. + * - do_polarization Bool/int: shall polarization correction be applied ? + * - polarization Float pointer to global memory storing the polarization of each pixel. + * - do_absorption Bool/int: shall absorption correction be applied ? + * - absorption Float pointer to global memory storing the effective absoption of each pixel. + * - do_mask perform mask correction ? + * - mask Bool/char pointer to mask array + * - do_dummy Bool/int: shall the dummy pixel be checked. 
Dummy pixel are pixels marked as bad and ignored + * - dummy Float: value for bad pixels + * - delta_dummy Float: precision for bad pixel value + * - normalization_factor : divide the input by this value + * +**/ + +static float3 _preproc3(const __global float *image, + const __global float *variance, + const char do_dark, + const __global float *dark, + const char do_dark_variance, + const __global float *dark_variance, + const char do_flat, + const __global float *flat, + const char do_solidangle, + const __global float *solidangle, + const char do_polarization, + const __global float *polarization, + const char do_absorption, + const __global float *absorption, + const char do_mask, + const __global char *mask, + const char do_dummy, + const float dummy, + const float delta_dummy, + const float normalization_factor) +{ + size_t i= get_global_id(0); + float3 result = (float3)(0.0, 0.0, 0.0); + if (i < NIMAGE) + { + if ((!do_mask) || (!mask[i])) + { + result.s0 = image[i]; + if (variance != 0) + result.s1 = variance[i]; + result.s2 = normalization_factor; + if ( (!do_dummy) + ||((delta_dummy != 0.0f) && (fabs(result.s0-dummy) > delta_dummy)) + ||((delta_dummy == 0.0f) && (result.s0 != dummy))) + { + if (do_dark) + result.s0 -= dark[i]; + if (do_dark_variance) + result.s1 += dark_variance[i]; + if (do_flat) + { + float one_flat = flat[i]; + if ( (!do_dummy) + ||((delta_dummy != 0.0f) && (fabs(one_flat-dummy) > delta_dummy)) + ||((delta_dummy == 0.0f) && (one_flat != dummy))) + result.s2 *= one_flat; + else + result.s2 = 0.0f; + } + if (do_solidangle) + result.s2 *= solidangle[i]; + if (do_polarization) + result.s2 *= polarization[i]; + if (do_absorption) + result.s2 *= absorption[i]; + if (isnan(result.s0) || isnan(result.s1) || isnan(result.s2) || (result.s2 == 0.0f)) + result = (float3)(0.0, 0.0, 0.0); + } + else + { + result = (float3)(0.0, 0.0, 0.0); + }//end if do_dummy + } // end if mask + };//end if NIMAGE + return result; +};//end function + + +/** + * 
\brief Performs the normalization of input image by dark subtraction, + * flatfield, solid angle, polarization and absorption division. + * + * Intensities of images are corrected by: + * - dark (read-out) noise subtraction + * - Solid angle correction (division) + * - polarization correction (division) + * - flat fiels correction (division) + * Corrections are made in place unless the pixel is dummy. + * Dummy pixels are left untouched so that they remain dummy + * + * - image Float pointer to global memory storing the input image. + * - do_dark Bool/int: shall dark-current correction be applied ? + * - dark Float pointer to global memory storing the dark image. + * - do_flat Bool/int: shall flat-field correction be applied ? + * - flat Float pointer to global memory storing the flat image. + * - do_solidangle Bool/int: shall flat-field correction be applied ? + * - solidangle Float pointer to global memory storing the solid angle of each pixel. + * - do_polarization Bool/int: shall polarization correction be applied ? + * - polarization Float pointer to global memory storing the polarization of each pixel. + * - do_absorption Bool/int: shall absorption correction be applied ? + * - absorption Float pointer to global memory storing the effective absoption of each pixel. + * - do_mask perform mask correction ? + * - mask Bool/char pointer to mask array + * - do_dummy Bool/int: shall the dummy pixel be checked. 
Dummy pixel are pixels marked as bad and ignored + * - dummy Float: value for bad pixels + * - delta_dummy Float: precision for bad pixel value + * - normalization_factor : divide the input by this value + * +**/ + +__kernel void +corrections(const __global float *image, + const char do_dark, + const __global float *dark, + const char do_flat, + const __global float *flat, + const char do_solidangle, + const __global float *solidangle, + const char do_polarization, + const __global float *polarization, + const char do_absorption, + const __global float *absorption, + const char do_mask, + const __global char *mask, + const char do_dummy, + const float dummy, + const float delta_dummy, + const float normalization_factor, + __global float *output + ) +{ + size_t i= get_global_id(0); + float3 result = (float3)(0.0, 0.0, 0.0); + if (i < NIMAGE) + { + result = _preproc3(image, + 0, + do_dark, + dark, + 0, + 0, + do_flat, + flat, + do_solidangle, + solidangle, + do_polarization, + polarization, + do_absorption, + absorption, + do_mask, + mask, + do_dummy, + dummy, + delta_dummy, + normalization_factor); + if (result.s2 != 0.0f) + output[i] = result.s0 / result.s2; + else + output[i] = dummy; + };//end if NIMAGE + +};//end kernel + + +/** + * \brief Performs Normalization of input image with float2 output (num,denom) + * + * Intensities of images are corrected by: + * - dark (read-out) noise subtraction for the data + * - Solid angle correction (denominator) + * - polarization correction (denominator) + * - flat fiels correction (denominator) + * + * Corrections are made out of place. + * Dummy pixels set both the numerator and denominator to 0 + * + * - image Float pointer to global memory storing the input image. + * - do_dark Bool/int: shall dark-current correction be applied ? + * - dark Float pointer to global memory storing the dark image. + * - do_flat Bool/int: shall flat-field correction be applied ? 
+ * - flat Float pointer to global memory storing the flat image. + * - do_solidangle Bool/int: shall flat-field correction be applied ? + * - solidangle Float pointer to global memory storing the solid angle of each pixel. + * - do_polarization Bool/int: shall flat-field correction be applied ? + * - polarization Float pointer to global memory storing the polarization of each pixel. + * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored + * - dummy Float: value for bad pixels + * - delta_dummy Float: precision for bad pixel value + * - normalization_factor : divide the input by this value + * + * +**/ +__kernel void +corrections2(const __global float *image, + const char do_dark, + const __global float *dark, + const char do_flat, + const __global float *flat, + const char do_solidangle, + const __global float *solidangle, + const char do_polarization, + const __global float *polarization, + const char do_absorption, + const __global float *absorption, + const char do_mask, + const __global char *mask, + const char do_dummy, + const float dummy, + const float delta_dummy, + const float normalization_factor, + __global float2 *output + ) +{ + size_t i= get_global_id(0); + float3 result = (float3)(0.0, 0.0, 0.0); + if (i < NIMAGE) + { + result = _preproc3(image, + 0, + do_dark, + dark, + 0, + 0, + do_flat, + flat, + do_solidangle, + solidangle, + do_polarization, + polarization, + do_absorption, + absorption, + do_mask, + mask, + do_dummy, + dummy, + delta_dummy, + normalization_factor); + output[i] = (float2)(result.s0, result.s2); + };//end if NIMAGE +};//end kernel + +/** + * \brief Performs Normalization of input image with float3 output (signal, variance, normalization) assuming poissonian signal + * + * Intensities of images are corrected by: + * - dark (read-out) noise subtraction for the data + * - Solid angle correction (denominator) + * - polarization correction (denominator) + * - flat fiels correction 
(denominator) + * + * Corrections are made out of place. + * Dummy pixels set both the numerator and denominator to 0 + * + * - image Float pointer to global memory storing the input image. + * - do_dark Bool/int: shall dark-current correction be applied ? + * - dark Float pointer to global memory storing the dark image. + * - do_flat Bool/int: shall flat-field correction be applied ? + * - flat Float pointer to global memory storing the flat image. + * - do_solidangle Bool/int: shall flat-field correction be applied ? + * - solidangle Float pointer to global memory storing the solid angle of each pixel. + * - do_polarization Bool/int: shall flat-field correction be applied ? + * - polarization Float pointer to global memory storing the polarization of each pixel. + * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored + * - dummy Float: value for bad pixels + * - delta_dummy Float: precision for bad pixel value + * - normalization_factor : divide the input by this value + * + * +**/ +__kernel void +corrections3Poisson( const __global float *image, + const char do_dark, + const __global float *dark, + const char do_flat, + const __global float *flat, + const char do_solidangle, + const __global float *solidangle, + const char do_polarization, + const __global float *polarization, + const char do_absorption, + const __global float *absorption, + const char do_mask, + const __global char *mask, + const char do_dummy, + const float dummy, + const float delta_dummy, + const float normalization_factor, + __global float3 *output + ) +{ + size_t i= get_global_id(0); + float3 result = (float3)(0.0, 0.0, 0.0); + if (i < NIMAGE) + { + result = _preproc3(image, + image, + do_dark, + dark, + do_dark, + dark, + do_flat, + flat, + do_solidangle, + solidangle, + do_polarization, + polarization, + do_absorption, + absorption, + do_mask, + mask, + do_dummy, + dummy, + delta_dummy, + normalization_factor); + output[i] = result; + 
};//end if NIMAGE +};//end kernel + + +/** + * \brief Performs Normalization of input image with float3 output (signal, variance, normalization) + * + * Intensities of images are corrected by: + * - dark (read-out) noise subtraction for the data + * - Solid angle correction (division) + * - polarization correction (division) + * - flat fiels correction (division) + * Corrections are made in place unless the pixel is dummy. + * Dummy pixels are left untouched so that they remain dummy + * + * - image Float pointer to global memory storing the input image. + * - do_dark Bool/int: shall dark-current correction be applied ? + * - dark Float pointer to global memory storing the dark image. + * - do_flat Bool/int: shall flat-field correction be applied ? + * - flat Float pointer to global memory storing the flat image. + * - do_solidangle Bool/int: shall flat-field correction be applied ? + * - solidangle Float pointer to global memory storing the solid angle of each pixel. + * - do_polarization Bool/int: shall flat-field correction be applied ? + * - polarization Float pointer to global memory storing the polarization of each pixel. + * - do_dummy Bool/int: shall the dummy pixel be checked. 
Dummy pixel are pixels marked as bad and ignored + * - dummy Float: value for bad pixels + * - delta_dummy Float: precision for bad pixel value + * - normalization_factor : divide the input by this value + * + * +**/ + +__kernel void +corrections3(const __global float *image, + const __global float *variance, + const char do_dark, + const __global float *dark, + const char do_dark_variance, + const __global float *dark_variance, + const char do_flat, + const __global float *flat, + const char do_solidangle, + const __global float *solidangle, + const char do_polarization, + const __global float *polarization, + const char do_absorption, + const __global float *absorption, + const char do_mask, + const __global char *mask, + const char do_dummy, + const float dummy, + const float delta_dummy, + const float normalization_factor, + __global float3 *output + ) +{ + size_t i= get_global_id(0); + float3 result = (float3)(0.0, 0.0, 0.0); + if (i < NIMAGE) + { + result = _preproc3( image, + variance, + do_dark, + dark, + do_dark_variance, + dark_variance, + do_flat, + flat, + do_solidangle, + solidangle, + do_polarization, + polarization, + do_absorption, + absorption, + do_mask, + mask, + do_dummy, + dummy, + delta_dummy, + normalization_factor); + output[i] = result; + };//end if NIMAGE +};//end kernel + + diff --git a/src/silx/resources/opencl/proj.cl b/src/silx/resources/opencl/proj.cl new file mode 100644 index 0000000..2a6d870 --- /dev/null +++ b/src/silx/resources/opencl/proj.cl @@ -0,0 +1,345 @@ +/* + * Copyright (C) 2017-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Based on the projector of PyHST2 - https://forge.epn-campus.eu/projects/pyhst2 + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, 
 * distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */

/*******************************************************************************/
/************************ GPU VERSION (with textures) **************************/
/*******************************************************************************/

#ifndef DONT_USE_TEXTURES
// Joseph-method forward projector: computes one sinogram value per work-item
// by summing bilinearly-interpolated slice samples along a ray.
// Work-groups are 16x16: x indexes the detector bin, y the projection angle.
kernel void forward_kernel(
    global float *d_Sino,              // output sinogram, dimrecx * dimrecy
    read_only image2d_t d_slice,       // input slice as a 2D image (texture)
    int dimslice,                      // slice side length (without the 2-pixel border)
    int num_bins,                      // number of detector bins per projection
    global float* angles_per_project , // projection angle for each row of the sinogram
    float axis_position,               // rotation-axis position (in pixels)
    global float *d_axis_corrections,  // per-projection axis correction
    global int* d_beginPos ,           // per-projection ray start positions (2 x dimrecy)
    global int* d_strideJoseph,        // per-projection strides along the Joseph direction (2 x dimrecy)
    global int* d_strideLine ,         // per-projection strides along the line direction (2 x dimrecy)
    int num_projections,
    int dimrecx,                       // sinogram row pitch
    int dimrecy,
    float cpu_offset_x,
    float cpu_offset_y,                // NOTE(review): unused in this kernel; kept for interface symmetry
    int josephnoclip,                  // non-zero: rays never leave the slice, skip the clipping test
    int normalize)                     // non-zero: scale by pi/(2*num_projections)
{
    // CLAMP_TO_EDGE + LINEAR: out-of-range reads clamp, in-range reads interpolate.
    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
    const int tidx = get_local_id(0);
    const int bidx = get_group_id(0);
    const int tidy = get_local_id(1);
    const int bidy = get_group_id(1);
    float angle;
    float cos_angle,sin_angle ;

    local float corrections[16];
    local int beginPos[16*2];
    local int strideJoseph[16*2];
    local int strideLine[16*2];

    // Cooperative load of the per-projection tables into local memory:
    // each tidy "row" of the first rows loads one table half.
    int offset, OFFSET;
    switch(tidy) {
        case 0:
            corrections[ tidx ]= d_axis_corrections[ bidy*16+tidx];
            break;
        case 1:
        case 2:
            offset = 16*(tidy-1);
            OFFSET = dimrecy*(tidy-1);
            beginPos [offset + tidx ]= d_beginPos[ OFFSET+ bidy*16+tidx] ;
            break;
        case 3:
        case 4:
            offset = 16*(tidy-3);
            OFFSET = dimrecy*(tidy-3);
            strideJoseph[offset + tidx ]= d_strideJoseph[OFFSET + bidy*16+tidx] ;
            break;
        case 5:
        case 6:
            offset = 16*(tidy-5);
            OFFSET = dimrecy*(tidy-5);
            strideLine[offset + tidx ]= d_strideLine[OFFSET + bidy*16+tidx] ;
            break;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    angle = angles_per_project[ bidy*16+tidy ] ;
    cos_angle = cos(angle);
    sin_angle = sin(angle);

    // Fold the angle into the octant where |cos| >= 1/sqrt(2) and cos > 0,
    // so that the division by cos_angle below is well-conditioned.
    if(fabs(cos_angle) > 0.70710678f ) {
        if( cos_angle>0) {
            cos_angle = cos(angle);
            sin_angle = sin(angle);
        }
        else {
            cos_angle = -cos(angle);
            sin_angle = -sin(angle);
        }
    }
    else {
        if( sin_angle>0) {
            cos_angle = sin(angle);
            sin_angle = -cos(angle);
        }
        else {
            cos_angle = -sin(angle);
            sin_angle = cos(angle);
        }
    }
    float res=0.0f;
    float axis_corr = axis_position + corrections[ tidy ];
    float axis = axis_position ;
    float xpix = ( bidx*16+tidx )-cpu_offset_x;
    // Starting position of the ray along the line direction.
    float posx = axis*(1.0f-sin_angle/cos_angle ) +(xpix-(axis_corr) )/cos_angle ;

    float shiftJ = sin_angle/cos_angle; // per-step drift along the line direction
    float x1 = min(-sin_angle/cos_angle ,0.f);
    float x2 = max(-sin_angle/cos_angle ,0.f);

    // Ray-length weighting factor (1/cos of the folded angle).
    float Area;
    Area=1.0f/cos_angle;
    int stlA, stlB , stlAJ, stlBJ ;
    stlA=strideLine[16+tidy];
    stlB=strideLine[tidy];
    stlAJ=strideJoseph[16+tidy];
    stlBJ=strideJoseph[tidy];

    int beginA = beginPos[16+tidy ];
    int beginB = beginPos[tidy ];
    float add;
    int l;

    if(josephnoclip) {
        for(int j=0; j<dimslice; j++) { // j: Joseph
            x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f;
            x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f;
            // Hardware bilinear interpolation via the LINEAR sampler.
            add = read_imagef(d_slice, sampler, (float2) (x1, x2)).x;
            res += add;
            posx += shiftJ;
        }
    }
    else {
        for(int j=0; j<dimslice; j++) { // j: Joseph
            x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f;
            x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f;
            // l is 1 iff (x1, x2) falls inside the padded slice, 0 otherwise;
            // used to zero out samples that fall outside the slice.
            l = (x1>=0.0f )*(x1<(dimslice+2))*( x2>=0.0f)*( x2<(dimslice+2) ) ;
            add = read_imagef(d_slice, sampler, (float2) (x1, x2)).x;
            res += add*l;
            posx += shiftJ;
        }
    }

    // Write only sinogram entries that exist (grid may be oversized).
    if((bidy*16 + tidy) < num_projections && (bidx*16 + tidx) < num_bins) {
        res *= Area;
        if (normalize)
            res *= M_PI_F * 0.5f / num_projections;
        d_Sino[dimrecx*(bidy*16 + tidy) + (bidx*16 + tidx)] = res;
    }
}
#endif


/*******************************************************************************/
/********************* CPU VERSION (without textures) **************************/
/*******************************************************************************/

// Same Joseph forward projector, but the slice is a plain buffer and the
// bilinear interpolation is done by hand (for devices without image support).
kernel void forward_kernel_cpu(
    global float *d_Sino,
    global float* d_slice,             // input slice as a flat buffer, pitch dimslice+2
    int dimslice,
    int num_bins,
    global float* angles_per_project ,
    float axis_position,
    global float *d_axis_corrections,
    global int* d_beginPos ,
    global int* d_strideJoseph,
    global int* d_strideLine ,
    int num_projections,
    int dimrecx,
    int dimrecy,
    float cpu_offset_x,
    float cpu_offset_y,                // NOTE(review): unused in this kernel as well
    int josephnoclip,
    int normalize)
{

    const int tidx = get_local_id(0);
    const int bidx = get_group_id(0);
    const int tidy = get_local_id(1);
    const int bidy = get_group_id(1);
    float angle;
    float cos_angle,sin_angle ;

    local float corrections[16];
    local int beginPos[16*2];
    local int strideJoseph[16*2];
    local int strideLine[16*2];

    // Cooperative load of the per-projection tables into local memory
    // (same layout as in forward_kernel).
    int offset, OFFSET;
    switch(tidy) {
        case 0:
            corrections[ tidx ]= d_axis_corrections[ bidy*16+tidx];
            break;
        case 1:
        case 2:
            offset = 16*(tidy-1);
            OFFSET = dimrecy*(tidy-1);
            beginPos [offset + tidx ]= d_beginPos[ OFFSET+ bidy*16+tidx] ;
            break;
        case 3:
        case 4:
            offset = 16*(tidy-3);
            OFFSET = dimrecy*(tidy-3);
            strideJoseph[offset + tidx ]= d_strideJoseph[OFFSET + bidy*16+tidx] ;
            break;
        case 5:
        case 6:
            offset = 16*(tidy-5);
            OFFSET = dimrecy*(tidy-5);
            strideLine[offset + tidx ]= d_strideLine[OFFSET + bidy*16+tidx] ;
            break;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    angle = angles_per_project[ bidy*16+tidy ] ;
    cos_angle = cos(angle);
    sin_angle = sin(angle);

    // Octant folding, identical to the texture version.
    if(fabs(cos_angle) > 0.70710678f ) {
        if( cos_angle>0) {
            cos_angle = cos(angle);
            sin_angle = sin(angle);
        }
        else {
            cos_angle = -cos(angle);
            sin_angle = -sin(angle);
        }
    }
    else {
        if( sin_angle>0) {
            cos_angle = sin(angle);
            sin_angle = -cos(angle);
        }
        else {
            cos_angle = -sin(angle);
            sin_angle = cos(angle);
        }
    }
    float res=0.0f;
    float axis_corr = axis_position + corrections[ tidy ];
    float axis = axis_position ;
    float xpix = ( bidx*16+tidx )-cpu_offset_x;
    float posx = axis*(1.0f-sin_angle/cos_angle ) +(xpix-(axis_corr) )/cos_angle ;

    float shiftJ = sin_angle/cos_angle;
    float x1 = min(-sin_angle/cos_angle ,0.f);
    float x2 = max(-sin_angle/cos_angle ,0.f);

    float Area;
    Area=1.0f/cos_angle;
    int stlA, stlB , stlAJ, stlBJ ;
    stlA=strideLine[16+tidy];
    stlB=strideLine[tidy];
    stlAJ=strideJoseph[16+tidy];
    stlBJ=strideJoseph[tidy];

    int beginA = beginPos[16+tidy ];
    int beginB = beginPos[tidy ];
    int l;

    int ym, yp, xm, xp;
    float yc, xc;
    float val;
    if(josephnoclip) {
        for(int j=0; j<dimslice; j++) { // j: Joseph
            // NOTE(review): the offset here is +1.0f while the texture version
            // uses +1.5f (the sampler adds an implicit half-pixel) — confirm.
            x1 = beginA +(posx)*stlA + (j)*stlAJ+1.0f;
            x2 = beginB +(posx)*stlB + (j)*stlBJ+1.0f;
            /*
                Bilinear interpolation, clamped to the padded slice bounds
                (emulates CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR).
            */
            yc = fmin(fmax(x2, 0.0f), ((dimslice+2) - 1.0f)); // y_clipped
            ym = (int) floor(yc); // y_minus
            yp = (int) ceil(yc);  // y_plus

            xc = fmin(fmax(x1, 0.0f), ((dimslice+2) - 1.0f)); // x_clipped
            xm = (int) floor(xc); // x_minus
            xp = (int) ceil(xc);  // x_plus

            // Degenerate cases first (exact grid hit along one or both axes),
            // then the general 4-point bilinear blend.
            if ((ym == yp) && (xm == xp)) val = d_slice[ym*(dimslice+2) + xm];
            else if (ym == yp) val = (d_slice[ym*(dimslice+2) + xm] * (xp - xc)) + (d_slice[ym*(dimslice+2) + xp] * (xc - xm));
            else if (xm == xp) val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc)) + (d_slice[yp*(dimslice+2) + xm] * (yc - ym));
            else val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc) * (xp - xc))
                     + (d_slice[yp*(dimslice+2) + xm] * (yc - ym) * (xp - xc))
                     + (d_slice[ym*(dimslice+2) + xp] * (yp - yc) * (xc - xm))
                     + (d_slice[yp*(dimslice+2) + xp] * (yc - ym) * (xc - xm));
            // ----------
            res += val;
            posx += shiftJ;
        }
    }
    else {
        for(int j=0; j<dimslice; j++) { // j: Joseph
            x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f;
            x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f;
            // Inside-slice mask, as in the texture version.
            l = (x1>=0.0f )*(x1<(dimslice+2))*( x2>=0.0f)*( x2<(dimslice+2) ) ;
            /*
                Bilinear interpolation (same as above).
            */
            yc = fmin(fmax(x2, 0.0f), ((dimslice+2) - 1.0f)); // y_clipped
            ym = (int) floor(yc); // y_minus
            yp = (int) ceil(yc);  // y_plus

            xc = fmin(fmax(x1, 0.0f), ((dimslice+2) - 1.0f)); // x_clipped
            xm = (int) floor(xc); // x_minus
            xp = (int) ceil(xc);  // x_plus

            if ((ym == yp) && (xm == xp)) val = d_slice[ym*(dimslice+2) + xm];
            else if (ym == yp) val = (d_slice[ym*(dimslice+2) + xm] * (xp - xc)) + (d_slice[ym*(dimslice+2) + xp] * (xc - xm));
            else if (xm == xp) val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc)) + (d_slice[yp*(dimslice+2) + xm] * (yc - ym));
            else val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc) * (xp - xc))
                     + (d_slice[yp*(dimslice+2) + xm] * (yc - ym) * (xp - xc))
                     + (d_slice[ym*(dimslice+2) + xp] * (yp - yc) * (xc - xm))
                     + (d_slice[yp*(dimslice+2) + xp] * (yc - ym) * (xc - xm));
            // ----------
            res += val*l;
            posx += shiftJ;
        }
    }

    if((bidy*16 + tidy) < num_projections && (bidx*16 + tidx) < num_bins) {
        res *= Area;
        if (normalize)
            res *= M_PI_F * 0.5f / num_projections;
        d_Sino[dimrecx*(bidy*16 + tidy) + (bidx*16 + tidx)] = res;
    }
}

/* ========================= sparse.cl ========================= */
/*
 * Copyright (C) 2016-2019 European Synchrotron Radiation Facility
 * Grenoble, France
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#ifndef IMAGE_WIDTH
    #error "Please define IMAGE_WIDTH parameter"
#endif

#ifndef DTYPE
    #error "Please define DTYPE parameter"
#endif

#ifndef IDX_DTYPE
    #error "Please define IDX_DTYPE parameter"
#endif



/**
 * Densify a matrix from CSR format to "dense" 2D format.
 * The input CSR data consists in 3 arrays: (data, ind, iptr).
 * The output array is a 2D array of dimensions IMAGE_WIDTH * image_height.
 *
 * data: 1D array containing the nonzero data items.
 * ind: 1D array containing the column indices of the nonzero data items.
 * iptr: 1D array containing indirection indices, such that range
 *       [iptr[i], iptr[i+1]-1] of "data" and "ind" contain the relevant data
 *       of output row "i".
 * output: 2D array containing the densified data.
 * image_height: height (number of rows) of the output data.
 *
 * One work-group handles one output row; work-items stride over the columns.
**/

kernel void densify_csr(
    const global DTYPE* data,
    const global IDX_DTYPE* ind,
    const global IDX_DTYPE* iptr,
    global DTYPE* output,
    int image_height
)
{
    uint tid = get_local_id(0);
    uint row_idx = get_global_id(1);

    // FIX: do not return early here. barrier() inside divergent control flow
    // is undefined behavior in OpenCL — every work-item of the work-group must
    // reach both barriers below. Guard the memory work instead.
    bool active = (tid < IMAGE_WIDTH) && (row_idx < (uint) image_height);

    local DTYPE line[IMAGE_WIDTH];

    // Memset: zero the shared line cooperatively.
    if (active) {
        for (uint k = tid; k < IMAGE_WIDTH; k += get_local_size(0)) {
            line[k] = (DTYPE) 0;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Scatter the nonzero entries of this row into the shared line:
    // write data[k] at column index ind[k] for k in [iptr[row], iptr[row+1]).
    if (active) {
        uint start = iptr[row_idx], end = iptr[row_idx + 1];
        for (uint k = start + tid; k < end; k += get_local_size(0)) {
            line[ind[k]] = data[k];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Write the completed line (local mem) into the output array (global mem).
    if (active) {
        for (uint k = tid; k < IMAGE_WIDTH; k += get_local_size(0)) {
            output[row_idx * IMAGE_WIDTH + k] = line[k];
        }
    }
}

/* ========================= statistics.cl =========================
 * Project: Silx statistics calculation
 *
 * Copyright (C) 2012-2021 European Synchrotron Radiation Facility
 * Grenoble, France
 *
 * Principal authors: J.
 * Kieffer (kieffer@esrf.fr)
 * Last revision: 17/05/2021
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/**
 * \file
 *
 * \brief OpenCL kernels for min, max, mean and std calculation
 *
 * This module provides two functions to perform the `map` and the `reduce`
 * steps used with the pyopencl reduction, computing in a single pass the
 * minimum, maximum, sum, count, mean and standard deviation of an array.
 *
 * Besides the reduction mechanism from pyopencl, this algorithm implements
 * equations from
 * https://dbs.ifi.uni-heidelberg.de/files/Team/eschubert/publications/SSDBM18-covariance-authorcopy.pdf
 *
 * Let A and B be two disjoint partitions of all elements:
 *
 *   Omega_A = sum_{i in A}(omega_i)  the sum of all weights
 *   V_A  is the weighted sum of the signal over the partition
 *   VV_A is the weighted sum of squared deviations
 *
 * With this, the mean is V / Omega and the variance equals VV / Omega.
 *
 * The reduction operator performs:
 *   Omega_AB = Omega_A + Omega_B
 *   V_AB  = V_A + V_B
 *   VV_AB = VV_A + VV_B + (Omega_A*V_B - Omega_B*V_A)^2 / (Omega_A * Omega_B * Omega_AB)
 *
 * To avoid numerical degradation, the double-word (compensated) arithmetic
 * library is used to perform all floating point operations.
 *
 */
#include "for_eclipse.h"

/* \brief read a value at the given position and initialize the float8 used by the reduction
 *
 * The returned float8 contains:
 *  s0: minimum value
 *  s1: maximum value
 *  s2: Omega_h  count of valid pixels (high word)
 *  s3: Omega_l  error associated to the count (low word)
 *  s4: V_h      sum of signal (high word)
 *  s5: V_l      error associated to the sum (low word)
 *  s6: VV_h     variance*count (high word)
 *  s7: VV_l     error associated to variance*count (low word)
 *
 * Non-finite values (NaN, +/-Inf) are skipped: they contribute a neutral
 * element (count 0, min FLT_MAX, max -FLT_MAX) to the reduction.
 */
static inline float8 map_statistics(global float* data, int position)
{
    float value = data[position];
    float8 result;

    if (isfinite(value))
    {
        result = (float8)(value, value, 1.0f, 0.0f, value, 0.0f, 0.0f, 0.0f);
        //                min    max   cnt_h  cnt_l  sum_h  sum_l  M2_h  M2_l
    }
    else
    {
        result = (float8)(FLT_MAX, -FLT_MAX, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        //                min      max       cnt_h cnt_l sum_h sum_l M2_h  M2_l
    }
    return result;
}

/* \brief reduction function associated to the statistics.
 *
 * This is described in:
 * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
 *
 * The float8 operands contain:
 *  s0: minimum value
 *  s1: maximum value
 *  s2: count of valid pixels (high word)
 *  s3: count (error associated to it, low word)
 *  s4: sum of valid pixels (high word)
 *  s5: sum (error associated to it, low word)
 *  s6: M2 = variance*count (high word)
 *  s7: M2 (error associated to it, low word)
 *
 * All arithmetic uses double-word (compensated) operations dw_plus_dw,
 * dw_times_dw and dw_div_dw to limit round-off accumulation.
 */

static inline float8 reduce_statistics(float8 a, float8 b)
{
    float2 sum_a, sum_b, M_a, M_b, count_a, count_b;
    float2 count, sum;
    float2 M, delta, delta2, omega3;
    // Empty partition a: the result is b unchanged.
    if (a.s2 == 0.0f)
    {
        return b;
    }
    else
    {
        count_a = (float2)(a.s2, a.s3);
        sum_a = (float2)(a.s4, a.s5);
        M_a = (float2)(a.s6, a.s7);
    }
    // Empty partition b: the result is a unchanged.
    if (b.s2 == 0.0f)
    {
        return a;
    }
    else
    {
        count_b = (float2)(b.s2, b.s3);
        sum_b = (float2)(b.s4, b.s5);
        M_b = (float2)(b.s6, b.s7);
    }
    // count = count_a + count_b
    count = dw_plus_dw(count_a, count_b);
    // sum = sum_a + sum_b
    sum = dw_plus_dw(sum_a, sum_b);

    // M2 = M_a + M_b + (Omega_A*V_B - Omega_B*V_A)^2 / (Omega_A * Omega_B * Omega_AB)
    // Note: delta below is the NEGATIVE of the documented numerator
    // (count_b*sum_a - count_a*sum_b); this is harmless since it is squared.
    M = dw_plus_dw(M_a, M_b);
    delta = dw_plus_dw(dw_times_dw(count_b, sum_a),
                      -dw_times_dw(count_a, sum_b));
    delta2 = dw_times_dw(delta, delta);
    omega3 = dw_times_dw(count, dw_times_dw(count_a, count_b));
    M = dw_plus_dw(M, dw_div_dw(delta2, omega3));

    float8 result = (float8)(min(a.s0, b.s0), max(a.s1, b.s1),
                             count.s0, count.s1,
                             sum.s0, sum.s1,
                             M.s0, M.s1);
    return result;
}

/* \brief reduction function associated to the statistics without compensated arithmetic.
+ * + * this is described in: + * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm + * + * The float8 used here contain contains: + * s0: minimum value + * s1: maximum value + * s2: count number of valid pixels (major) + * s3: count number of valid pixels (minor) + * s4: sum of valid pixels (major) + * s5: sum of valid pixels (minor) + * s6: variance*count (major) + * s7: variance*count (minor) + * + */ + +static inline float8 reduce_statistics_simple(float8 a, float8 b) +{ + float sum_a, sum_b, M_a, M_b, count_a, count_b; + + //test on count + if (a.s2 == 0.0f) + { + return b; + } + else + { + count_a = a.s2; + sum_a = a.s4; + M_a = a.s6; + } + //test on count + if (b.s2 == 0.0f) + { + return a; + } + else + { + count_b = b.s2; + sum_b = b.s4; + M_b = b.s6; + } + float count = count_a + count_b; + float sum = sum_a + sum_b; + float delta = sum_a*count_b - sum_b*count_a; + float delta2 = delta * delta; + float M2 = M_a + M_b + delta2/(count*count_a*count_b); + float8 result = (float8)(min(a.s0, b.s0), max(a.s1, b.s1), + count, 0.0f, + sum, 0.0f, + M2, 0.0f); + return result; +} + + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + + +// unpack a double in two floats such as the sum of the two is the double number +static inline float2 unpack_double(double inp){ + float major = (float) inp; + float minor = (float) (inp - major); + return (float2)(major, minor); +} + +// pack two floats into a double +static inline double pack_double(float major, float minor){ + return (double)major + (double)minor; +} + +/* \brief reduction function associated to the statistics using double precision arithmetics + * + * this is described in: + * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm + * + * The float8 used here contain contains: + * s0: minimum value + * s1: maximum value + * s2: count number of valid pixels (major) + * s3: count number of valid pixels (minor) + * s4: sum of valid 
pixels (major) + * s5: sum of valid pixels (minor) + * s6: variance*count (major) + * s7: variance*count (minor) + * + */ + +static inline float8 reduce_statistics_double(float8 a, float8 b) +{ + double sum_a, sum_b, M_a, M_b, count_a, count_b; + + //test on count + if (a.s2 == 0.0) + { + return b; + } + else + { + count_a = pack_double(a.s2, a.s3); + sum_a = pack_double(a.s4,a.s5); + M_a = pack_double(a.s6, a.s7); + } + //test on count + if (b.s2 == 0.0) + { + return a; + } + else + { + count_b = pack_double(b.s2, b.s3); + sum_b = pack_double(b.s4, b.s5); + M_b = pack_double(b.s6, b.s7); + } + double count = count_a + count_b; + double sum = sum_a + sum_b; + double delta = sum_a*count_b - sum_b*count_a; + double delta2 = delta * delta; + double M2 = M_a + M_b + delta2/(count*count_a*count_b); + float8 result = (float8)((float2)(min(a.s0, b.s0), max(a.s1, b.s1)), + unpack_double(count), + unpack_double( sum), + unpack_double( M2)); + return result; +} + +#endif
\ No newline at end of file |