20 files changed, 0 insertions, 4332 deletions
diff --git a/silx/resources/opencl/addition.cl b/silx/resources/opencl/addition.cl
deleted file mode 100644
index 35d7996..0000000
--- a/silx/resources/opencl/addition.cl
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- *   Project: SIFT: An algorithm for image alignement
- *
- *   Copyright (C) 2013-2017 European Synchrotron Radiation Facility
- *                           Grenoble, France
- *
- *   Principal authors: J. Kieffer (kieffer@esrf.fr)
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-
-// "Hello_world" kernel to test if OpenCL is actually working
-kernel void addition(global float* a,
-                     global float* b,
-                     global float* res,
-                     int N)
-{
-    int i = get_global_id(0);
-    if( i<N ){
-        res[i] = a[i] + b[i];
-    }
-}
diff --git a/silx/resources/opencl/array_utils.cl b/silx/resources/opencl/array_utils.cl
deleted file mode 100644
index 6f78921..0000000
--- a/silx/resources/opencl/array_utils.cl
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- *  2D Memcpy for float* arrays,
- * replacing pyopencl "enqueue_copy" which does not work for rectangular copies.
- * ALL THE SIZES/OFFSETS ARE SPECIFIED IN PIXELS, NOT IN BYTES.
- * In the (x, y) convention, x is the fast index (as in CUDA).
- *
- * :param dst: destination array
- * :param src: source array
- * :param dst_width: width of the dst array
- * :param src_width: width of the src array
- * :param dst_offset: tuple with the offset (x, y) in the dst array
- * :param src_offset: tuple with the offset (x, y) in the src array
- * :param transfer_shape: shape of the transfer array in the form (x, y)
- *
- */
-kernel void cpy2d(
-    global float* dst,
-    global float* src,
-    int dst_width,
-    int src_width,
-    int2 dst_offset,
-    int2 src_offset,
-    int2 transfer_shape)
-{
-    int gidx = get_global_id(0), gidy = get_global_id(1);
-    if (gidx < transfer_shape.x && gidy < transfer_shape.y) {
-        dst[(dst_offset.y + gidy)*dst_width + (dst_offset.x + gidx)] = src[(src_offset.y + gidy)*src_width + (src_offset.x + gidx)];
-    }
-}
-
-
-// Looks like cfloat_t and cfloat_mul are not working, yet specified in
-// pyopencl documentation. Here we are using float2 as in all available examples
-// #include <pyopencl-complex.h>
-// typedef cfloat_t complex;
-
-static inline float2 complex_mul(float2 a, float2 b) {
-    float2 res = (float2) (0, 0);
-    res.x = a.x * b.x - a.y * b.y;
-    res.y = a.y * b.x + a.x * b.y;
-    return res;
-}
-
-// arr2D *= arr1D (line by line, i.e along fast dim)
-kernel void inplace_complex_mul_2Dby1D(
-    global float2* arr2D,
-    global float2* arr1D,
-    int width,
-    int height)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-    if ((x >= width) || (y >= height)) return;
-    int i = y*width + x;
-    arr2D[i] = complex_mul(arr2D[i], arr1D[x]);
-}
-
-
-// arr3D *= arr1D (along fast dim)
-kernel void inplace_complex_mul_3Dby1D(
-    global float2* arr3D,
-    global float2* arr1D,
-    int width,
-    int height,
-    int depth)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-    int z = get_global_id(2);
-    if ((x >= width) || (y >= height) || (z >= depth)) return;
-    int i = (z*height + y)*width + x;
-    arr3D[i] = complex_mul(arr3D[i], arr1D[x]);
-}
diff --git a/silx/resources/opencl/backproj.cl b/silx/resources/opencl/backproj.cl
deleted file mode 100644
index da15131..0000000
--- a/silx/resources/opencl/backproj.cl
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- *   Project: silx: filtered backprojection
- *
- *   Copyright (C) 2016-2017 European Synchrotron Radiation Facility
- *                           Grenoble, France
- *
- *   Principal authors: A. Mirone
- *                      P. Paleo
- *
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-
-/*******************************************************************************/
-/************************ GPU VERSION (with textures) **************************/
-/*******************************************************************************/
-
-#ifndef DONT_USE_TEXTURES
-kernel void backproj_kernel(
-    int num_proj,
-    int num_bins,
-    float axis_position,
-    global float *d_SLICE,
-    read_only image2d_t d_sino,
-    float gpu_offset_x,
-    float gpu_offset_y,
-    global float * d_cos_s,  // precalculated cos(theta[i])
-    global float* d_sin_s,   // precalculated sin(theta[i])
-    global float* d_axis_s,  // array of axis positions (n_projs)
-    local float* shared2)    // 768B of local mem
-{
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
-    const int tidx = get_local_id(0); //threadIdx.x;
-    const int bidx = get_group_id(0); //blockIdx.x;
-    const int tidy = get_local_id(1); //threadIdx.y;
-    const int bidy = get_group_id(1); //blockIdx.y;
-
-    local float sh_cos[256];
-    local float sh_sin[256];
-    local float sh_axis[256];
-
-    float pcos, psin;
-    float h0, h1, h2, h3;
-    const float apos_off_x= gpu_offset_x - axis_position ;
-    const float apos_off_y= gpu_offset_y - axis_position ;
-    float acorr05;
-    float res0 = 0, res1 = 0, res2 = 0, res3 = 0;
-
-    const float bx00 = (32 * bidx + 2 * tidx + 0 + apos_off_x  ) ;
-    const float by00 = (32 * bidy + 2 * tidy + 0 + apos_off_y  ) ;
-
-    int read=0;
-    for(int proj=0; proj<num_proj; proj++) {
-        if(proj>=read) {
-            barrier(CLK_LOCAL_MEM_FENCE);
-            int ip = tidy*16+tidx;
-            if( read+ip < num_proj) {
-                sh_cos [ip] = d_cos_s[read+ip] ;
-                sh_sin [ip] = d_sin_s[read+ip] ;
-                sh_axis[ip] = d_axis_s[read+ip] ;
-            }
-            read=read+256; // 256=16*16 block size
-            barrier(CLK_LOCAL_MEM_FENCE);
-        }
-        pcos = sh_cos[256-read + proj] ;
-        psin = sh_sin[256-read + proj] ;
-
-        acorr05 = sh_axis[256 - read + proj] ;
-
-        h0 = (acorr05 + bx00*pcos - by00*psin);
-        h1 = (acorr05 + (bx00+0)*pcos - (by00+1)*psin);
-        h2 = (acorr05 + (bx00+1)*pcos - (by00+0)*psin);
-        h3 = (acorr05 + (bx00+1)*pcos - (by00+1)*psin);
-
-        if(h0>=0 && h0<num_bins) res0 += read_imagef(d_sino, sampler, (float2) (h0 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h0 +0.5f,proj +0.5f);
-        if(h1>=0 && h1<num_bins) res1 += read_imagef(d_sino, sampler, (float2) (h1 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h1 +0.5f,proj +0.5f);
-        if(h2>=0 && h2<num_bins) res2 += read_imagef(d_sino, sampler, (float2) (h2 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h2 +0.5f,proj +0.5f);
-        if(h3>=0 && h3<num_bins) res3 += read_imagef(d_sino, sampler, (float2) (h3 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h3 +0.5f,proj +0.5f);
-    }
-    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 0] = res0;
-    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 0] = res1;
-    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 1] = res2;
-    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 1] = res3;
-}
-#endif
-
-
-
-
-/*******************************************************************************/
-/********************* CPU VERSION (without textures) **************************/
-/*******************************************************************************/
-
-
-#define CLIP_MAX(x, N) (fmin(fmax(x, 0.0f), (N - 1.0f)))
-
-#define FLOORCEIL_x(x) {\
-    xm = (int) floor(x);\
-    xp = (int) ceil(x);\
-}
-
-#define ADJACENT_PIXELS_VALS(arr, Nx, y, xm, xp) ((float2) (arr[y*Nx+xm], arr[y*Nx+xp]))
-
-//Simple linear interpolator for working on the GPU
-static float linear_interpolation(float2 vals,
-                                  float x,
-                                  int xm,
-                                  int xp)
-{
-    if (xm == xp)
-        return vals.s0;
-    else
-        return (vals.s0 * (xp - x)) + (vals.s1 * (x - xm));
-}
-
-/**
- *
- *  Same kernel as backproj_kernel, but targets the CPU (no texture)
- *
-**/
-kernel void backproj_cpu_kernel(
-    int num_proj,
-    int num_bins,
-    float axis_position,
-    global float *d_SLICE,
-    global float* d_sino,
-    float gpu_offset_x,
-    float gpu_offset_y,
-    global float * d_cos_s, // precalculated cos(theta[i])
-    global float * d_sin_s, // precalculated sin(theta[i])
-    global float * d_axis_s, // array of axis positions (n_projs)
-    local float* shared2)     // 768B of local mem
-{
-    const int tidx = get_local_id(0); //threadIdx.x;
-    const int bidx = get_group_id(0); //blockIdx.x;
-    const int tidy = get_local_id(1); //threadIdx.y;
-    const int bidy = get_group_id(1); //blockIdx.y;
-
-    local float sh_cos[256];
-    local float sh_sin[256];
-    local float sh_axis[256];
-
-    float pcos, psin;
-    float h0, h1, h2, h3;
-    const float apos_off_x= gpu_offset_x - axis_position ;
-    const float apos_off_y= gpu_offset_y - axis_position ;
-    float acorr05;
-    float res0 = 0, res1 = 0, res2 = 0, res3 = 0;
-
-    const float bx00 = (32 * bidx + 2 * tidx + 0 + apos_off_x  ) ;
-    const float by00 = (32 * bidy + 2 * tidy + 0 + apos_off_y  ) ;
-
-    int read=0;
-    for(int proj=0; proj<num_proj; proj++) {
-        if(proj>=read) {
-            barrier(CLK_LOCAL_MEM_FENCE);
-            int ip = tidy*16+tidx;
-            if( read+ip < num_proj) {
-                sh_cos [ip] = d_cos_s[read+ip] ;
-                sh_sin [ip] = d_sin_s[read+ip] ;
-                sh_axis[ip] = d_axis_s[read+ip] ;
-            }
-            read=read+256; // 256=16*16 block size
-            barrier(CLK_LOCAL_MEM_FENCE);
-        }
-        pcos = sh_cos[256-read + proj] ;
-        psin = sh_sin[256-read + proj] ;
-
-        acorr05 = sh_axis[256 - read + proj] ;
-
-        h0 = (acorr05 + bx00*pcos - by00*psin);
-        h1 = (acorr05 + (bx00+0)*pcos - (by00+1)*psin);
-        h2 = (acorr05 + (bx00+1)*pcos - (by00+0)*psin);
-        h3 = (acorr05 + (bx00+1)*pcos - (by00+1)*psin);
-
-
-        float x;
-        int ym, xm, xp;
-        ym = proj;
-        float2 vals;
-
-        if(h0>=0 && h0<num_bins) {
-            x = CLIP_MAX(h0, num_bins);
-            FLOORCEIL_x(x);
-            vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp);
-            res0 += linear_interpolation(vals, x, xm, xp);
-        }
-        if(h1>=0 && h1<num_bins) {
-            x = CLIP_MAX(h1, num_bins);
-            FLOORCEIL_x(x);
-            vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp);
-            res1 += linear_interpolation(vals, x, xm, xp);
-        }
-        if(h2>=0 && h2<num_bins) {
-            x = CLIP_MAX(h2, num_bins);
-            FLOORCEIL_x(x);
-            vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp);
-            res2 += linear_interpolation(vals, x, xm, xp);
-        }
-        if(h3>=0 && h3<num_bins) {
-            x = CLIP_MAX(h3, num_bins);
-            FLOORCEIL_x(x);
-            vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp);
-            res3 += linear_interpolation(vals, x, xm, xp);
-        }
-    }
-    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 0] = res0;
-    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 0] = res1;
-    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 1] = res2;
-    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 1] = res3;
-}
-
diff --git a/silx/resources/opencl/backproj_helper.cl b/silx/resources/opencl/backproj_helper.cl
deleted file mode 100644
index b1590f8..0000000
--- a/silx/resources/opencl/backproj_helper.cl
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- *   Project: silx: backprojection helper functions
- *
- *   Copyright (C) 2016-2017 European Synchrotron Radiation Facility
- *                           Grenoble, France
- *
- *   Principal authors: P. Paleo
- *                      J. Kieffer (kieffer@esrf.fr)
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-
-kernel void mult(   global float2* d_sino,
-                    global float2* d_filter,
-                    int num_bins,
-                    int num_projs)
-{
-    int gid0 = get_global_id(0);
-    int gid1 = get_global_id(1);
-    if (gid0 < num_bins && gid1 < num_projs)
-    {
-        // d_sino[gid1*num_bins+gid0] *= d_filter[gid0];
-        d_sino[gid1*num_bins+gid0].x *= d_filter[gid0].x;
-        d_sino[gid1*num_bins+gid0].y *= d_filter[gid0].x;
-      }
-}
-
-// copy only the real part of the valid data to the real array
-kernel void cpy2d_c2r(
-                      global float* d_sino,
-                      global float2* d_sino_complex,
-                      int num_bins,
-                      int num_projs,
-                      int fft_size)
-{
-    int gid0 = get_global_id(0);
-    int gid1 = get_global_id(1);
-    if (gid0 < num_bins && gid1 < num_projs) {
-        d_sino[gid1*num_bins+gid0] = d_sino_complex[gid1*fft_size+gid0].x;
-    }
-}
-
-
-
-
-
-
-
diff --git a/silx/resources/opencl/bitonic.cl b/silx/resources/opencl/bitonic.cl
deleted file mode 100644
index 4096ce8..0000000
--- a/silx/resources/opencl/bitonic.cl
+++ /dev/null
@@ -1,569 +0,0 @@
-/*############################################################################
-# Sort elements within a vector by Matthew Scarpino,
-# Taken from his book "OpenCL in Action",
-# November 2011 ISBN 9781617290176
-# Original license for the code: "public domain"
-#
-# Originally this code is public domain. The MIT license has been added
-# by J. Kieffer (jerome.kieffer@esrf.eu) to provide a disclaimer.
-# J. Kieffer does not claim authorship of this code developed by .
-#
-# Copyright (c) 2011 Matthew Scarpino
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-#
-#############################################################################*/
-
-#define VECTOR_SORT_BOOK(input, dir) \
-        comp = abs(input > shuffle(input, mask2)) ^ dir; \
-        input = shuffle(input, comp * 2 + add2); \
-        comp = abs(input > shuffle(input, mask1)) ^ dir; \
-        input = shuffle(input, comp + add1); \
-
-
-#define VECTOR_SWAP_BOOK(in1, in2, dir) \
-        input1 = in1; input2 = in2; \
-        comp = (abs(input1 > input2) ^ dir) * 4 + add3; \
-        in1 = shuffle2(input1, input2, comp); \
-        in2 = shuffle2(input2, input1, comp); \
-
-
-// The _FILE extension correspond to the formula found in the "OpenCL in Action" supplementary files
-#define VECTOR_SORT_FILE(input, dir)\
-        comp = (input < shuffle(input, mask2)) ^ dir;\
-        input = shuffle(input, as_uint4(comp * 2 + add2));\
-        comp = (input < shuffle(input, mask1)) ^ dir;\
-        input = shuffle(input, as_uint4(comp + add1));\
-
-
-#define VECTOR_SWAP_FILE(input1, input2, dir)\
-        temp = input1;\
-        comp = ((input1 < input2) ^ dir) * 4 + add3;\
-        input1 = shuffle2(input1, input2, as_uint4(comp));\
-        input2 = shuffle2(input2, temp, as_uint4(comp));\
-
-
-
-// Functions to be called from an actual kernel.
-
-static float8 my_sort_file(uint local_id, uint group_id, uint local_size,
-                           float8 input, local float4 *l_data)
-{
-    float4 input1, input2, temp;
-    float8 output;
-
-    int dir;
-    uint id, size, stride;
-    int4 comp;
-
-    uint4 mask1 = (uint4)(1, 0, 3, 2);
-    uint4 mask2 = (uint4)(2, 3, 0, 1);
-    uint4 mask3 = (uint4)(3, 2, 1, 0);
-
-    int4 add1 = (int4)(1, 1, 3, 3);
-    int4 add2 = (int4)(2, 3, 2, 3);
-    int4 add3 = (int4)(1, 2, 2, 3);
-
-    // retrieve input data
-    input1 = (float4)(input.s0, input.s1, input.s2, input.s3);
-    input2 = (float4)(input.s4, input.s5, input.s6, input.s7);
-
-    // Find global address
-    id = local_id * 2;
-
-    /* Sort input 1 - ascending */
-    comp = input1 < shuffle(input1, mask1);
-    input1 = shuffle(input1, as_uint4(comp + add1));
-    comp = input1 < shuffle(input1, mask2);
-    input1 = shuffle(input1, as_uint4(comp * 2 + add2));
-    comp = input1 < shuffle(input1, mask3);
-    input1 = shuffle(input1, as_uint4(comp + add3));
-
-    /* Sort input 2 - descending */
-    comp = input2 > shuffle(input2, mask1);
-    input2 = shuffle(input2, as_uint4(comp + add1));
-    comp = input2 > shuffle(input2, mask2);
-    input2 = shuffle(input2, as_uint4(comp * 2 + add2));
-    comp = input2 > shuffle(input2, mask3);
-    input2 = shuffle(input2, as_uint4(comp + add3));
-
-    /* Swap corresponding elements of input 1 and 2 */
-    add3 = (int4)(4, 5, 6, 7);
-    dir = - (int) (local_id % 2);
-    temp = input1;
-    comp = ((input1 < input2) ^ dir) * 4 + add3;
-    input1 = shuffle2(input1, input2, as_uint4(comp));
-    input2 = shuffle2(input2, temp, as_uint4(comp));
-
-    /* Sort data and store in local memory */
-    VECTOR_SORT_FILE(input1, dir);
-    VECTOR_SORT_FILE(input2, dir);
-    l_data[id] = input1;
-    l_data[id+1] = input2;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    
-    /* Create bitonic set */
-    for(size = 2; size < local_size; size <<= 1) {
-      dir = - (int) (local_id/size & 1) ;
-
-      for(stride = size; stride > 1; stride >>= 1) {
-         barrier(CLK_LOCAL_MEM_FENCE);
-         id = local_id + (local_id/stride)*stride;
-         VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir)
-      }
-
-      barrier(CLK_LOCAL_MEM_FENCE);
-      id = local_id * 2;
-      input1 = l_data[id];
-      input2 = l_data[id+1];
-      temp = input1;
-      comp = ((input1 < input2) ^ dir) * 4 + add3;
-      input1 = shuffle2(input1, input2, as_uint4(comp));
-      input2 = shuffle2(input2, temp, as_uint4(comp));
-      VECTOR_SORT_FILE(input1, dir);
-      VECTOR_SORT_FILE(input2, dir);
-      l_data[id] = input1;
-      l_data[id+1] = input2;
-      barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    /* Perform bitonic merge */
-    dir = - (int) (group_id % 2);
-    for(stride = local_size; stride > 1; stride >>= 1) 
-    {
-      barrier(CLK_LOCAL_MEM_FENCE);
-      id = local_id + (local_id/stride)*stride;
-      VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir)
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    /* Perform final sort */
-    id = local_id * 2;
-    input1 = l_data[id]; 
-    input2 = l_data[id+1];
-    temp = input1;
-    comp = ((input1 < input2) ^ dir) * 4 + add3;
-    input1 = shuffle2(input1, input2, as_uint4(comp));
-    input2 = shuffle2(input2, temp, as_uint4(comp));
-    VECTOR_SORT_FILE(input1, dir);
-    VECTOR_SORT_FILE(input2, dir);
-
-    // setup output and return it
-    output = (float8)(input1, input2);
-    return  output;
-}
-
-static float8 my_sort_book(uint local_id, uint group_id, uint local_size,
-                           float8 input, local float4 *l_data)
-{
-    float4 input1, input2, temp;
-    float8 output;
-    uint4 comp, swap, mask1, mask2, add1, add2, add3;
-    uint id, dir, size, stride;
-    mask1 = (uint4)(1, 0, 3, 2);
-    swap = (uint4)(0, 0, 1, 1);
-    add1 = (uint4)(0, 0, 2, 2);
-    mask2 = (uint4)(2, 3, 0, 1);
-    add2 = (uint4)(0, 1, 0, 1);
-    add3 = (uint4)(0, 1, 2, 3);
-
-    // retrieve input data
-    input1 = (float4)(input.s0, input.s1, input.s2, input.s3);
-    input2 = (float4)(input.s4, input.s5, input.s6, input.s7);
-
-    // Find global address
-    id = local_id * 2;
-
-    //Sort first vector
-
-    comp = abs(input1 > shuffle(input1, mask1));
-    input1 = shuffle(input1, comp ^ swap + add1);
-    comp = abs(input1 > shuffle(input1, mask2));
-    input1 = shuffle(input1, comp * 2 + add2);
-    comp = abs(input1 > shuffle(input1, mask1));
-    input1 = shuffle(input1, comp + add1);
-
-    //Sort second vector
-    comp = abs(input2 < shuffle(input2, mask1));
-    input2 = shuffle(input2, comp ^ swap + add1);
-    comp = abs(input2 < shuffle(input2, mask2));
-    input2 = shuffle(input2, comp * 2 + add2);
-    comp = abs(input2 < shuffle(input2, mask1));
-    input2 = shuffle(input2, comp + add1);
-
-    // Swap elements
-    dir = local_id % 2;
-    temp = input1;
-    comp = (abs(input1 > input2) ^ dir) * 4 + add3;
-    input1 = shuffle2(input1, input2, comp);
-    input2 = shuffle2(input2, temp, comp);
-    VECTOR_SORT_BOOK(input1, dir);
-    VECTOR_SORT_BOOK(input2, dir);
-    l_data[id] = input1;
-    l_data[id+1] = input2;
-
-    // Perform upper stages
-    for(size = 2; size < local_size;    size <<= 1) 
-    {
-        dir = local_id/size & 1;
-
-        //Perform    lower stages
-        for(stride = size; stride > 1; stride >>= 1) 
-        {
-            barrier(CLK_LOCAL_MEM_FENCE);
-            id = local_id +    (local_id/stride)*stride;
-            VECTOR_SWAP_BOOK(l_data[id],    l_data[id + stride], dir)
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        id = local_id * 2;
-        input1 = l_data[id];
-        input2 = l_data[id+1];
-        temp = input1;
-        comp = (abs(input1 > input2) ^ dir) * 4 + add3;
-        input1 = shuffle2(input1, input2, comp);
-        input2 = shuffle2(input2, temp, comp);
-        VECTOR_SORT_BOOK(input1, dir);
-        VECTOR_SORT_BOOK(input2, dir);
-        l_data[id] = input1;
-        l_data[id+1] = input2;
-    }
-    dir = group_id % 2;
-
-    // Perform bitonic merge
-    for(stride = local_size; stride > 1; stride >>= 1) 
-    {
-        barrier(CLK_LOCAL_MEM_FENCE);
-        id = local_id +    (local_id/stride)*stride;
-        VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir)
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    id = local_id * 2;
-    input1 = l_data[id]; input2 = l_data[id+1];
-    temp = input1;
-    comp = (abs(input1 > input2) ^ dir) * 4 + add3;
-    input1 = shuffle2(input1, input2, comp);
-    input2 = shuffle2(input2, temp, comp);
-    VECTOR_SORT_BOOK(input1, dir);
-    VECTOR_SORT_BOOK(input2, dir);
-
-    // setup output and return it
-    output = (float8)(input1, input2);
-    return  output;
-}
-
-
-
-//////////////
-// Kernels
-//////////////
-
-// Perform the sort on the whole array
-// dim0: wg=number_of_element/8
-
-kernel void bsort_all(global float4 *g_data,
-                      local float4 *l_data)
-{
-    float4 input1, input2;
-    float8 input, output;
-    uint id, global_start;
-    // Find global address
-    id = get_local_id(0) * 2;
-    global_start = get_group_id(0) * get_local_size(0) * 2 + id;
-
-    input1 = g_data[global_start];
-    input2 = g_data[global_start+1];
-    input = (float8)(input1, input2);
-    output = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0),
-                          input, l_data);
-    input1 = (float4) (output.s0, output.s1, output.s2, output.s3);
-    input2 = (float4) (output.s4, output.s5, output.s6, output.s7);
-    g_data[global_start] = input1;
-    g_data[global_start+1] = input2;
-}
-
-
-// Perform the sort along the horizontal axis of a 2D image
-// dim0 = y: wg=1
-// dim1 = x: wg=number_of_element/8
-kernel void bsort_horizontal(global float *g_data,
-                             local float4 *l_data) 
-{
-    float8 input, output;
-    uint id, global_start, offset;
-
-    // Find global address
-    offset = get_global_size(1)*get_global_id(0)*8;
-    id = get_local_id(1) * 8;
-    global_start = offset + get_group_id(1) * get_local_size(1) * 8 + id;
-
-    input = (float8)(g_data[global_start    ],
-                     g_data[global_start + 1],
-                     g_data[global_start + 2],
-                     g_data[global_start + 3],
-                     g_data[global_start + 4],
-                     g_data[global_start + 5],
-                     g_data[global_start + 6],
-                     g_data[global_start + 7]);
-
-    output = my_sort_file(get_local_id(1), get_group_id(1), get_local_size(1),
-                          input, l_data);
-
-    g_data[global_start    ] = output.s0;
-    g_data[global_start + 1] = output.s1;
-    g_data[global_start + 2] = output.s2;
-    g_data[global_start + 3] = output.s3;
-    g_data[global_start + 4] = output.s4;
-    g_data[global_start + 5] = output.s5;
-    g_data[global_start + 6] = output.s6;
-    g_data[global_start + 7] = output.s7;
-}
-
-
-// Perform the sort along the vertical axis
-// dim0 = y: wg=number_of_element/8
-// dim1 = x: wg=1
-// check if transposing +bsort_horizontal is not more efficient ?
-
-kernel void bsort_vertical(global float *g_data,
-                           local float4 *l_data) 
-{
-    // we need to read 8 float position along the vertical axis
-    float8 input, output;
-    uint id, global_start, padding;
-
-    // Find global address
-    padding = get_global_size(1);
-    id = get_local_id(0) * 8 * padding + get_global_id(1);
-    global_start = get_group_id(0) * get_local_size(0) * 8 * padding + id;
-
-    input = (float8)(g_data[global_start            ],
-                     g_data[global_start + padding  ],
-                     g_data[global_start + 2*padding],
-                     g_data[global_start + 3*padding],
-                     g_data[global_start + 4*padding],
-                     g_data[global_start + 5*padding],
-                     g_data[global_start + 6*padding],
-                     g_data[global_start + 7*padding]);
-
-      output = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0),
-                            input, l_data);
-      g_data[global_start             ] = output.s0;
-      g_data[global_start + padding   ] = output.s1;
-      g_data[global_start + 2*padding ] = output.s2;
-      g_data[global_start + 3*padding ] = output.s3;
-      g_data[global_start + 4*padding ] = output.s4;
-      g_data[global_start + 5*padding ] = output.s5;
-      g_data[global_start + 6*padding ] = output.s6;
-      g_data[global_start + 7*padding ] = output.s7;
-}
-
-
-//Tested working reference kernel frm the book. This only works under Linux
-kernel void bsort_book(global float4 *g_data,
-                         local float4 *l_data) {
-    float4 input1, input2, temp;
-    uint4 comp, swap, mask1, mask2, add1, add2, add3;
-    uint id, dir, global_start, size, stride;
-    mask1 = (uint4)(1, 0, 3, 2);
-    swap = (uint4)(0, 0, 1, 1);
-    add1 = (uint4)(0, 0, 2, 2);
-    mask2 = (uint4)(2, 3, 0, 1);
-    add2 = (uint4)(0, 1, 0, 1);
-    add3 = (uint4)(0, 1, 2, 3);
-
-    // Find global address
-    id = get_local_id(0) * 2;
-    global_start = get_group_id(0) * get_local_size(0) * 2 + id;
-
-    //Sort first vector
-    input1 = g_data[global_start];
-    input2 = g_data[global_start+1];
-    comp = abs(input1 > shuffle(input1, mask1));
-    input1 = shuffle(input1, comp ^ swap + add1);
-    comp = abs(input1 > shuffle(input1, mask2));
-    input1 = shuffle(input1, comp * 2 + add2);
-    comp = abs(input1 > shuffle(input1, mask1));
-    input1 = shuffle(input1, comp + add1);
-
-    //Sort second vector
-    comp = abs(input2 < shuffle(input2, mask1));
-    input2 = shuffle(input2, comp ^ swap + add1);
-    comp = abs(input2 < shuffle(input2, mask2));
-    input2 = shuffle(input2, comp * 2 + add2);
-    comp = abs(input2 < shuffle(input2, mask1));
-    input2 = shuffle(input2, comp + add1);
-
-    // Swap elements
-    dir = get_local_id(0) % 2;
-    temp = input1;
-    comp = (abs(input1 > input2) ^ dir) * 4 + add3;
-    input1 = shuffle2(input1, input2, comp);
-    input2 = shuffle2(input2, temp, comp);
-    VECTOR_SORT_BOOK(input1, dir);
-    VECTOR_SORT_BOOK(input2, dir);
-    l_data[id] = input1;
-    l_data[id+1] = input2;
-
-    // Perform upper stages
-    for(size = 2; size < get_local_size(0);    size <<= 1) {
-        dir = get_local_id(0)/size & 1;
-
-        //Perform    lower stages
-        for(stride = size; stride > 1; stride >>= 1) {
-            barrier(CLK_LOCAL_MEM_FENCE);
-            id = get_local_id(0) +
-            (get_local_id(0)/stride)*stride;
-            VECTOR_SWAP_BOOK(l_data[id],    l_data[id + stride], dir)
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        id = get_local_id(0) * 2;
-        input1 = l_data[id];
-        input2 = l_data[id+1];
-        temp = input1;
-        comp = (abs(input1 > input2) ^ dir) * 4 + add3;
-        input1 = shuffle2(input1, input2, comp);
-        input2 = shuffle2(input2, temp, comp);
-        VECTOR_SORT_BOOK(input1, dir);
-        VECTOR_SORT_BOOK(input2, dir);
-        l_data[id] = input1;
-        l_data[id+1] = input2;
-    }
-    dir = get_group_id(0) % 2;
-    // Perform bitonic merge
-    for(stride = get_local_size(0); stride > 1;    stride >>= 1) {
-        barrier(CLK_LOCAL_MEM_FENCE);
-        id = get_local_id(0) +
-        (get_local_id(0)/stride)*stride;
-        VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir)
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    id = get_local_id(0) * 2;
-    input1 = l_data[id]; input2 = l_data[id+1];
-    temp = input1;
-    comp = (abs(input1 > input2) ^ dir) * 4 + add3;
-    input1 = shuffle2(input1, input2, comp);
-    input2 = shuffle2(input2, temp, comp);
-    VECTOR_SORT_BOOK(input1, dir);
-    VECTOR_SORT_BOOK(input2, dir);
-    g_data[global_start] = input1;
-    g_data[global_start+1] = input2;
-    }
-
-//Tested working reference kernel from the addition files. This only works under any operating system
-/* Perform initial sort */
-kernel void bsort_file(global float4 *g_data, local float4 *l_data) {
-
-   int dir;
-   uint id, global_start, size, stride;
-   float4 input1, input2, temp;
-   int4 comp;
-
-   uint4 mask1 = (uint4)(1, 0, 3, 2);
-   uint4 mask2 = (uint4)(2, 3, 0, 1);
-   uint4 mask3 = (uint4)(3, 2, 1, 0);
-
-   int4 add1 = (int4)(1, 1, 3, 3);
-   int4 add2 = (int4)(2, 3, 2, 3);
-   int4 add3 = (int4)(1, 2, 2, 3);
-
-   id = get_local_id(0) * 2;
-   global_start = get_group_id(0) * get_local_size(0) * 2 + id;
-
-   input1 = g_data[global_start];
-   input2 = g_data[global_start+1];
-
-   /* Sort input 1 - ascending */
-   comp = input1 < shuffle(input1, mask1);
-   input1 = shuffle(input1, as_uint4(comp + add1));
-   comp = input1 < shuffle(input1, mask2);
-   input1 = shuffle(input1, as_uint4(comp * 2 + add2));
-   comp = input1 < shuffle(input1, mask3);
-   input1 = shuffle(input1, as_uint4(comp + add3));
-
-   /* Sort input 2 - descending */
-   comp = input2 > shuffle(input2, mask1);
-   input2 = shuffle(input2, as_uint4(comp + add1));
-   comp = input2 > shuffle(input2, mask2);
-   input2 = shuffle(input2, as_uint4(comp * 2 + add2));
-   comp = input2 > shuffle(input2, mask3);
-   input2 = shuffle(input2, as_uint4(comp + add3));
-
-   /* Swap corresponding elements of input 1 and 2 */
-   add3 = (int4)(4, 5, 6, 7);
-   dir = - (int)(get_local_id(0) % 2);
-   temp = input1;
-   comp = ((input1 < input2) ^ dir) * 4 + add3;
-   input1 = shuffle2(input1, input2, as_uint4(comp));
-   input2 = shuffle2(input2, temp, as_uint4(comp));
-
-   /* Sort data and store in local memory */
-   VECTOR_SORT_FILE(input1, dir);
-   VECTOR_SORT_FILE(input2, dir);
-   l_data[id] = input1;
-   l_data[id+1] = input2;
-
-   /* Create bitonic set */
-   for(size = 2; size < get_local_size(0); size <<= 1) {
-      dir = - (int)(get_local_id(0)/size & 1);
-
-      for(stride = size; stride > 1; stride >>= 1) {
-         barrier(CLK_LOCAL_MEM_FENCE);
-         id = get_local_id(0) + (get_local_id(0)/stride)*stride;
-         VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir)
-      }
-
-      barrier(CLK_LOCAL_MEM_FENCE);
-      id = get_local_id(0) * 2;
-      input1 = l_data[id]; input2 = l_data[id+1];
-      temp = input1;
-      comp = ((input1 < input2) ^ dir) * 4 + add3;
-      input1 = shuffle2(input1, input2, as_uint4(comp));
-      input2 = shuffle2(input2, temp, as_uint4(comp));
-      VECTOR_SORT_FILE(input1, dir);
-      VECTOR_SORT_FILE(input2, dir);
-      l_data[id] = input1;
-      l_data[id+1] = input2;
-   }
-
-   /* Perform bitonic merge */
-   dir = - (int)(get_group_id(0) % 2);
-   for(stride = get_local_size(0); stride > 1; stride >>= 1) {
-      barrier(CLK_LOCAL_MEM_FENCE);
-      id = get_local_id(0) + (get_local_id(0)/stride)*stride;
-      VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir)
-   }
-   barrier(CLK_LOCAL_MEM_FENCE);
-
-   /* Perform final sort */
-   id = get_local_id(0) * 2;
-   input1 = l_data[id]; input2 = l_data[id+1];
-   temp = input1;
-   comp = ((input1 < input2) ^ dir) * 4 + add3;
-   input1 = shuffle2(input1, input2, as_uint4(comp));
-   input2 = shuffle2(input2, temp, as_uint4(comp));
-   VECTOR_SORT_FILE(input1, dir);
-   VECTOR_SORT_FILE(input2, dir);
-   g_data[global_start] = input1;
-   g_data[global_start+1] = input2;
-}
-
diff --git a/silx/resources/opencl/codec/byte_offset.cl b/silx/resources/opencl/codec/byte_offset.cl
deleted file mode 100644
index 56a24c4..0000000
--- a/silx/resources/opencl/codec/byte_offset.cl
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
- *   Project: SILX: A data analysis tool-kit
- *
- *   Copyright (C) 2017 European Synchrotron Radiation Facility
- *                           Grenoble, France
- *
- *   Principal authors: J. Kieffer (kieffer@esrf.fr)
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/* To decompress CBF byte-offset compressed in parallel on GPU one needs to:
- * - Set all values in mask and exception counter to zero.
- * - Mark regions with exceptions and set values without exception.
- *   This generates the values (zeros for exceptions), the exception mask,
- *   counts the number of exception region and provides a start position for
- *   each exception.
- * - Treat exceptions. For this, one thread in a workgoup treats a complete
- *   masked region in a serial fashion. All regions are treated in parallel.
- *   Values written at this stage are marked in the mask with -1.
- * - Double scan: inclusive cum sum for values, exclusive cum sum to generate
- *   indices in output array. Values with mask = 1 are considered as 0.
- * - Compact and copy output by removing duplicated values in exceptions.
- */
-
-kernel void mark_exceptions(global char* raw,
-                            int size,
-                            int full_size,
-                            global int* mask,
-                            global int* values,
-                            global int* cnt,
-                            global int* exc)
-{
-    int gid;
-    gid = get_global_id(0);
-    if (gid<size)
-    {
-        int value, position;
-        value = raw[gid];
-        if (value == -128)
-        {
-            int maxi;
-            values[gid] = 0;
-            position = atomic_inc(cnt);
-            exc[position] = gid;
-            maxi = size - 1;
-            mask[gid] = 1;
-            mask[min(maxi, gid+1)] = 1;
-            mask[min(maxi, gid+2)] = 1;
-
-            if (((int) raw[min(gid+1, maxi)] == 0) &&
-                ((int) raw[min(gid+2, maxi)] == -128))
-            {
-                mask[min(maxi, gid+3)] = 1;
-                mask[min(maxi, gid+4)] = 1;
-                mask[min(maxi, gid+5)] = 1;
-                mask[min(maxi, gid+6)] = 1;
-            }
-        }
-        else
-        { // treat simple data
-
-            values[gid] = value;
-        }
-    }
-    else if (gid<full_size)
-    {
-        mask[gid]=1;
-        values[gid] = 0;
-    }
-}
-
-//run with WG=1, as may as exceptions
-kernel void treat_exceptions(global char* raw,  //raw compressed stream
-                             int size,          //size of the raw compressed stream
-                             global int* mask,  //tells if the value is masked
-                             global int* exc,   //array storing the position of the start of exception zones
-                             global int* values)// stores decompressed values.
-{
-    int gid = get_global_id(0);
-    int inp_pos = exc[gid];
-    if ((inp_pos<=0) || ((int)mask[inp_pos - 1] == 0))
-    {
-        int value, is_masked, next_value, inc;
-        is_masked = (mask[inp_pos] != 0);
-        while ((is_masked) && (inp_pos<size))
-        {
-            value = (int) raw[inp_pos];
-            if (value == -128)
-            { // this correspond to 16 bits exception
-                uchar low_byte = raw[inp_pos+1];
-                char high_byte = raw[inp_pos+2] ;
-                next_value = high_byte<<8 | low_byte;
-                if (next_value == -32768)
-                { // this correspond to 32 bits exception
-                    uchar low_byte1 = raw[inp_pos+3],
-                          low_byte2 = raw[inp_pos+4],
-                          low_byte3 = raw[inp_pos+5];
-                    char high_byte4 = raw[inp_pos+6] ;
-                    value = high_byte4<<24 | low_byte3<<16 | low_byte2<<8 | low_byte1;
-                    inc = 7;
-                }
-                else
-                {
-                    value = next_value;
-                    inc = 3;
-                }
-            }
-            else
-            {
-                inc = 1;
-            }
-            values[inp_pos] = value;
-            mask[inp_pos] = -1; // mark the processed data as valid in the mask
-            inp_pos += inc;
-            is_masked = (mask[inp_pos] != 0);
-        }
-    }
-}
-
-// copy the values of the elements to definitive position
-kernel void copy_result_int(global int* values,
-                            global int* indexes,
-                            int in_size,
-                            int out_size,
-                            global int* output
-                            )
-{
-    int gid = get_global_id(0);
-    if (gid < in_size)
-    {
-        int current = max(indexes[gid], 0),
-               next = (gid >= (in_size - 1)) ? in_size + 1 : indexes[gid + 1];
-        //we keep always the last element
-        if ((current <= out_size) && (current < next))
-        {
-            output[current] = values[gid];
-        }
-    }
-}
-
-// copy the values of the elements to definitive position
-kernel void copy_result_float(global int* values,
-                              global int* indexes,
-                              int in_size,
-                              int out_size,
-                              global float* output
-                              )
-{
-    int gid = get_global_id(0);
-    if (gid<in_size)
-    {
-        int current = max(indexes[gid], 0),
-               next = (gid >= (in_size - 1)) ? in_size + 1 : indexes[gid + 1];
-        if ((current < out_size) && (current < next))
-        {
-            output[current] = (float) values[gid];
-        }
-    }
-}
-
-
-// combined memset for all arrays used for Byte Offset decompression
-kernel void byte_offset_memset(global char* raw,
-                               global int* mask,
-                               global int* index,
-                               global int* result,
-                               int full_size,
-                               int actual_size
-                              )
-{
-    int gid = get_global_id(0);
-    if (gid < full_size)
-    {
-        raw[gid] = 0;
-        index[gid] = 0;
-        result[gid] = 0;
-        if (gid<actual_size)
-        {
-            mask[gid] = 0;
-        }
-        else
-        {
-            mask[gid] = 1;
-        }
-
-    }
-}
-
-
-//Simple memset kernel for char arrays
-kernel void fill_char_mem(global char* ary,
-                          int size,
-                          char pattern,
-                          int start_at)
-{
-    int gid = get_global_id(0);
-    if ((gid >= start_at) && (gid < size))
-    {
-        ary[gid] = pattern;
-    }
-}
-
-//Simple memset kernel for int arrays
-kernel void fill_int_mem(global int* ary,
-                         int size,
-                         int pattern,
-                         int start_at)
-{
-    int gid = get_global_id(0);
-    if ((gid >= start_at) && (gid < size))
-    {
-        ary[gid] = pattern;
-    }
-}
-
diff --git a/silx/resources/opencl/convolution.cl b/silx/resources/opencl/convolution.cl
deleted file mode 100644
index 629b8fc..0000000
--- a/silx/resources/opencl/convolution.cl
+++ /dev/null
@@ -1,312 +0,0 @@
-#define MAX_CONST_SIZE 16384
-
-/******************************************************************************/
-/**************************** Macros ******************************************/
-/******************************************************************************/
-
-// Get the center index of the filter,
-// and the "half-Left" and "half-Right" lengths.
-// In the case of an even-sized filter, the center is shifted to the left.
-#define GET_CENTER_HL(hlen){\
-        if (hlen & 1) {\
-            c = hlen/2;\
-            hL = c;\
-            hR = c;\
-        }\
-        else {\
-            c = hlen/2 - 1;\
-            hL = c;\
-            hR = c+1;\
-        }\
-}\
-
-// Boundary handling modes
-#define CONV_MODE_REFLECT 0 // cba|abcd|dcb
-#define CONV_MODE_NEAREST 1 // aaa|abcd|ddd
-#define CONV_MODE_WRAP 2 // bcd|abcd|abc
-#define CONV_MODE_CONSTANT 3 // 000|abcd|000
-#ifndef USED_CONV_MODE
-    #define USED_CONV_MODE CONV_MODE_NEAREST
-#endif
-
-#define CONV_PERIODIC_IDX_X int idx_x = gidx - c + jx; if (idx_x < 0) idx_x += Nx; if (idx_x >= Nx) idx_x -= Nx;
-#define CONV_PERIODIC_IDX_Y int idx_y = gidy - c + jy; if (idx_y < 0) idx_y += Ny; if (idx_y >= Ny) idx_y -= Ny;
-#define CONV_PERIODIC_IDX_Z int idx_z = gidz - c + jz; if (idx_z < 0) idx_z += Nz; if (idx_z >= Nz) idx_z -= Nz;
-
-#define CONV_NEAREST_IDX_X int idx_x = clamp((int) (gidx - c + jx), 0, Nx-1);
-#define CONV_NEAREST_IDX_Y int idx_y = clamp((int) (gidy - c + jy), 0, Ny-1);
-#define CONV_NEAREST_IDX_Z int idx_z = clamp((int) (gidz - c + jz), 0, Nz-1);
-
-#define CONV_REFLECT_IDX_X int idx_x = gidx - c + jx; if (idx_x < 0) idx_x = -idx_x-1; if (idx_x >= Nx) idx_x = Nx-(idx_x-(Nx-1));
-#define CONV_REFLECT_IDX_Y int idx_y = gidy - c + jy; if (idx_y < 0) idx_y = -idx_y-1; if (idx_y >= Ny) idx_y = Ny-(idx_y-(Ny-1));
-#define CONV_REFLECT_IDX_Z int idx_z = gidz - c + jz; if (idx_z < 0) idx_z = -idx_z-1; if (idx_z >= Nz) idx_z = Nz-(idx_z-(Nz-1));
-
-
-#if USED_CONV_MODE == CONV_MODE_REFLECT
-    #define CONV_IDX_X CONV_REFLECT_IDX_X
-    #define CONV_IDX_Y CONV_REFLECT_IDX_Y
-    #define CONV_IDX_Z CONV_REFLECT_IDX_Z
-#elif USED_CONV_MODE == CONV_MODE_NEAREST
-    #define CONV_IDX_X CONV_NEAREST_IDX_X
-    #define CONV_IDX_Y CONV_NEAREST_IDX_Y
-    #define CONV_IDX_Z CONV_NEAREST_IDX_Z
-#elif USED_CONV_MODE == CONV_MODE_WRAP
-    #define CONV_IDX_X CONV_PERIODIC_IDX_X
-    #define CONV_IDX_Y CONV_PERIODIC_IDX_Y
-    #define CONV_IDX_Z CONV_PERIODIC_IDX_Z
-#elif USED_CONV_MODE == CONV_MODE_CONSTANT
-    #error "constant not implemented yet"
-#else
-    #error "Unknown convolution mode"
-#endif
-
-
-
-// Image access patterns
-#define READ_IMAGE_1D_X input[(gidz*Ny + gidy)*Nx + idx_x]
-#define READ_IMAGE_1D_Y input[(gidz*Ny + idx_y)*Nx + gidx]
-#define READ_IMAGE_1D_Z input[(idx_z*Ny + gidy)*Nx + gidx]
-
-#define READ_IMAGE_2D_XY input[(gidz*Ny + idx_y)*Nx + idx_x]
-#define READ_IMAGE_2D_XZ input[(idx_z*Ny + gidy)*Nx + idx_x]
-#define READ_IMAGE_2D_YZ input[(idx_z*Ny + idx_y)*Nx + gidx]
-
-#define READ_IMAGE_3D_XYZ input[(idx_z*Ny + idx_y)*Nx + idx_x]
-
-
-
-/******************************************************************************/
-/**************************** 1D Convolution **********************************/
-/******************************************************************************/
-
-
-// Convolution with 1D kernel along axis "X" (fast dimension)
-// Works for batched 1D on 2D and batched 2D on 3D, along axis "X".
-__kernel void convol_1D_X(
-    const __global float * input,
-    __global float * output,
-    __global float * filter,
-    int L, // filter size
-    int Nx, // input/output number of columns
-    int Ny, // input/output number of rows
-    int Nz  // input/output depth
-)
-{
-    uint gidx = get_global_id(0);
-    uint gidy = get_global_id(1);
-    uint gidz = get_global_id(2);
-    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
-    int c, hL, hR;
-    GET_CENTER_HL(L);
-    float sum = 0.0f;
-
-    for (int jx = 0; jx <= hR+hL; jx++) {
-        CONV_IDX_X; // Get index "x"
-        sum += READ_IMAGE_1D_X * filter[L-1 - jx];
-    }
-    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-
-
-// Convolution with 1D kernel along axis "Y"
-// Works for batched 1D on 2D and batched 2D on 3D, along axis "Y".
-__kernel void convol_1D_Y(
-    const __global float * input,
-    __global float * output,
-    __global float * filter,
-    int L, // filter size
-    int Nx, // input/output number of columns
-    int Ny, // input/output number of rows
-    int Nz  // input/output depth
-)
-{
-    uint gidx = get_global_id(0);
-    uint gidy = get_global_id(1);
-    uint gidz = get_global_id(2);
-    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
-    int c, hL, hR;
-    GET_CENTER_HL(L);
-    float sum = 0.0f;
-
-    for (int jy = 0; jy <= hR+hL; jy++) {
-        CONV_IDX_Y; // Get index "y"
-        sum += READ_IMAGE_1D_Y * filter[L-1 - jy];
-    }
-    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-
-
-// Convolution with 1D kernel along axis "Z"
-// Works for batched 1D on 2D and batched 2D on 3D, along axis "Z".
-__kernel void convol_1D_Z(
-    const __global float * input,
-    __global float * output,
-    __global float * filter,
-    int L, // filter size
-    int Nx, // input/output number of columns
-    int Ny, // input/output number of rows
-    int Nz  // input/output depth
-)
-{
-    uint gidx = get_global_id(0);
-    uint gidy = get_global_id(1);
-    uint gidz = get_global_id(2);
-    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
-    int c, hL, hR;
-    GET_CENTER_HL(L);
-    float sum = 0.0f;
-
-    for (int jz = 0; jz <= hR+hL; jz++) {
-        CONV_IDX_Z; // Get index "z"
-        sum += READ_IMAGE_1D_Z * filter[L-1 - jz];
-    }
-    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-
-
-/******************************************************************************/
-/**************************** 2D Convolution **********************************/
-/******************************************************************************/
-
-// Convolution with 2D kernel
-// Works for batched 2D on 3D.
-__kernel void convol_2D_XY(
-    const __global float * input,
-    __global float * output,
-    __global float * filter,
-    int Lx, // filter number of columns,
-    int Ly, // filter number of rows,
-    int Nx, // input/output number of columns
-    int Ny, // input/output number of rows
-    int Nz  // input/output depth
-)
-{
-    uint gidx = get_global_id(0);
-    uint gidy = get_global_id(1);
-    uint gidz = get_global_id(2);
-    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
-    int c, hL, hR;
-    GET_CENTER_HL(Lx);
-    float sum = 0.0f;
-
-    for (int jy = 0; jy <= hR+hL; jy++) {
-        CONV_IDX_Y; // Get index "y"
-        for (int jx = 0; jx <= hR+hL; jx++) {
-            CONV_IDX_X; // Get index "x"
-            sum += READ_IMAGE_2D_XY * filter[(Ly-1-jy)*Lx + (Lx-1 - jx)];
-        }
-    }
-    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-
-
-// Convolution with 2D kernel
-// Works for batched 2D on 3D.
-__kernel void convol_2D_XZ(
-    const __global float * input,
-    __global float * output,
-    __global float * filter,
-    int Lx, // filter number of columns,
-    int Lz, // filter number of rows,
-    int Nx, // input/output number of columns
-    int Ny, // input/output number of rows
-    int Nz  // input/output depth
-)
-{
-    uint gidx = get_global_id(0);
-    uint gidy = get_global_id(1);
-    uint gidz = get_global_id(2);
-    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
-    int c, hL, hR;
-    GET_CENTER_HL(Lx);
-    float sum = 0.0f;
-
-    for (int jz = 0; jz <= hR+hL; jz++) {
-        CONV_IDX_Z; // Get index "z"
-        for (int jx = 0; jx <= hR+hL; jx++) {
-            CONV_IDX_X; // Get index "x"
-            sum += READ_IMAGE_2D_XZ * filter[(Lz-1-jz)*Lx + (Lx-1 - jx)];
-        }
-    }
-    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-
-
-// Convolution with 2D kernel
-// Works for batched 2D on 3D.
-__kernel void convol_2D_YZ(
-    const __global float * input,
-    __global float * output,
-    __global float * filter,
-    int Ly, // filter number of columns,
-    int Lz, // filter number of rows,
-    int Nx, // input/output number of columns
-    int Ny, // input/output number of rows
-    int Nz  // input/output depth
-)
-{
-    uint gidx = get_global_id(0);
-    uint gidy = get_global_id(1);
-    uint gidz = get_global_id(2);
-    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
-    int c, hL, hR;
-    GET_CENTER_HL(Ly);
-    float sum = 0.0f;
-
-    for (int jz = 0; jz <= hR+hL; jz++) {
-        CONV_IDX_Z; // Get index "z"
-        for (int jy = 0; jy <= hR+hL; jy++) {
-            CONV_IDX_Y; // Get index "y"
-            sum += READ_IMAGE_2D_YZ * filter[(Lz-1-jz)*Ly + (Ly-1 - jy)];
-        }
-    }
-    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-
-
-
-/******************************************************************************/
-/**************************** 3D Convolution **********************************/
-/******************************************************************************/
-
-// Convolution with 3D kernel
-__kernel void convol_3D_XYZ(
-    const __global float * input,
-    __global float * output,
-    __global float * filter,
-    int Lx, // filter number of columns,
-    int Ly, // filter number of rows,
-    int Lz, // filter number of rows,
-    int Nx, // input/output number of columns
-    int Ny, // input/output number of rows
-    int Nz  // input/output depth
-)
-{
-    uint gidx = get_global_id(0);
-    uint gidy = get_global_id(1);
-    uint gidz = get_global_id(2);
-    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
-    int c, hL, hR;
-    GET_CENTER_HL(Lx);
-    float sum = 0.0f;
-
-    for (int jz = 0; jz <= hR+hL; jz++) {
-        CONV_IDX_Z; // Get index "z"
-        for (int jy = 0; jy <= hR+hL; jy++) {
-            CONV_IDX_Y; // Get index "y"
-            for (int jx = 0; jx <= hR+hL; jx++) {
-                CONV_IDX_X; // Get index "x"
-                sum += READ_IMAGE_3D_XYZ * filter[((Lz-1-jz)*Ly + (Ly-1-jy))*Lx + (Lx-1 - jx)];
-            }
-        }
-    }
-    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-
diff --git a/silx/resources/opencl/convolution_textures.cl b/silx/resources/opencl/convolution_textures.cl
deleted file mode 100644
index 517a67c..0000000
--- a/silx/resources/opencl/convolution_textures.cl
+++ /dev/null
@@ -1,374 +0,0 @@
-/******************************************************************************/
-/**************************** Macros ******************************************/
-/******************************************************************************/
-
-// Error handling
-#ifndef IMAGE_DIMS
-    #error "IMAGE_DIMS must be defined"
-#endif
-#ifndef FILTER_DIMS
-    #error "FILTER_DIMS must be defined"
-#endif
-#if FILTER_DIMS > IMAGE_DIMS
-    #error "Filter cannot have more dimensions than image"
-#endif
-
-// Boundary handling modes
-#define CONV_MODE_REFLECT 0 // CLK_ADDRESS_MIRRORED_REPEAT : cba|abcd|dcb
-#define CONV_MODE_NEAREST 1 // CLK_ADDRESS_CLAMP_TO_EDGE : aaa|abcd|ddd
-#define CONV_MODE_WRAP 2 // CLK_ADDRESS_REPEAT : bcd|abcd|abc
-#define CONV_MODE_CONSTANT 3 // CLK_ADDRESS_CLAMP : 000|abcd|000
-#ifndef USED_CONV_MODE
-    #define USED_CONV_MODE CONV_MODE_NEAREST
-#endif
-#if USED_CONV_MODE == CONV_MODE_REFLECT
-    #define CLK_BOUNDARY CLK_ADDRESS_MIRRORED_REPEAT
-    #define CLK_COORDS CLK_NORMALIZED_COORDS_TRUE
-    #define USE_NORM_COORDS
-#elif USED_CONV_MODE == CONV_MODE_NEAREST
-    #define CLK_BOUNDARY CLK_ADDRESS_CLAMP_TO_EDGE
-    #define CLK_COORDS CLK_NORMALIZED_COORDS_FALSE
-#elif USED_CONV_MODE == CONV_MODE_WRAP
-    #define CLK_BOUNDARY CLK_ADDRESS_REPEAT
-    #define CLK_COORDS CLK_NORMALIZED_COORDS_TRUE
-    #define USE_NORM_COORDS
-#elif USED_CONV_MODE == CONV_MODE_CONSTANT
-    #define CLK_BOUNDARY CLK_ADDRESS_CLAMP
-    #define CLK_COORDS CLK_NORMALIZED_COORDS_FALSE
-#else
-    #error "Unknown convolution mode"
-#endif
-
-
-
-// Convolution index for filter
-#define FILTER_INDEX(j) (Lx - 1 - j)
-
-// Filter access patterns
-#define READ_FILTER_1D(j) read_imagef(filter, (int2) (FILTER_INDEX(j), 0)).x;
-#define READ_FILTER_2D(jx, jy) read_imagef(filter, (int2) (FILTER_INDEX(jx), FILTER_INDEX(jy))).x;
-#define READ_FILTER_3D(jx, jy, jz) read_imagef(filter, (int4) (FILTER_INDEX(jx), FILTER_INDEX(jy), FILTER_INDEX(jz), 0)).x;
-
-
-// Convolution index for image
-#ifdef USE_NORM_COORDS
-    #define IMAGE_INDEX_X (gidx*1.0f +0.5f - c + jx)/Nx
-    #define IMAGE_INDEX_Y (gidy*1.0f +0.5f - c + jy)/Ny
-    #define IMAGE_INDEX_Z (gidz*1.0f +0.5f - c + jz)/Nz
-    #define RET_TYPE_1 float
-    #define RET_TYPE_2 float2
-    #define RET_TYPE_4 float4
-    #define C_ZERO 0.5f
-    #define GIDX (gidx*1.0f + 0.5f)/Nx
-    #define GIDY (gidy*1.0f + 0.5f)/Ny
-    #define GIDZ (gidz*1.0f + 0.5f)/Nz
-#else
-    #define IMAGE_INDEX_X (gidx - c + jx)
-    #define IMAGE_INDEX_Y (gidy - c + jy)
-    #define IMAGE_INDEX_Z (gidz - c + jz)
-    #define RET_TYPE_1 int
-    #define RET_TYPE_2 int2
-    #define RET_TYPE_4 int4
-    #define C_ZERO 0
-    #define GIDX gidx
-    #define GIDY gidy
-    #define GIDZ gidz
-#endif
-
-static const sampler_t sampler = CLK_COORDS | CLK_BOUNDARY | CLK_FILTER_NEAREST;
-
-// Image access patterns
-#define READ_IMAGE_1D read_imagef(input, sampler, (RET_TYPE_2) (IMAGE_INDEX_X, C_ZERO)).x
-
-#define READ_IMAGE_2D_X read_imagef(input, sampler, (RET_TYPE_2) (IMAGE_INDEX_X , GIDY)).x
-#define READ_IMAGE_2D_Y read_imagef(input, sampler, (RET_TYPE_2) (GIDX, IMAGE_INDEX_Y)).x
-#define READ_IMAGE_2D_XY read_imagef(input, sampler, (RET_TYPE_2) (IMAGE_INDEX_X, IMAGE_INDEX_Y)).x
-
-#define READ_IMAGE_3D_X read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, GIDY, GIDZ, C_ZERO)).x
-#define READ_IMAGE_3D_Y read_imagef(input, sampler, (RET_TYPE_4) (GIDX, IMAGE_INDEX_Y, GIDZ, C_ZERO)).x
-#define READ_IMAGE_3D_Z read_imagef(input, sampler, (RET_TYPE_4) (GIDX, GIDY, IMAGE_INDEX_Z, C_ZERO)).x
-#define READ_IMAGE_3D_XY read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, IMAGE_INDEX_Y, GIDZ, C_ZERO)).x
-#define READ_IMAGE_3D_XZ read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, GIDY, IMAGE_INDEX_Z, C_ZERO)).x
-#define READ_IMAGE_3D_YZ read_imagef(input, sampler, (RET_TYPE_4) (GIDX, IMAGE_INDEX_Y, IMAGE_INDEX_Z, C_ZERO)).x
-#define READ_IMAGE_3D_XYZ read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, IMAGE_INDEX_Y, IMAGE_INDEX_Z, C_ZERO)).x
-
-
-// NOTE: pyopencl and OpenCL < 1.2 do not support image1d_t
-#if FILTER_DIMS == 1
-    #define FILTER_TYPE image2d_t
-    #define READ_FILTER_VAL(j) READ_FILTER_1D(j)
-#elif FILTER_DIMS == 2
-    #define FILTER_TYPE image2d_t
-    #define READ_FILTER_VAL(jx, jy) READ_FILTER_2D(jx, jy)
-#elif FILTER_DIMS == 3
-    #define FILTER_TYPE image3d_t
-    #define READ_FILTER_VAL(jx, jy, jz) READ_FILTER_3D(jx, jy, jz)
-#endif
-
-#if IMAGE_DIMS == 1
-    #define IMAGE_TYPE image2d_t
-    #define READ_IMAGE_X READ_IMAGE_1D
-#elif IMAGE_DIMS == 2
-    #define IMAGE_TYPE image2d_t
-    #define READ_IMAGE_X READ_IMAGE_2D_X
-    #define READ_IMAGE_Y READ_IMAGE_2D_Y
-    #define READ_IMAGE_XY READ_IMAGE_2D_XY
-#elif IMAGE_DIMS == 3
-    #define IMAGE_TYPE image3d_t
-    #define READ_IMAGE_X READ_IMAGE_3D_X
-    #define READ_IMAGE_Y READ_IMAGE_3D_Y
-    #define READ_IMAGE_Z READ_IMAGE_3D_Z
-    #define READ_IMAGE_XY READ_IMAGE_3D_XY
-    #define READ_IMAGE_XZ READ_IMAGE_3D_XZ
-    #define READ_IMAGE_YZ READ_IMAGE_3D_YZ
-    #define READ_IMAGE_XYZ READ_IMAGE_3D_XYZ
-#endif
-
-
-// Get the center index of the filter,
-// and the "half-Left" and "half-Right" lengths.
-// In the case of an even-sized filter, the center is shifted to the left.
-#define GET_CENTER_HL(hlen){\
-        if (hlen & 1) {\
-            c = hlen/2;\
-            hL = c;\
-            hR = c;\
-        }\
-        else {\
-            c = hlen/2 - 1;\
-            hL = c;\
-            hR = c+1;\
-        }\
-}\
-
-
-
-/******************************************************************************/
-/**************************** 1D Convolution **********************************/
-/******************************************************************************/
-
-#if FILTER_DIMS == 1
-// Convolution with 1D kernel along axis "X" (fast dimension)
-// Works for batched 1D on 2D and batched 2D on 3D, along axis "X".
-__kernel void convol_1D_X_tex(
-    read_only IMAGE_TYPE input,
-    __global float * output,
-    read_only FILTER_TYPE filter,
-    int Lx, // filter size
-    int Nx, // input/output number of columns
-    int Ny, // input/output number of rows
-    int Nz  // input/output depth
-)
-{
-    uint gidx = get_global_id(0);
-    uint gidy = get_global_id(1);
-    uint gidz = get_global_id(2);
-    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
-    int c, hL, hR;
-    GET_CENTER_HL(Lx);
-    float sum = 0.0f;
-
-    for (int jx = 0; jx <= hR+hL; jx++) {
-        sum += READ_IMAGE_X * READ_FILTER_VAL(jx);
-    }
-    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-
-
-#if IMAGE_DIMS >= 2
-// Convolution with 1D kernel along axis "Y"
-// Works for batched 1D on 2D and batched 2D on 3D, along axis "Y".
-__kernel void convol_1D_Y_tex(
-    read_only IMAGE_TYPE input,
-    __global float * output,
-    read_only FILTER_TYPE filter,
-    int Lx, // filter size
-    int Nx, // input/output number of columns
-    int Ny, // input/output number of rows
-    int Nz  // input/output depth
-)
-{
-    uint gidx = get_global_id(0);
-    uint gidy = get_global_id(1);
-    uint gidz = get_global_id(2);
-    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
-    int c, hL, hR;
-    GET_CENTER_HL(Lx);
-    float sum = 0.0f;
-
-    for (int jy = 0; jy <= hR+hL; jy++) {
-        sum += READ_IMAGE_Y * READ_FILTER_VAL(jy);
-    }
-    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-#endif
-
-#if IMAGE_DIMS == 3
-// Convolution with 1D kernel along axis "Z"
-// Works for batched 1D on 2D and batched 2D on 3D, along axis "Z".
-__kernel void convol_1D_Z_tex(
-    read_only IMAGE_TYPE input,
-    __global float * output,
-    read_only FILTER_TYPE filter,
-    int Lx, // filter size
-    int Nx, // input/output number of columns
-    int Ny, // input/output number of rows
-    int Nz  // input/output depth
-)
-{
-    uint gidx = get_global_id(0);
-    uint gidy = get_global_id(1);
-    uint gidz = get_global_id(2);
-    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
-    int c, hL, hR;
-    GET_CENTER_HL(Lx);
-    float sum = 0.0f;
-
-    for (int jz = 0; jz <= hR+hL; jz++) {
-        sum += READ_IMAGE_Z * READ_FILTER_VAL(jz);
-    }
-    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-#endif
-#endif
-
-/******************************************************************************/
-/**************************** 2D Convolution **********************************/
-/******************************************************************************/
-
-#if IMAGE_DIMS >= 2 && FILTER_DIMS == 2
-// Convolution with 2D kernel
-// Works for batched 2D on 3D.
-__kernel void convol_2D_XY_tex(
-    read_only IMAGE_TYPE input,
-    __global float * output,
-    read_only FILTER_TYPE filter,
-    int Lx, // filter number of columns,
-    int Ly, // filter number of rows,
-    int Nx, // input/output number of columns
-    int Ny, // input/output number of rows
-    int Nz  // input/output depth
-)
-{
-    uint gidx = get_global_id(0);
-    uint gidy = get_global_id(1);
-    uint gidz = get_global_id(2);
-    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
-    int c, hL, hR;
-    GET_CENTER_HL(Lx);
-    float sum = 0.0f;
-
-    for (int jy = 0; jy <= hR+hL; jy++) {
-        for (int jx = 0; jx <= hR+hL; jx++) {
-            sum += READ_IMAGE_XY * READ_FILTER_VAL(jx, jy);
-        }
-    }
-    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-#endif
-
-#if IMAGE_DIMS == 3 && FILTER_DIMS == 2
-// Convolution with 2D kernel
-// Works for batched 2D on 3D.
-__kernel void convol_2D_XZ_tex(
-    read_only IMAGE_TYPE input,
-    __global float * output,
-    read_only FILTER_TYPE filter,
-    int Lx, // filter number of columns,
-    int Lz, // filter number of rows,
-    int Nx, // input/output number of columns
-    int Ny, // input/output number of rows
-    int Nz  // input/output depth
-)
-{
-    uint gidx = get_global_id(0);
-    uint gidy = get_global_id(1);
-    uint gidz = get_global_id(2);
-    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
-    int c, hL, hR;
-    GET_CENTER_HL(Lx);
-    float sum = 0.0f;
-
-    for (int jz = 0; jz <= hR+hL; jz++) {
-        for (int jx = 0; jx <= hR+hL; jx++) {
-            sum += READ_IMAGE_XZ * READ_FILTER_VAL(jx, jz);
-        }
-    }
-    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-
-
-// Convolution with 2D kernel
-// Works for batched 2D on 3D.
-__kernel void convol_2D_YZ_tex(
-    read_only IMAGE_TYPE input,
-    __global float * output,
-    read_only FILTER_TYPE filter,
-    int Lx, // filter number of columns,
-    int Lz, // filter number of rows,
-    int Nx, // input/output number of columns
-    int Ny, // input/output number of rows
-    int Nz  // input/output depth
-)
-{
-    uint gidx = get_global_id(0);
-    uint gidy = get_global_id(1);
-    uint gidz = get_global_id(2);
-    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
-    int c, hL, hR;
-    GET_CENTER_HL(Lx);
-    float sum = 0.0f;
-
-    for (int jz = 0; jz <= hR+hL; jz++) {
-        for (int jy = 0; jy <= hR+hL; jy++) {
-            sum += READ_IMAGE_YZ * READ_FILTER_VAL(jy, jz);
-        }
-    }
-    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-#endif
-
-
-/******************************************************************************/
-/**************************** 3D Convolution **********************************/
-/******************************************************************************/
-
-#if IMAGE_DIMS == 3 && FILTER_DIMS == 3
-// Convolution with 3D kernel
-__kernel void convol_3D_XYZ_tex(
-    read_only IMAGE_TYPE input,
-    __global float * output,
-    read_only FILTER_TYPE filter,
-    int Lx, // filter number of columns,
-    int Ly, // filter number of rows,
-    int Lz, // filter number of rows,
-    int Nx, // input/output number of columns
-    int Ny, // input/output number of rows
-    int Nz  // input/output depth
-)
-{
-    uint gidx = get_global_id(0);
-    uint gidy = get_global_id(1);
-    uint gidz = get_global_id(2);
-    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
-    int c, hL, hR;
-    GET_CENTER_HL(Lx);
-    float sum = 0.0f;
-
-    for (int jz = 0; jz <= hR+hL; jz++) {
-        for (int jy = 0; jy <= hR+hL; jy++) {
-            for (int jx = 0; jx <= hR+hL; jx++) {
-                sum += READ_IMAGE_XYZ * READ_FILTER_VAL(jx, jy, jz);
-            }
-        }
-    }
-    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-#endif
diff --git a/silx/resources/opencl/doubleword.cl b/silx/resources/opencl/doubleword.cl
deleted file mode 100644
index a0ebfda..0000000
--- a/silx/resources/opencl/doubleword.cl
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * OpenCL library for double word floating point calculation using compensated arithmetics
- *
- * The theoritical basis can be found in Valentina Popescu's PhD thesis:
- * Towards fast and certified multi-precision libraries
- * Reference LYSEN036
- * http://www.theses.fr/2017LYSEN036
- * All page number and equation number are refering to this document. 
- * 
- * The precision of the calculation (bounds) is provided in ULP (smallest possible mantissa) 
- * and come from the table 2.2 (page 68 of the thesis).
- * The number of equivalent FLOP is taken from the table 2.3 (page 69 the thesis). 
- * Note that FLOP are not all equal: a division is much more expensive than an addition.
- */
-
-//This library can be expanded to double-double by redefining fp, fp2 and one to double, double2 and 1.0.
-#ifdef DOUBLEDOUBLE
-#define fp double
-#define fp2 double2
-#define one 1.0
-#else
-#define fp float
-#define fp2 float2
-#define one 1.0f
-#endif
-
-/* Nota: i386 computer use x87 registers which are larger than the 32bits precision
- * which can invalidate the error compensation mechanism.
- *
- * We use the trick to declare some variable "volatile" to enforce the actual
- * precision reduction of those variables.
-*/
-
-#ifndef X87_VOLATILE
-# define X87_VOLATILE
-#endif
-
-//Algorithm 1, p23, theorem 1.1.12. Requires e_x > e_y, valid if |x| > |y| 
-inline fp2 fast_fp_plus_fp(fp x, fp y){
-    X87_VOLATILE fp s = x + y;
-    X87_VOLATILE fp z = s - x;
-    fp e = y - z;
-    return (fp2)(s, e);
-}
-
-//Algorithm 2, p24, same as fast_fp_plus_fp without the condition on e_x and e_y
-inline fp2 fp_plus_fp(fp x, fp y){
-    X87_VOLATILE fp s = x + y;
-    X87_VOLATILE fp xp = s - y;
-    X87_VOLATILE fp yp = s - xp;
-    X87_VOLATILE fp dx = x - xp;
-    X87_VOLATILE fp dy = y - yp;
-    return (fp2)(s, dx+dy);  
-}
-
-//Algorithm 3, p24: multiplication with a FMA
-inline fp2 fp_times_fp(fp x, fp y){
-    fp p = x * y;
-    fp e = fma(x, y, -p);
-    return (fp2)(p, e);  
-}
-
-//Algorithm 7, p38: Addition of a FP to a DW. 10flop bounds:2u²+5u³
-inline fp2 dw_plus_fp(fp2 x, fp y){
-    fp2 s = fp_plus_fp(x.s0, y);
-    X87_VOLATILE fp v = x.s1 + s.s1;
-    return fast_fp_plus_fp(s.s0, v);
-}
-
-//Algorithm 9, p40: addition of two DW: 20flop bounds:3u²+13u³
-inline fp2 dw_plus_dw(fp2 x, fp2 y){
-    fp2 s = fp_plus_fp(x.s0, y.s0);
-    fp2 t = fp_plus_fp(x.s1, y.s1);
-    fp2 v = fast_fp_plus_fp(s.s0, s.s1 + t.s0);
-    return fast_fp_plus_fp(v.s0, t.s1 + v.s1);
-}
-
-//Algorithm 12, p49: Multiplication FP*DW: 6flops bounds:2u²
-inline fp2 dw_times_fp(fp2 x, fp y){
-    fp2 c = fp_times_fp(x.s0, y);
-    return fast_fp_plus_fp(c.s0, fma(x.s1, y, c.s1));
-}
-
-//Algorithm 14, p52: Multiplication DW*DW, 8 flops bounds:6u²
-inline fp2 dw_times_dw(fp2 x, fp2 y){
-    fp2 c = fp_times_fp(x.s0, y.s0);
-    X87_VOLATILE fp l = fma(x.s1, y.s0, x.s0 * y.s1);
-    return fast_fp_plus_fp(c.s0, c.s1 + l);
-}
-
-//Algorithm 17, p55: Division DW / FP, 10flops bounds: 3.5u²
-inline fp2 dw_div_fp(fp2 x, fp y){
-    X87_VOLATILE fp th = x.s0 / y;
-    fp2 pi = fp_times_fp(th, y);
-    fp2 d = x - pi;
-    X87_VOLATILE fp delta = d.s0 + d.s1;
-    X87_VOLATILE fp tl = delta/y; 
-    return fast_fp_plus_fp(th, tl);
-}
-
-//Derived from algorithm 20, p64: Inversion 1/ DW, 22 flops
-inline fp2 inv_dw(fp2 y){
-    X87_VOLATILE fp th = one/y.s0;
-    X87_VOLATILE fp rh = fma(-y.s0, th, one);
-    X87_VOLATILE fp rl = -y.s1 * th;
-    fp2 e = fast_fp_plus_fp(rh, rl);
-    fp2 delta = dw_times_fp(e, th);
-    return dw_plus_fp(delta, th);
-}
-    
-//Algorithm 20, p64: Division DW / DW, 30 flops: bounds:9.8u²
-inline fp2 dw_div_dw(fp2 x, fp2 y){
-    return dw_times_dw(x, inv_dw(y));
-}
-
diff --git a/silx/resources/opencl/image/cast.cl b/silx/resources/opencl/image/cast.cl
deleted file mode 100644
index 9e23a82..0000000
--- a/silx/resources/opencl/image/cast.cl
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- *   Project: SILX: Alogorithms for image processing
- *
- *   Copyright (C) 2013-2017 European Synchrotron Radiation Facility
- *                           Grenoble, France
- *
- *   Principal authors: J. Kieffer (kieffer@esrf.fr)
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#ifndef NB_COLOR
-    #define NB_COLOR 1
-#endif
-
-
-/**
- * \brief Cast values of an array of uint8 into a float output array.
- *
- * :param array_input:   Pointer to global memory with the input data as unsigned8 array
- * :param array_float:   Pointer to global memory with the output data as float array
- * :param width:         Width of the image
- * :param height:        Height of the image
- */
-kernel void u8_to_float( global unsigned char  *array_input,
-                         global float *array_float,
-                         const int width,
-                         const int height)
-{
-    //Global memory guard for padding
-    if ((get_global_id(0) < width) && (get_global_id(1)<height))
-    {
-        int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1));
-        for (int c=0; c<NB_COLOR; c++)
-        {
-            array_float[i + c] = (float) array_input[i + c];
-        } //end loop over colors
-    } //end test in image
-} //end kernel
-
-/**
- * \brief Cast values of an array of uint8 into a float output array.
- *
- * :param array_input:   Pointer to global memory with the input data as signed8 array
- * :param array_float:   Pointer to global memory with the output data as float array
- * :param width:         Width of the image
- * :param height:        Height of the image
- */
-kernel void s8_to_float( global char  *array_input,
-                         global float *array_float,
-                         const int width,
-                         const int height)
-{
-    //Global memory guard for padding
-    if ((get_global_id(0) < width) && (get_global_id(1)<height))
-    {
-        int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1));
-        for (int c=0; c<NB_COLOR; c++)
-        {
-            array_float[i + c] = (float) array_input[i + c];
-        } //end loop over colors
-    } //end test in image
-} //end kernel
-
-
-/**
- * \brief cast values of an array of uint16 into a float output array.
- *
- * :param array_input:    Pointer to global memory with the input data as unsigned16 array
- * :param array_float:    Pointer to global memory with the output data as float array
- * :param width:          Width of the image
- * :param height:         Height of the image
- */
-kernel void u16_to_float(global unsigned short  *array_input,
-                         global float *array_float,
-                         const int width,
-                         const int height)
-{
-    //Global memory guard for padding
-    if ((get_global_id(0) < width) && (get_global_id(1)<height))
-    {
-        int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1));
-        for (int c=0; c<NB_COLOR; c++)
-        {
-            array_float[i + c] = (float) array_input[i + c];
-        }           //end loop over colors
-    } //end test in image
-} //end kernel
-
-/**
- * \brief cast values of an array of int16 into a float output array.
- *
- * :param array_input:    Pointer to global memory with the input data as signed16 array
- * :param array_float:    Pointer to global memory with the output data as float array
- * :param width:          Width of the image
- * :param height:         Height of the image
- */
-kernel void s16_to_float(global short *array_input,
-                         global float *array_float,
-                         const int width,
-                         const int height)
-{
-    //Global memory guard for padding
-    if ((get_global_id(0) < width) && (get_global_id(1)<height))
-    {
-        int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1));
-        for (int c=0; c<NB_COLOR; c++)
-        {
-            array_float[i + c] = (float) array_input[i + c];
-        } //end loop over colors
-    } //end test in image
-}//end kernel
-
-/**
- * \brief cast values of an array of uint32 into a float output array.
- *
- * :param array_input:    Pointer to global memory with the input data as unsigned32 array
- * :param array_float:    Pointer to global memory with the output data as float array
- * :param width:          Width of the image
- * :param height:         Height of the image
- */
-kernel void u32_to_float(global unsigned int  *array_input,
-                         global float *array_float,
-                         const int width,
-                         const int height)
-{
-    //Global memory guard for padding
-    if ((get_global_id(0) < width) && (get_global_id(1)<height))
-    {
-        int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1));
-        for (int c=0; c<NB_COLOR; c++)
-        {
-            array_float[i + c] = (float) array_input[i + c];
-        } //end loop over colors
-    } //end test in image
-}//end kernel
-
-
-/**
- * \brief convert values of an array of int32 into a float output array.
- *
- * :param array_input:    Pointer to global memory with the data in int
- * :param array_float:    Pointer to global memory with the data in float
- * :param width:          Width of the image
- * :param height:         Height of the image
- */
-kernel void s32_to_float(global int  *array_input,
-                         global float  *array_float,
-                         const int width,
-                         const int height)
-{
-    //Global memory guard for padding
-    if ((get_global_id(0) < width) && (get_global_id(1)<height))
-    {
-        int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1));
-        for (int c=0; c<NB_COLOR; c++)
-        {
-            array_float[i + c] = (float) array_input[i + c];
-        } //end loop over colors
-    } //end test in image
-}//end kernel
-
diff --git a/silx/resources/opencl/image/histogram.cl b/silx/resources/opencl/image/histogram.cl
deleted file mode 100644
index 7fb1485..0000000
--- a/silx/resources/opencl/image/histogram.cl
+++ /dev/null
@@ -1,178 +0,0 @@
-//CL//
-
-/*
- *   Project: SILX: Alogorithms for image processing
- *
- *   Copyright (C) 2013-2018 European Synchrotron Radiation Facility
- *                           Grenoble, France
- *
- *   Principal authors: J. Kieffer (kieffer@esrf.fr)
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/* Single kernel histogram for float array.
- *
- * Can perform histograming in log-scale (using the arcsinh)
-
-Parameters:
-  - data: buffer with the image content in float (input)
-  - data_size: input
-  - mini: Lower bound of the first bin
-  - maxi: upper bouns of the last bin
-  - map_operation: Type of pre-processor to use. if if set to !=0, use log scale
-  - hist: resulting histogram (ouptut)
-  - hist_size: number of bins
-  - tmp_hist: temporary storage of size  hist_size*num_groups
-  - processed: temporary storage of size 1 initialially set to 0
-  - local_hist: local storage of size hist_size
-
-
-Grid information:
-    * use the largest WG size. If it is larger than the number of bins (hist_size),
-    take the power of 2 immediately above
-    *Schedule as many WG as the device has compute engines. No need to schuedule more,
-    it is just a waist of memory
-    * The histogram should fit in shared (local) memory and tmp_hist can be large!
-
-Assumes:
-    hist and local_hist have size hist_size
-    edges has size hist_size+2
-    tmp_hist has size hist_size*num_groups
-    processed is of size one and initially set to 0
-
-*/
-
-
-static float preprocess(float value, int code)
-{
-    //This function can be modified to have more scales
-    if (code!=0)
-    {
-        return asinh(value);
-    }
-    else
-    {
-        return value;
-    }
-}
-
-kernel void histogram(global float *data,
-                      int data_size,
-                      float mini,
-                      float maxi,
-                      int map_operation,
-                      global int *hist,
-                      global float *edges,
-                      int hist_size,
-                      global int *tmp_hist,
-                      global int *processed,
-                      local int *local_hist)
-{
-    // each thread
-    int lid = get_local_id(0);
-    int ws = get_local_size(0);
-    int nb_engine = get_num_groups(0);
-    int engine_id = get_group_id(0);
-
-    // memset the local_hist array
-    for (int i=0; i<hist_size; i+=ws)
-    {
-        int j = i + lid;
-        if (j<hist_size)
-        {
-            local_hist[j] = 0;
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Process the local data and accumulate in shared memory
-    int bloc_size = (int) ceil((float)data_size/(float)nb_engine);
-    int start = bloc_size * engine_id;
-    int stop = min(start + bloc_size, data_size);
-    float vmini = preprocess(mini, map_operation);
-    float vscale = (float)hist_size/(preprocess(maxi, map_operation) -vmini);
-    for (int i = start; i<stop; i+=ws)
-    {
-        int address = i + lid;
-        if (address < stop)
-        {
-            float value = data[address];
-            if ((!isnan(value)) && (value>=mini) && (value<=maxi))
-            {
-                float vvalue = (preprocess(value, map_operation)-vmini)*vscale;
-                int idx = clamp((int) vvalue, 0, hist_size - 1);
-                atomic_inc(&local_hist[idx]);
-            }
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    //Now copy result into the right place and reset the first value of the shared array
-    for (int i=0; i<hist_size; i+=ws)
-    {
-        int j = i + lid;
-        if (j<hist_size)
-        {
-            tmp_hist[hist_size * engine_id + j] = local_hist[j];
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    local_hist[0] = 0;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    //Increment the system wide shared variable processed and share the result with the workgroup
-    if (lid == 0)
-    {
-        local_hist[0] = atomic_inc(processed);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // If we are the engine last to work, perform the concatenation of all results
-
-    if ((local_hist[0] + 1) == nb_engine)
-    {
-        for (int i=0; i<hist_size; i+=ws)
-        {
-            int j = i + lid;
-            int lsum = 0;
-            if (j<hist_size)
-            {
-                for (int k=0; k<nb_engine; k++)
-                {
-                    lsum += tmp_hist[hist_size * k + j];
-                }
-                hist[j] = lsum;
-                edges[j] = vmini + j/vscale;
-            }
-        }
-        // Finally reset the counter
-        if (lid == 0)
-        {
-            processed[0] = 0;
-            edges[hist_size] = vmini + hist_size/vscale;;
-        }
-
-    }
-}
diff --git a/silx/resources/opencl/image/map.cl b/silx/resources/opencl/image/map.cl
deleted file mode 100644
index 804a5a1..0000000
--- a/silx/resources/opencl/image/map.cl
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- *   Project: SILX: Data analysis library
- *            kernel for maximum and minimum calculation
- *
- *
- *   Copyright (C) 2013-2017 European Synchrotron Radiation Facility
- *                           Grenoble, France
- *
- *   Principal authors: J. Kieffer (kieffer@esrf.fr)
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- *
- */
-
-#ifndef NB_COLOR
-    #define NB_COLOR 1
-#endif
-
-
-/**
- * \brief Linear scale of signal in an image (maybe multi-color)
- *
- * :param image: contains the input image
- * :param output: contains the output image after scaling
- * :param max_min_input: 2-floats containing the maximum and the minimum value of image
- * :param minimum: float scalar with the minimum of the output
- * :param maximum: float scalar with the maximum of the output
- * :param width: integer, number of columns the matrices
- * :param height: integer, number of lines of the matrices
- *
- *
- */
-
-
-kernel void normalize_image(global float* image,
-                            global float* output,
-                            int width,
-                            int height,
-                            global float* max_min_input,
-                            float minimum,
-                            float maximum
-                            )
-{
-    //Global memory guard for padding
-    if((get_global_id(0) < width) && (get_global_id(1)<height))
-    {
-        int idx = NB_COLOR* (get_global_id(0) + width * get_global_id(1));
-        float mini_in, maxi_in, scale;
-        maxi_in = max_min_input[0];
-        mini_in = max_min_input[1];
-        if (maxi_in == mini_in)
-        {
-            scale = NAN;
-        }
-        else
-        {
-            scale = (maximum - minimum) / (maxi_in - mini_in);
-        }
-
-        for (int c=0; c<NB_COLOR; c++)
-        {
-            output[idx+c] = fma(scale, image[idx+c]-mini_in, minimum);
-        } // end color loop
-    }//end if in IMAGE
-}//end kernel
diff --git a/silx/resources/opencl/image/max_min.cl b/silx/resources/opencl/image/max_min.cl
deleted file mode 100644
index 246cd48..0000000
--- a/silx/resources/opencl/image/max_min.cl
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- *   Project: SILX: Data analysis library
- *            kernel for maximum and minimum calculation
- *
- *
- *   Copyright (C) 2013-2017 European Synchrotron Radiation Facility
- *                           Grenoble, France
- *
- *   Principal authors: J. Kieffer (kieffer@esrf.fr)
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- *
- */
-
-#ifndef NB_COLOR
-    #define NB_COLOR 1
-#endif
-
-
-
-#define REDUCE(a, b) ((float2) (fmax(a.x, b.x), fmin(a.y, b.y)))
-
-static float2 read_and_map(int idx,
-                           global float* data)
-{
-    idx *= NB_COLOR;
-    float tmp = data[idx];
-    float2 res = (float2) (tmp, tmp);
-    if (NB_COLOR > 1)
-    {
-        for (int c=1; c<NB_COLOR; c++)
-        {
-            tmp = data[idx + c];
-            res = (float2) (fmax(res.x, tmp), fmin(res.y, tmp));
-        }
-    }
-    return res;
-}
-
-
-/**
- * \brief max_min_global_stage1: Look for the maximum an the minimum of an array. stage1
- *
- * optimal workgroup size: 2^n greater than sqrt(size), limited to 512
- * optimal total item size:  (workgroup size)^2
- * if size >total item size: adjust seq_count.
- *
- * :param data:       Float pointer to global memory storing the vector of data.
- * :param out:        Float2 pointer to global memory storing the temporary results (workgroup size)
- * :param seq_count:  how many blocksize each thread should read
- * :param size:       size of the problem (number of element in the image
- * :param l_data      Shared memory: 2 float per thread in workgroup
- *
-**/
-
-
-kernel void max_min_reduction_stage1( global const float *data,
-                                      global float2 *out,
-                                      int size,
-                                      local  float2 *l_data)// local storage 2 float per thread
-{
-    int group_size =  get_local_size(0);
-    int lid = get_local_id(0);
-    float2 acc;
-    int big_block = group_size * get_num_groups(0);
-    int i =  lid + group_size * get_group_id(0);
-
-    if (lid<size)
-        acc = read_and_map(lid, data);
-    else
-        acc = read_and_map(0, data);
-
-    // Linear pre-reduction stage 0
-
-    while (i<size){
-      acc = REDUCE(acc, read_and_map(i, data));
-      i += big_block;
-    }
-
-    // parallel reduction stage 1
-
-    l_data[lid] = acc;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int block=group_size/2; block>1; block/=2)
-        {
-            if ((lid < block) && ((lid + block)<group_size)){
-                l_data[lid] = REDUCE(l_data[lid], l_data[lid + block]);
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
-        }
-    if (lid == 0)
-    {
-        if (group_size > 1)
-        {
-            acc = REDUCE(l_data[0], l_data[1]);
-        }
-        else
-        {
-            acc = l_data[0];
-        }
-        out[get_group_id(0)] = acc;
-    }
-}
-
-
-/**
- * \brief global_max_min: Look for the maximum an the minimum of an array.
- *
- *
- *
- * :param data2:      Float2 pointer to global memory storing the vector of pre-reduced data (workgroup size).
- * :param maximum:    Float pointer to global memory storing the maximum value
- * :param minumum:    Float pointer to global memory storing the minimum value
- * :param l_data      Shared memory: 2 float per thread in workgroup
- *
-**/
-
-kernel void max_min_reduction_stage2(
-        global const float2 *data2,
-        global float2 *maxmin,
-        local  float2 *l_data)// local storage 2 float per thread
-{
-    int lid = get_local_id(0);
-    int group_size =  get_local_size(0);
-    float2 acc = (float2)(-1.0f, -1.0f);
-    if (lid<=group_size)
-    {
-        l_data[lid] = data2[lid];
-    }
-    else
-    {
-        l_data[lid] = acc;
-    }
-
-    // parallel reduction stage 2
-
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int block=group_size/2; block>1; block/=2)
-    {
-        if ((lid < block) && ((lid + block)<group_size))
-        {
-            l_data[lid] = REDUCE(l_data[lid], l_data[lid + block]);
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-    }
-
-    if (lid == 0  )
-    {
-        if ( group_size > 1)
-        {
-            acc = REDUCE(l_data[0], l_data[1]);
-        }
-        else
-        {
-            acc = l_data[0];
-        }
-        maxmin[0] = acc;
-    }
-}
-
-/*This is the serial version of the min_max kernel.
- *
- * It has to be launched with WG=1 and only 1 WG has to be launched !
- *
- * :param data:       Float pointer to global memory storing the vector of data.
- * :param size:       size of the
- * :param maximum:    Float pointer to global memory storing the maximum value
- * :param minumum:    Float pointer to global memory storing the minimum value
- *
- *
- */
-kernel void max_min_serial(
-                            global const float *data,
-                            unsigned int size,
-                            global float2 *maxmin
-                            )
-{
-    float2 acc = read_and_map(0, data);
-    for (int i=1; i<size; i++)
-    {
-        acc = REDUCE(acc, read_and_map(i, data));
-    }
-
-    maxmin[0] = acc;
-}
diff --git a/silx/resources/opencl/kahan.cl b/silx/resources/opencl/kahan.cl
deleted file mode 100644
index c23d84d..0000000
--- a/silx/resources/opencl/kahan.cl
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * OpenCL library for 32-bits floating point calculation using compensated arithmetics
- */
-
-/* Nota: i386 computer use x87 registers which are larger than the 32bits precision
- * which can invalidate the error compensation mechanism.
- *
- * We use the trick to declare some variable "volatile" to enforce the actual
- * precision reduction of those variables.
-*/
-
-#ifndef X87_VOLATILE
-# define X87_VOLATILE
-#endif
-
-// calculate acc.s0+value with error compensation
-// see https://en.wikipedia.org/wiki/Kahan_summation_algorithm
-// unlike wikipedia, here the exact value = acc.s0 + acc.s1 (performed in higher precision)
-
-static inline float2 kahan_sum(float2 acc, float value)
-{
-    if (value)
-    {
-        float sum = acc.s0;
-        float err = acc.s1;
-        if (fabs(value) > fabs(sum))
-        {
-            float tmp = sum;
-            sum = value;
-            value = tmp;
-        }
-
-        float cor = value + err;
-        X87_VOLATILE float target = sum + cor;
-        err = cor - (target - sum);
-        return (float2)(target, err);
-    }
-    else
-    {
-        return acc;
-    }
-}
-
-// calculate a*b with error compensation
-// see https://hal.archives-ouvertes.fr/hal-01367769/document
-static inline float2 comp_prod(float a, float b)
-{
-    X87_VOLATILE float x = a * b;
-    float y = fma(a, b, -x);
-    return (float2)(x, y);
-}
-
-// calculate a + b with error compensation
-static inline float2 compensated_sum(float2 a, float2 b)
-{
-    float err = a.s1 + b.s1;
-    float first = a.s0;
-    float second = b.s0;
-    if (fabs(second) > fabs(first))
-    {//swap first and second
-        float tmp = first;
-        first = second;
-        second = tmp;
-    }
-    float cor = second + err;
-    X87_VOLATILE float target = first + cor;
-    err = cor - (target - first);
-    return (float2)(target, err);
-}
-
-// calculate a * b with error compensation
-static inline float2 compensated_mul(float2 a, float2 b)
-{
-    float2 tmp;
-    tmp = kahan_sum((float2)(a.s0*b.s0, a.s0*b.s1), a.s1*b.s0);
-    tmp = kahan_sum(tmp, a.s1*b.s1);
-    return tmp;
-}
-
-// calculate 1/a  with error compensation (Needs validation)
-static inline float2 compensated_inv(float2 a)
-{
-    float main = a.s0;
-    float err  = a.s1;
-    return (float2)(1.0f/main, -err/(main*main));
-}
-
-// calculate a/b  with error compensation (Needs validation)
-static inline float2 compensated_div(float2 a, float2 b)
-{
-    float ah = a.s0;
-    float al  = a.s1;
-    float bh = b.s0;
-    float bl  = b.s1;
-    float bl_over_bh = bl/bh;
-    return kahan_sum(kahan_sum(a, -a.s1 * bl_over_bh), -a.s0 * bl_over_bh) / bh;
-}
-
-
-// calculate a.b where a and b are float2
-static inline float2 comp_dot2(float2 a, float2 b)
-{
-    return compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1));
-}
-
-// calculate a.b where a and b are float3
-static inline float2 comp_dot3(float3 a, float3 b)
-{
-    return compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)), comp_prod(a.s2, b.s2));
-}
-
-// calculate a.b where a and b are float4
-static inline float2 comp_dot4(float4 a, float4 b)
-{
-    return compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)),
-                           compensated_sum(comp_prod(a.s2, b.s2), comp_prod(a.s3, b.s3)));
-}
-
-// calculate a.b where a and b are float8
-static inline float2 comp_dot8(float8 a, float8 b)
-{
-    return compensated_sum(
-            compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)),
-                           compensated_sum(comp_prod(a.s2, b.s2), comp_prod(a.s3, b.s3))),
-            compensated_sum(compensated_sum(comp_prod(a.s4, b.s4), comp_prod(a.s5, b.s5)),
-                           compensated_sum(comp_prod(a.s6, b.s6), comp_prod(a.s7, b.s7))));
-}
-
-// calculate a.b where a and b are float8
-static inline float2 comp_dot16(float16 a, float16 b)
-{
-    return compensated_sum(
-             compensated_sum(
-               compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)),
-                               compensated_sum(comp_prod(a.s2, b.s2), comp_prod(a.s3, b.s3))),
-               compensated_sum(compensated_sum(comp_prod(a.s4, b.s4), comp_prod(a.s5, b.s5)),
-                               compensated_sum(comp_prod(a.s6, b.s6), comp_prod(a.s7, b.s7)))),
-             compensated_sum(
-               compensated_sum(compensated_sum(comp_prod(a.s8, b.s8), comp_prod(a.s9, b.s9)),
-                               compensated_sum(comp_prod(a.sa, b.sa), comp_prod(a.sb, b.sb))),
-               compensated_sum(compensated_sum(comp_prod(a.sc, b.sc), comp_prod(a.sd, b.sd)),
-                               compensated_sum(comp_prod(a.se, b.se), comp_prod(a.sf, b.sf)))));
-}
diff --git a/silx/resources/opencl/linalg.cl b/silx/resources/opencl/linalg.cl
deleted file mode 100644
index 8710528..0000000
--- a/silx/resources/opencl/linalg.cl
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- *   Copyright (C) 2017 European Synchrotron Radiation Facility
- *                           Grenoble, France
- *
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- */
-
-/**
- *
- * Compute the spatial gradient of an image.
- *
- * slice: input image
- * slice_grad: output gradient
- * sizeX: number of columns of the image
- * sizeY: number of rows of the image
- *
- **/
-kernel void kern_gradient2D(
-    global float* slice,
-    global float2* slice_grad,
-    int sizeX,
-    int sizeY)
-{
-
-    int gidx = (int) get_global_id(0);
-    int gidy = (int) get_global_id(1);
-
-    if ((gidx < sizeX) && (gidy < sizeY))
-    {
-    	// Note the direction inconstancy ! (JK 07/2018)
-
-        float val_y = (gidx == (sizeX-1))? 0: slice[gidy*sizeX+gidx+1] - slice[gidy*sizeX+gidx];
-        float val_x = (gidy == (sizeY-1))? 0: slice[(gidy+1)*sizeX+gidx] - slice[(gidy)*sizeX+gidx];
-
-        slice_grad[gidy*sizeX+gidx].x = val_x;
-        slice_grad[gidy*sizeX+gidx].y = val_y;
-    }
-}
-
-/**
- *
- * Compute the spatial divergence of an image gradient.
- *
- * slice_grad: input gradient-like image
- * slice: output image
- * sizeX: number of columns of the input
- * sizeY: number of rows of the input
- *
- **/
-kernel void kern_divergence2D(
-    global float2* slice_grad,
-    global float* slice,
-    int sizeX,
-    int sizeY)
-{
-    int gidx = (int) get_global_id(0);
-    int gidy = (int) get_global_id(1);
-
-    if (gidx < sizeX && gidy < sizeY)
-    {
-        float val_x, val_y;
-        val_y = (gidx == 0)?
-        		slice_grad[(gidy)*sizeX+gidx].y :
-        		slice_grad[(gidy)*sizeX+gidx].y - slice_grad[(gidy)*sizeX+gidx-1].y;
-        val_x = (gidy == 0)?
-        		slice_grad[(gidy)*sizeX+gidx].x:
-                slice_grad[(gidy)*sizeX+gidx].x - slice_grad[(gidy-1)*sizeX+gidx].x;
-        slice[gidy*sizeX+gidx] = val_x + val_y;
-    }
-}
diff --git a/silx/resources/opencl/medfilt.cl b/silx/resources/opencl/medfilt.cl
deleted file mode 100644
index 0680230..0000000
--- a/silx/resources/opencl/medfilt.cl
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- *   Project: Azimuthal regroupping OpenCL kernel for PyFAI.
- *            Median filter for 1D, 2D and 3D datasets, only 2D for now
- *
- *
- *   Copyright (C) 2017-2017 European Synchrotron Radiation Facility
- *                           Grenoble, France
- *
- *   Principal authors: J. Kieffer (kieffer@esrf.fr)
- *   Last revision: 07/02/2017
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- */
-
-/*
- * Needs to be concatenated with bitonic.cl prior to compilation
-*/
-
-/*
- *  Perform the 2D median filtering of an image 2D image
- *
- * dim0 => wg=number_of_element in the tile /8
- * dim1 = x: wg=1
-
- *
- * Actually the workgoup size is a bit more complicated:
- * if window size = 1,3,5,7: WG1=8
- * if window size = 9,11,13,15: WG1=32
- * if window size = 17, ...,21: WG1=64
- *
- * More Generally the workgroup size must be: at least: kfs2*(kfs1+7)/8
- * Each thread treats 8 values aligned vertically, this allows (almost)
- * coalesced reading between threads in one line of the tile.
- *
- * Later on it will be more efficient to re-use the same tile
- * and slide it vertically by one line.
- * The additionnal need for shared memory will be kfs2 floats and a float8 as register.
- *
- * Theoritically, it should be possible to handle up to windows-size 83x83
- */
-__kernel void medfilt2d(__global float *image,  // input image
-                        __global float *result, // output array
-                        __local  float4 *l_data,// local storage 4x the number of threads
-                                 int khs1,      // Kernel half-size along dim1 (nb lines)
-                                 int khs2,      // Kernel half-size along dim2 (nb columns)
-                                 int height,    // Image size along dim1 (nb lines)
-                                 int width)     // Image size along dim2 (nb columns)
-{
-    int threadid = get_local_id(0);
-    //int wg = get_local_size(0);
-    int x = get_global_id(1);
-
-    if (x < width)
-    {
-        union
-        {
-            float  ary[8];
-            float8 vec;
-        } output, input;
-        input.vec = (float8)(MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT);
-        int kfs1 = 2 * khs1 + 1; //definition of kernel full size
-        int kfs2 = 2 * khs2 + 1;
-        int nbands = (kfs1 + 7) / 8; // 8 elements per thread, aligned vertically in 1 column
-        for (int y=0; y<height; y++)
-        {
-            //Select only the active threads, some may remain inactive
-            int nb_threads =  (nbands * kfs2);
-            int band_nr = threadid / kfs2;
-            int band_id = threadid % kfs2;
-            int pos_x = clamp((int)(x + band_id - khs2), (int) 0, (int) width-1);
-            int max_vec = clamp(kfs1 - 8 * band_nr, 0, 8);
-            if (y == 0)
-            {
-                for (int i=0; i<max_vec; i++)
-                {
-                    if (threadid<nb_threads)
-                    {
-                        int pos_y = clamp((int)(y + 8 * band_nr + i - khs1), (int) 0, (int) height-1);
-                        input.ary[i] = image[pos_x + width * pos_y];
-                    }
-                }
-            }
-            else
-            {
-                //store storage.s0 to some shared memory to retrieve it from another thread.
-                l_data[threadid].s0 = input.vec.s0;
-
-                //Offset to the bottom
-                input.vec = (float8)(input.vec.s1,
-                        input.vec.s2,
-                        input.vec.s3,
-                        input.vec.s4,
-                        input.vec.s5,
-                        input.vec.s6,
-                        input.vec.s7,
-                        MAXFLOAT);
-
-                barrier(CLK_LOCAL_MEM_FENCE);
-
-                int read_from = threadid + kfs2;
-                if (read_from < nb_threads)
-                    input.vec.s7 = l_data[read_from].s0;
-                else if (threadid < nb_threads) //we are on the last band
-                {
-                    int pos_y = clamp((int)(y + 8 * band_nr + max_vec - 1 - khs1), (int) 0, (int) height-1);
-                    input.ary[max_vec - 1] = image[pos_x + width * pos_y];
-                }
-
-            }
-
-            //This function is defined in bitonic.cl
-            output.vec = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0),
-                                       input.vec, l_data);
-
-            size_t target = (kfs1 * kfs2) / 2;
-            if (threadid == (target / 8))
-            {
-                result[y * width + x] = output.ary[target % 8];
-            }
-
-        }
-    }
-}
-
diff --git a/silx/resources/opencl/preprocess.cl b/silx/resources/opencl/preprocess.cl
deleted file mode 100644
index de35c48..0000000
--- a/silx/resources/opencl/preprocess.cl
+++ /dev/null
@@ -1,567 +0,0 @@
-/*
- *   Project: Azimuthal regroupping OpenCL kernel for PyFAI.
- *            Preprocessing program
- *
- *
- *   Copyright (C) 2012-2017 European Synchrotron Radiation Facility
- *                           Grenoble, France
- *
- *   Principal authors: J. Kieffer (kieffer@esrf.fr)
- *   Last revision: 19/01/2017
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-/**
- * \file
- *
- * \brief OpenCL kernels for image array casting, array mem-setting and normalizing
- *
- * Constant to be provided at build time:
- *   NIMAGE: size of the image
- *
- */
-
-#include "for_eclipse.h"
-
-/**
- * \brief cast values of an array of int8 into a float output array.
- *
- * - array_s8: Pointer to global memory with the input data as signed8 array
- * - array_float:  Pointer to global memory with the output data as float array
- */
-__kernel void
-s8_to_float(__global char  *array_s8,
-            __global float *array_float
-)
-{
-  int i = get_global_id(0);
-  //Global memory guard for padding
-  if (i < NIMAGE)
-    array_float[i] = (float)array_s8[i];
-}
-
-
-/**
- * \brief cast values of an array of uint8 into a float output array.
- *
- * - array_u8: Pointer to global memory with the input data as unsigned8 array
- * - array_float:  Pointer to global memory with the output data as float array
- */
-__kernel void
-u8_to_float(__global unsigned char  *array_u8,
-            __global float *array_float
-)
-{
-  int i = get_global_id(0);
-  //Global memory guard for padding
-  if (i < NIMAGE)
-    array_float[i] = (float)array_u8[i];
-}
-
-
-/**
- * \brief cast values of an array of int16 into a float output array.
- *
- * - array_s16: Pointer to global memory with the input data as signed16 array
- * - array_float:  Pointer to global memory with the output data as float array
- */
-__kernel void
-s16_to_float(__global short *array_s16,
-             __global float *array_float
-)
-{
-  int i = get_global_id(0);
-  //Global memory guard for padding
-  if (i < NIMAGE)
-    array_float[i] = (float)array_s16[i];
-}
-
-
-/**
- * \brief cast values of an array of uint16 into a float output array.
- *
- * - array_u16: Pointer to global memory with the input data as unsigned16 array
- * - array_float:  Pointer to global memory with the output data as float array
- */
-__kernel void
-u16_to_float(__global unsigned short  *array_u16,
-             __global float *array_float
-)
-{
-  int i = get_global_id(0);
-  //Global memory guard for padding
-  if (i < NIMAGE)
-    array_float[i] = (float)array_u16[i];
-}
-
-/**
- * \brief cast values of an array of uint32 into a float output array.
- *
- * - array_u32: Pointer to global memory with the input data as unsigned32 array
- * - array_float:  Pointer to global memory with the output data as float array
- */
-__kernel void
-u32_to_float(__global unsigned int  *array_u32,
-             __global float *array_float
-)
-{
-  int i = get_global_id(0);
-  //Global memory guard for padding
-  if (i < NIMAGE)
-    array_float[i] = (float)array_u32[i];
-}
-
-/**
- * \brief convert values of an array of int32 into a float output array.
- *
- * - array_int:  Pointer to global memory with the data as unsigned32 array
- * - array_float:  Pointer to global memory with the data float array
- */
-__kernel void
-s32_to_float(__global int  *array_int,
-             __global float  *array_float
-)
-{
-  int i = get_global_id(0);
-  //Global memory guard for padding
-  if (i < NIMAGE)
-    array_float[i] = (float)(array_int[i]);
-}
-
-
-/**
- * Functions to be called from an actual kernel.
- * \brief Performs the normalization of input image by dark subtraction,
- *        variance is propagated to second member of the float3
- *        flatfield, solid angle, polarization and absorption are stored in
- *        third member of the float3 returned.
- *
- * Invalid/Dummy pixels will all have the third-member set to 0, i.e. no weight
- *
- * - image           Float pointer to global memory storing the input image.
- * - do_dark         Bool/int: shall dark-current correction be applied ?
- * - dark            Float pointer to global memory storing the dark image.
- * - do_flat         Bool/int: shall flat-field correction be applied ?
- * - flat            Float pointer to global memory storing the flat image.
- * - do_solidangle   Bool/int: shall flat-field correction be applied ?
- * - solidangle      Float pointer to global memory storing the solid angle of each pixel.
- * - do_polarization Bool/int: shall polarization correction be applied ?
- * - polarization    Float pointer to global memory storing the polarization of each pixel.
- * - do_absorption   Bool/int: shall absorption correction be applied ?
- * - absorption      Float pointer to global memory storing the effective absoption of each pixel.
- * - do_mask         perform mask correction ?
- * - mask            Bool/char pointer to mask array
- * - do_dummy        Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
- * - dummy           Float: value for bad pixels
- * - delta_dummy     Float: precision for bad pixel value
- * - normalization_factor : divide the input by this value
- *
-**/
-
-static float3 _preproc3(const __global float  *image,
-                        const __global float  *variance,
-                        const          char   do_dark,
-                        const __global float  *dark,
-                        const          char   do_dark_variance,
-                        const __global float  *dark_variance,
-                        const          char   do_flat,
-                        const __global float  *flat,
-                        const          char   do_solidangle,
-                        const __global float  *solidangle,
-                        const          char   do_polarization,
-                        const __global float  *polarization,
-                        const          char   do_absorption,
-                        const __global float  *absorption,
-                        const          char   do_mask,
-                        const __global char   *mask,
-                        const          char   do_dummy,
-                        const          float  dummy,
-                        const          float  delta_dummy,
-                        const          float  normalization_factor)
-{
-    size_t i= get_global_id(0);
-    float3 result = (float3)(0.0, 0.0, 0.0);
-    if (i < NIMAGE)
-    {
-        if ((!do_mask) || (!mask[i]))
-        {
-            result.s0 = image[i];
-            if (variance != 0)
-                result.s1 = variance[i];
-            result.s2 = normalization_factor;
-            if ( (!do_dummy)
-                  ||((delta_dummy != 0.0f) && (fabs(result.s0-dummy) > delta_dummy))
-                  ||((delta_dummy == 0.0f) && (result.s0 != dummy)))
-            {
-                if (do_dark)
-                    result.s0 -= dark[i];
-                if (do_dark_variance)
-                    result.s1 += dark_variance[i];
-                if (do_flat)
-                {
-                    float one_flat = flat[i];
-                    if ( (!do_dummy)
-                         ||((delta_dummy != 0.0f) && (fabs(one_flat-dummy) > delta_dummy))
-                         ||((delta_dummy == 0.0f) && (one_flat != dummy)))
-                        result.s2 *= one_flat;
-                    else
-                        result.s2 = 0.0f;
-                }
-                if (do_solidangle)
-                    result.s2 *= solidangle[i];
-                if (do_polarization)
-                    result.s2 *= polarization[i];
-                if (do_absorption)
-                    result.s2 *= absorption[i];
-                if (isnan(result.s0) || isnan(result.s1) || isnan(result.s2) || (result.s2 == 0.0f))
-                    result = (float3)(0.0, 0.0, 0.0);
-            }
-            else
-            {
-                result = (float3)(0.0, 0.0, 0.0);
-            }//end if do_dummy
-        } // end if mask
-    };//end if NIMAGE
-    return result;
-};//end function
-
-
-/**
- * \brief Performs the normalization of input image by dark subtraction,
- *        flatfield, solid angle, polarization and absorption division.
- *
- * Intensities of images are corrected by:
- *  - dark (read-out) noise subtraction
- *  - Solid angle correction (division)
- *  - polarization correction (division)
- *  - flat fiels correction (division)
- * Corrections are made in place unless the pixel is dummy.
- * Dummy pixels are left untouched so that they remain dummy
- *
- * - image           Float pointer to global memory storing the input image.
- * - do_dark         Bool/int: shall dark-current correction be applied ?
- * - dark            Float pointer to global memory storing the dark image.
- * - do_flat         Bool/int: shall flat-field correction be applied ?
- * - flat            Float pointer to global memory storing the flat image.
- * - do_solidangle   Bool/int: shall flat-field correction be applied ?
- * - solidangle      Float pointer to global memory storing the solid angle of each pixel.
- * - do_polarization Bool/int: shall polarization correction be applied ?
- * - polarization    Float pointer to global memory storing the polarization of each pixel.
- * - do_absorption   Bool/int: shall absorption correction be applied ?
- * - absorption      Float pointer to global memory storing the effective absoption of each pixel.
- * - do_mask         perform mask correction ?
- * - mask            Bool/char pointer to mask array
- * - do_dummy        Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
- * - dummy           Float: value for bad pixels
- * - delta_dummy     Float: precision for bad pixel value
- * - normalization_factor : divide the input by this value
- *
-**/
-
-__kernel void
-corrections(const __global float  *image,
-            const          char   do_dark,
-            const __global float  *dark,
-            const          char   do_flat,
-            const __global float  *flat,
-            const          char   do_solidangle,
-            const __global float  *solidangle,
-            const          char   do_polarization,
-            const __global float  *polarization,
-			const          char   do_absorption,
-			const __global float  *absorption,
-            const          char   do_mask,
-            const __global char   *mask,
-            const          char   do_dummy,
-            const          float  dummy,
-            const          float  delta_dummy,
-            const          float  normalization_factor,
-                  __global float  *output
-            )
-{
-    size_t i= get_global_id(0);
-    float3 result = (float3)(0.0, 0.0, 0.0);
-    if (i < NIMAGE)
-    {
-        result = _preproc3(image,
-                            0,
-                            do_dark,
-                            dark,
-                            0,
-                            0,
-                            do_flat,
-                            flat,
-                            do_solidangle,
-                            solidangle,
-                            do_polarization,
-                            polarization,
-                            do_absorption,
-                            absorption,
-                            do_mask,
-                            mask,
-                            do_dummy,
-                            dummy,
-                            delta_dummy,
-                            normalization_factor);
-        if (result.s2 != 0.0f)
-            output[i] = result.s0 / result.s2;
-        else
-            output[i] = dummy;
-    };//end if NIMAGE
-
-};//end kernel
-
-
-/**
- * \brief Performs Normalization of input image with float2 output (num,denom)
- *
- * Intensities of images are corrected by:
- *  - dark (read-out) noise subtraction for the data
- *  - Solid angle correction (denominator)
- *  - polarization correction (denominator)
- *  - flat fiels correction (denominator)
- *
- * Corrections are made out of place.
- * Dummy pixels set both the numerator and denominator to 0
- *
- * - image           Float pointer to global memory storing the input image.
- * - do_dark         Bool/int: shall dark-current correction be applied ?
- * - dark            Float pointer to global memory storing the dark image.
- * - do_flat         Bool/int: shall flat-field correction be applied ?
- * - flat            Float pointer to global memory storing the flat image.
- * - do_solidangle   Bool/int: shall flat-field correction be applied ?
- * - solidangle      Float pointer to global memory storing the solid angle of each pixel.
- * - do_polarization Bool/int: shall flat-field correction be applied ?
- * - polarization    Float pointer to global memory storing the polarization of each pixel.
- * - do_dummy          Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
- * - dummy             Float: value for bad pixels
- * - delta_dummy       Float: precision for bad pixel value
- * - normalization_factor : divide the input by this value
- *
- *
-**/
-__kernel void
-corrections2(const __global float  *image,
-             const          char   do_dark,
-             const __global float  *dark,
-             const          char   do_flat,
-             const __global float  *flat,
-             const          char   do_solidangle,
-             const __global float  *solidangle,
-             const          char   do_polarization,
-             const __global float  *polarization,
-             const          char   do_absorption,
-             const __global float  *absorption,
-             const          char   do_mask,
-             const __global char   *mask,
-             const          char   do_dummy,
-             const          float  dummy,
-             const          float  delta_dummy,
-             const          float  normalization_factor,
-                   __global float2  *output
-            )
-{
-    size_t i= get_global_id(0);
-    float3 result = (float3)(0.0, 0.0, 0.0);
-    if (i < NIMAGE)
-    {
-        result = _preproc3(image,
-                            0,
-                            do_dark,
-                            dark,
-                            0,
-                            0,
-                            do_flat,
-                            flat,
-                            do_solidangle,
-                            solidangle,
-                            do_polarization,
-                            polarization,
-                            do_absorption,
-                            absorption,
-                            do_mask,
-                            mask,
-                            do_dummy,
-                            dummy,
-                            delta_dummy,
-                            normalization_factor);
-        output[i] = (float2)(result.s0, result.s2);
-    };//end if NIMAGE
-};//end kernel
-
-/**
- * \brief Performs Normalization of input image with float3 output (signal, variance, normalization) assuming poissonian signal
- *
- * Intensities of images are corrected by:
- *  - dark (read-out) noise subtraction for the data
- *  - Solid angle correction (denominator)
- *  - polarization correction (denominator)
- *  - flat fiels correction (denominator)
- *
- * Corrections are made out of place.
- * Dummy pixels set both the numerator and denominator to 0
- *
- * - image           Float pointer to global memory storing the input image.
- * - do_dark         Bool/int: shall dark-current correction be applied ?
- * - dark            Float pointer to global memory storing the dark image.
- * - do_flat         Bool/int: shall flat-field correction be applied ?
- * - flat            Float pointer to global memory storing the flat image.
- * - do_solidangle   Bool/int: shall flat-field correction be applied ?
- * - solidangle      Float pointer to global memory storing the solid angle of each pixel.
- * - do_polarization Bool/int: shall flat-field correction be applied ?
- * - polarization    Float pointer to global memory storing the polarization of each pixel.
- * - do_dummy          Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
- * - dummy             Float: value for bad pixels
- * - delta_dummy       Float: precision for bad pixel value
- * - normalization_factor : divide the input by this value
- *
- *
-**/
-__kernel void
-corrections3Poisson( const __global float  *image,
-                     const          char   do_dark,
-                     const __global float  *dark,
-                     const          char   do_flat,
-                     const __global float  *flat,
-                     const          char   do_solidangle,
-                     const __global float  *solidangle,
-                     const          char   do_polarization,
-                     const __global float  *polarization,
-                     const          char   do_absorption,
-                     const __global float  *absorption,
-                     const          char   do_mask,
-                     const __global char   *mask,
-                     const          char   do_dummy,
-                     const          float  dummy,
-                     const          float  delta_dummy,
-                     const          float  normalization_factor,
-                           __global float3  *output
-            )
-{
-    size_t i= get_global_id(0);
-    float3 result = (float3)(0.0, 0.0, 0.0);
-    if (i < NIMAGE)
-    {
-        result = _preproc3(image,
-                           image,
-                           do_dark,
-                           dark,
-                           do_dark,
-                           dark,
-                           do_flat,
-                           flat,
-                           do_solidangle,
-                           solidangle,
-                           do_polarization,
-                           polarization,
-                           do_absorption,
-                           absorption,
-                           do_mask,
-                           mask,
-                           do_dummy,
-                           dummy,
-                           delta_dummy,
-                           normalization_factor);
-        output[i] = result;
-    };//end if NIMAGE
-};//end kernel
-
-
-/**
- * \brief Performs Normalization of input image with float3 output (signal, variance, normalization)
- *
- * Intensities of images are corrected by:
- *  - dark (read-out) noise subtraction for the data
- *  - Solid angle correction (division)
- *  - polarization correction (division)
- *  - flat fiels correction (division)
- * Corrections are made in place unless the pixel is dummy.
- * Dummy pixels are left untouched so that they remain dummy
- *
- * - image              Float pointer to global memory storing the input image.
- * - do_dark         Bool/int: shall dark-current correction be applied ?
- * - dark            Float pointer to global memory storing the dark image.
- * - do_flat         Bool/int: shall flat-field correction be applied ?
- * - flat            Float pointer to global memory storing the flat image.
- * - do_solidangle   Bool/int: shall flat-field correction be applied ?
- * - solidangle      Float pointer to global memory storing the solid angle of each pixel.
- * - do_polarization Bool/int: shall flat-field correction be applied ?
- * - polarization    Float pointer to global memory storing the polarization of each pixel.
- * - do_dummy          Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
- * - dummy             Float: value for bad pixels
- * - delta_dummy       Float: precision for bad pixel value
- * - normalization_factor : divide the input by this value
- *
- *
-**/
-
-__kernel void
-corrections3(const __global float  *image,
-             const __global float  *variance,
-             const          char   do_dark,
-             const __global float  *dark,
-             const          char   do_dark_variance,
-             const __global float  *dark_variance,
-             const          char   do_flat,
-             const __global float  *flat,
-             const          char   do_solidangle,
-             const __global float  *solidangle,
-             const          char   do_polarization,
-             const __global float  *polarization,
-             const          char   do_absorption,
-             const __global float  *absorption,
-             const          char   do_mask,
-             const __global char   *mask,
-             const          char   do_dummy,
-             const          float  dummy,
-             const          float  delta_dummy,
-             const          float  normalization_factor,
-                   __global float3  *output
-            )
-{
-    size_t i= get_global_id(0);
-    float3 result = (float3)(0.0, 0.0, 0.0);
-    if (i < NIMAGE)
-    {
-        result = _preproc3( image,
-                            variance,
-                            do_dark,
-                            dark,
-                            do_dark_variance,
-                            dark_variance,
-                            do_flat,
-                            flat,
-                            do_solidangle,
-                            solidangle,
-                            do_polarization,
-                            polarization,
-                            do_absorption,
-                            absorption,
-                            do_mask,
-                            mask,
-                            do_dummy,
-                            dummy,
-                            delta_dummy,
-                            normalization_factor);
-        output[i] = result;
-    };//end if NIMAGE
-};//end kernel
-
-
diff --git a/silx/resources/opencl/proj.cl b/silx/resources/opencl/proj.cl
deleted file mode 100644
index 2a6d870..0000000
--- a/silx/resources/opencl/proj.cl
+++ /dev/null
@@ -1,345 +0,0 @@
-/*
- *   Copyright (C) 2017-2017 European Synchrotron Radiation Facility
- *                           Grenoble, France
- *
- * Based on the projector of PyHST2 - https://forge.epn-campus.eu/projects/pyhst2
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- */
-
-/*******************************************************************************/
-/************************ GPU VERSION (with textures) **************************/
-/*******************************************************************************/
-
-#ifndef DONT_USE_TEXTURES
-kernel void  forward_kernel(
-        global float *d_Sino,
-        read_only image2d_t d_slice,
-        int dimslice,
-        int num_bins,
-        global float* angles_per_project ,
-        float axis_position,
-        global float *d_axis_corrections,
-        global int* d_beginPos    ,
-        global int* d_strideJoseph,
-        global int* d_strideLine  ,
-        int num_projections,
-        int  dimrecx,
-        int  dimrecy,
-        float cpu_offset_x,
-        float cpu_offset_y,
-        int josephnoclip,
-        int normalize)
-{
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
-    const int tidx = get_local_id(0);
-    const int bidx = get_group_id(0);
-    const int tidy = get_local_id(1);
-    const int bidy = get_group_id(1);
-    float angle;
-    float cos_angle,sin_angle ;
-
-    local float corrections[16];
-    local int beginPos[16*2];
-    local int strideJoseph[16*2];
-    local int strideLine[16*2];
-
-    // thread will use corrections[tidy]
-    // All are read by first warp
-    int offset, OFFSET;
-    switch(tidy) {
-    case 0:
-        corrections[ tidx ]= d_axis_corrections[ bidy*16+tidx];
-        break;
-    case 1:
-    case 2:
-        offset = 16*(tidy-1);
-        OFFSET = dimrecy*(tidy-1);
-        beginPos    [offset + tidx ]=  d_beginPos[ OFFSET+ bidy*16+tidx]  ;
-        break;
-    case 3:
-    case 4:
-        offset = 16*(tidy-3);
-        OFFSET = dimrecy*(tidy-3);
-        strideJoseph[offset + tidx ]=  d_strideJoseph[OFFSET + bidy*16+tidx]  ;
-        break;
-    case 5:
-    case 6:
-        offset = 16*(tidy-5);
-        OFFSET = dimrecy*(tidy-5);
-        strideLine[offset + tidx ]=  d_strideLine[OFFSET + bidy*16+tidx]  ;
-        break;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    angle = angles_per_project[ bidy*16+tidy ]   ;
-    cos_angle = cos(angle);
-    sin_angle = sin(angle);
-
-    if(fabs(cos_angle) > 0.70710678f ) {
-        if( cos_angle>0) {
-            cos_angle = cos(angle);
-            sin_angle = sin(angle);
-        }
-        else {
-            cos_angle = -cos(angle);
-            sin_angle = -sin(angle);
-        }
-    }
-    else {
-        if( sin_angle>0) {
-            cos_angle =  sin(angle);
-            sin_angle = -cos(angle);
-        }
-        else {
-            cos_angle = -sin(angle);
-            sin_angle =  cos(angle);
-        }
-    }
-    float res=0.0f;
-    float axis_corr = axis_position  + corrections[ tidy ];
-    float axis      = axis_position ;
-    float xpix = ( bidx*16+tidx )-cpu_offset_x;
-    float posx = axis*(1.0f-sin_angle/cos_angle ) +(xpix-(axis_corr) )/cos_angle ;
-
-    float shiftJ = sin_angle/cos_angle;
-    float x1 = min(-sin_angle/cos_angle ,0.f);
-    float x2 = max(-sin_angle/cos_angle ,0.f);
-
-    float  Area;
-    Area=1.0f/cos_angle;
-    int stlA, stlB , stlAJ, stlBJ ;
-    stlA=strideLine[16+tidy];
-    stlB=strideLine[tidy];
-    stlAJ=strideJoseph[16+tidy];
-    stlBJ=strideJoseph[tidy];
-
-    int beginA = beginPos[16+tidy ];
-    int beginB = beginPos[tidy ];
-    float add;
-    int l;
-
-    if(josephnoclip) {
-        for(int j=0; j<dimslice; j++) {  // j: Joseph
-            x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f;
-            x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f;
-            add = read_imagef(d_slice, sampler, (float2) (x1, x2)).x; // add = tex2D(texSlice, x1,x2);
-            res += add;
-            posx += shiftJ;
-        }
-    }
-    else {
-        for(int j=0; j<dimslice; j++) {  // j: Joseph
-            x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f;
-            x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f;
-            l = (x1>=0.0f )*(x1<(dimslice+2))*( x2>=0.0f)*( x2<(dimslice+2) ) ;
-            add = read_imagef(d_slice, sampler, (float2) (x1, x2)).x; // add = tex2D(texSlice, x1,x2);
-            res += add*l;
-            posx += shiftJ;
-        }
-    }
-
-    if((bidy*16 + tidy) < num_projections && (bidx*16 + tidx) < num_bins) {
-        res *= Area;
-        if (normalize)
-            res *= M_PI_F * 0.5f / num_projections;
-        d_Sino[dimrecx*(bidy*16 + tidy) + (bidx*16 + tidx)] = res;
-    }
-}
-#endif
-
-
-/*******************************************************************************/
-/********************* CPU VERSION (without textures) **************************/
-/*******************************************************************************/
-
-
-kernel void  forward_kernel_cpu(
-        global float *d_Sino,
-        global float* d_slice,
-        int dimslice,
-        int num_bins,
-        global float* angles_per_project ,
-        float axis_position,
-        global float *d_axis_corrections,
-        global int* d_beginPos    ,
-        global int* d_strideJoseph,
-        global int* d_strideLine  ,
-        int num_projections,
-        int  dimrecx,
-        int  dimrecy,
-        float cpu_offset_x,
-        float cpu_offset_y,
-        int josephnoclip,
-        int normalize)
-{
-
-    const int tidx = get_local_id(0);
-    const int bidx = get_group_id(0);
-    const int tidy = get_local_id(1);
-    const int bidy = get_group_id(1);
-    float angle;
-    float cos_angle,sin_angle ;
-
-    local float corrections[16];
-    local int beginPos[16*2];
-    local int strideJoseph[16*2];
-    local int strideLine[16*2];
-
-    // thread will use corrections[tidy]
-    // All are read by first warp
-    int offset, OFFSET;
-    switch(tidy) {
-    case 0:
-        corrections[ tidx ]= d_axis_corrections[ bidy*16+tidx];
-        break;
-    case 1:
-    case 2:
-        offset = 16*(tidy-1);
-        OFFSET = dimrecy*(tidy-1);
-        beginPos    [offset + tidx ]=  d_beginPos[ OFFSET+ bidy*16+tidx]  ;
-        break;
-    case 3:
-    case 4:
-        offset = 16*(tidy-3);
-        OFFSET = dimrecy*(tidy-3);
-        strideJoseph[offset + tidx ]=  d_strideJoseph[OFFSET + bidy*16+tidx]  ;
-        break;
-    case 5:
-    case 6:
-        offset = 16*(tidy-5);
-        OFFSET = dimrecy*(tidy-5);
-        strideLine[offset + tidx ]=  d_strideLine[OFFSET + bidy*16+tidx]  ;
-        break;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    angle = angles_per_project[ bidy*16+tidy ]   ;
-    cos_angle = cos(angle);
-    sin_angle = sin(angle);
-
-    if(fabs(cos_angle) > 0.70710678f ) {
-        if( cos_angle>0) {
-            cos_angle = cos(angle);
-            sin_angle = sin(angle);
-        }
-        else {
-            cos_angle = -cos(angle);
-            sin_angle = -sin(angle);
-        }
-    }
-    else {
-        if( sin_angle>0) {
-            cos_angle =  sin(angle);
-            sin_angle = -cos(angle);
-        }
-        else {
-            cos_angle = -sin(angle);
-            sin_angle =  cos(angle);
-        }
-    }
-    float res=0.0f;
-    float axis_corr = axis_position  + corrections[ tidy ];
-    float axis      = axis_position ;
-    float xpix = ( bidx*16+tidx )-cpu_offset_x;
-    float posx = axis*(1.0f-sin_angle/cos_angle ) +(xpix-(axis_corr) )/cos_angle ;
-
-    float shiftJ = sin_angle/cos_angle;
-    float x1 = min(-sin_angle/cos_angle ,0.f);
-    float x2 = max(-sin_angle/cos_angle ,0.f);
-
-    float  Area;
-    Area=1.0f/cos_angle;
-    int stlA, stlB , stlAJ, stlBJ ;
-    stlA=strideLine[16+tidy];
-    stlB=strideLine[tidy];
-    stlAJ=strideJoseph[16+tidy];
-    stlBJ=strideJoseph[tidy];
-
-    int beginA = beginPos[16+tidy ];
-    int beginB = beginPos[tidy ];
-    int l;
-
-    int ym, yp, xm, xp;
-    float yc, xc;
-    float val;
-    if(josephnoclip) {
-        for(int j=0; j<dimslice; j++) {  // j: Joseph
-            x1 = beginA +(posx)*stlA + (j)*stlAJ+1.0f;
-            x2 = beginB +(posx)*stlB + (j)*stlBJ+1.0f;
-            /*
-              Bilinear interpolation
-            */
-            yc = fmin(fmax(x2, 0.0f), ((dimslice+2) - 1.0f)); // y_clipped
-            ym = (int) floor(yc); // y_minus
-            yp = (int) ceil(yc);  // y_plus
-
-            xc = fmin(fmax(x1, 0.0f), ((dimslice+2) - 1.0f));  // x_clipped
-            xm = (int) floor(xc); // x_minus
-            xp = (int) ceil(xc);  // x_plus
-
-            if ((ym == yp) && (xm == xp)) val = d_slice[ym*(dimslice+2) + xm];
-            else if (ym == yp) val = (d_slice[ym*(dimslice+2) + xm] * (xp - xc)) + (d_slice[ym*(dimslice+2) + xp] * (xc - xm));
-            else if (xm == xp) val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc)) + (d_slice[yp*(dimslice+2) + xm] * (yc - ym));
-            else val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc) * (xp - xc))
-                       + (d_slice[yp*(dimslice+2) + xm] * (yc - ym) * (xp - xc))
-                       + (d_slice[ym*(dimslice+2) + xp] * (yp - yc) * (xc - xm))
-                       + (d_slice[yp*(dimslice+2) + xp] * (yc - ym) * (xc - xm));
-            // ----------
-            res += val;
-            posx += shiftJ;
-        }
-    }
-    else {
-        for(int j=0; j<dimslice; j++) {  // j: Joseph
-            x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f;
-            x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f;
-            l = (x1>=0.0f )*(x1<(dimslice+2))*( x2>=0.0f)*( x2<(dimslice+2) ) ;
-            /*
-              Bilinear interpolation
-            */
-            yc = fmin(fmax(x2, 0.0f), ((dimslice+2) - 1.0f)); // y_clipped
-            ym = (int) floor(yc); // y_minus
-            yp = (int) ceil(yc);  // y_plus
-
-            xc = fmin(fmax(x1, 0.0f), ((dimslice+2) - 1.0f));  // x_clipped
-            xm = (int) floor(xc); // x_minus
-            xp = (int) ceil(xc);  // x_plus
-
-            if ((ym == yp) && (xm == xp)) val = d_slice[ym*(dimslice+2) + xm];
-            else if (ym == yp) val = (d_slice[ym*(dimslice+2) + xm] * (xp - xc)) + (d_slice[ym*(dimslice+2) + xp] * (xc - xm));
-            else if (xm == xp) val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc)) + (d_slice[yp*(dimslice+2) + xm] * (yc - ym));
-            else val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc) * (xp - xc))
-                       + (d_slice[yp*(dimslice+2) + xm] * (yc - ym) * (xp - xc))
-                       + (d_slice[ym*(dimslice+2) + xp] * (yp - yc) * (xc - xm))
-                       + (d_slice[yp*(dimslice+2) + xp] * (yc - ym) * (xc - xm));
-            // ----------
-            res += val*l;
-            posx += shiftJ;
-        }
-    }
-
-    if((bidy*16 + tidy) < num_projections && (bidx*16 + tidx) < num_bins) {
-        res *= Area;
-        if (normalize)
-            res *= M_PI_F * 0.5f / num_projections;
-        d_Sino[dimrecx*(bidy*16 + tidy) + (bidx*16 + tidx)] = res;
-    }
-}
diff --git a/silx/resources/opencl/sparse.cl b/silx/resources/opencl/sparse.cl
deleted file mode 100644
index c99a0e9..0000000
--- a/silx/resources/opencl/sparse.cl
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- *   Copyright (C) 2016-2019 European Synchrotron Radiation Facility
- *                           Grenoble, France
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#ifndef IMAGE_WIDTH
-  #error "Please define IMAGE_WIDTH parameter"
-#endif
-
-#ifndef DTYPE
-  #error "Please define DTYPE parameter"
-#endif
-
-#ifndef IDX_DTYPE
-  #error "Please define IDX_DTYPE parameter"
-#endif
-
-
-
-/**
- * Densify a matric from CSR format to "dense" 2D format.
- * The input CSR data consists in 3 arrays: (data, ind, iptr).
- * The output array is a 2D array of dimensions IMAGE_WIDTH * image_height.
- *
- * data: 1D array containing the nonzero data items.
- * ind: 1D array containing the column indices of the nonzero data items.
- * iptr: 1D array containing indirection indices, such that range
- *    [iptr[i], iptr[i+1]-1] of "data" and "ind" contain the relevant data
- *    of output row "i".
- * output: 2D array containing the densified data.
- * image_height: height (number of rows) of the output data.
-**/
-
-kernel void densify_csr(
-    const global DTYPE* data,
-    const global IDX_DTYPE* ind,
-    const global IDX_DTYPE* iptr,
-    global DTYPE* output,
-    int image_height
-)
-{
-    uint tid = get_local_id(0);
-    uint row_idx = get_global_id(1);
-    if ((tid >= IMAGE_WIDTH) || (row_idx >= image_height)) return;
-
-    local DTYPE line[IMAGE_WIDTH];
-
-    // Memset
-    //~ #pragma unroll
-    for (int k = 0; tid+k < IMAGE_WIDTH; k += get_local_size(0)) {
-        if (tid+k >= IMAGE_WIDTH) break;
-        line[tid+k] = 0.0f;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-
-    uint start = iptr[row_idx], end = iptr[row_idx+1];
-    //~ #pragma unroll
-    for (int k = start; k < end; k += get_local_size(0)) {
-        // Current work group handles one line of the final array
-        // on the current line, write data[start+tid] at column index ind[start+tid]
-        if (k+tid < end)
-            line[ind[k+tid]] = data[k+tid];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // write the current line (shared mem) into the output array (global mem)
-    //~ #pragma unroll
-    for (int k = 0; tid+k < IMAGE_WIDTH; k += get_local_size(0)) {
-        output[row_idx*IMAGE_WIDTH + tid+k] = line[tid+k];
-        if (k+tid >= IMAGE_WIDTH) return;
-    }
-}
diff --git a/silx/resources/opencl/statistics.cl b/silx/resources/opencl/statistics.cl
deleted file mode 100644
index 47d925b..0000000
--- a/silx/resources/opencl/statistics.cl
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- *   Project: Silx statics calculation
- *
- *
- *
- *   Copyright (C) 2012-2021 European Synchrotron Radiation Facility
- *                           Grenoble, France
- *
- *   Principal authors: J. Kieffer (kieffer@esrf.fr)
- *   Last revision: 17/05/2021
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-/**
- * \file
- *
- * \brief OpenCL kernels for min, max, mean and std calculation
- *
- * This module provides two functions to perform the `map` and the `reduce` 
- * to be used with pyopencl reduction to calculate in a single pass the minimum, 
- * maximum, sum, count, mean and standart deviation for an array.
- * 
- * So beside the reduction mechanisme from pyopencl, this algorithm implementes equations from
- * https://dbs.ifi.uni-heidelberg.de/files/Team/eschubert/publications/SSDBM18-covariance-authorcopy.pdf
- *
- * let A and B be 2 disjoint partition of all elements
- * 
- * Omega_A = sum_{i \in A}(omaga_i) The sum of all weights
- * V_A is the weighted sum of the signal over the partition
- * VV_A is the weighted sum of deviation squarred
- * 
- * With this the mean is V / Omega and the variance equals VV / omega.
- * 
- * Redction operator performs:
- * Omega_{AB} = Omega_A + Omega_B
- * V_{AB} = V_A + V_B
- * VV{AB} = VV_A + VV_B + (Omega_A*V_B-Omega_B*V_A)² / (Omega_A * Omega_B * Omega_{AB})
- *
- * To avoid any numerical degradation, the doubleword library is used to perform all floating point operations. 
- *
- */
-#include "for_eclipse.h"
-
-/* \brief read a value at given position and initialize the float8 for the reduction
- *
- * The float8 returned contains:
- * s0: minimum value
- * s1: maximum value
- * s2: Omega_h count number of valid pixels
- * s3: Omega_l error associated to the count
- * s4: V_h sum of signal
- * s5: V_l error associated to the sum of signal
- * s6: VVh variance*count
- * s7: VVl error associated to variance*count
- *
- */
-static inline float8 map_statistics(global float* data, int position)
-{
-    float value = data[position];
-    float8 result;
-
-    if (isfinite(value))
-    {
-        result = (float8)(value, value, 1.0f, 0.0f, value, 0.0f, 0.0f, 0.0f);
-        //                min     max   cnt_h cnt_l sum_h  sum_l M2_h  M2_l
-    }
-    else
-    {
-        result = (float8)(FLT_MAX, -FLT_MAX, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
-        //                min        max     cnt_h cnt_l sum_h sum_l M2_h  M2_l
-    }
-    return result;
-}
-
-/* \brief reduction function associated to the statistics.
- *
- * this is described in:
- * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
- *
- * The float8 used here contain contains:
- * s0: minimum value
- * s1: maximum value
- * s2: count number of valid pixels
- * s3: count (error associated to)
- * s4: sum of valid pixels
- * s5: sum (error associated to)
- * s6: M=variance*(count-1)
- * s7: M=variance*(count-1) (error associated to)
- *
- */
-
-static inline float8 reduce_statistics(float8 a, float8 b)
-{
-    float2 sum_a, sum_b, M_a, M_b, count_a, count_b;
-    float2 count, sum;
-    float2 M, delta, delta2, omega3;
-    //test on count
-    if (a.s2 == 0.0f)
-    {
-        return b;
-    }
-    else
-    {
-        count_a = (float2)(a.s2, a.s3);
-        sum_a = (float2)(a.s4, a.s5);
-        M_a = (float2)(a.s6, a.s7);
-    }
-    //test on count
-    if (b.s2 == 0.0f)
-    {
-        return a;
-    }
-    else
-    {
-        count_b = (float2)(b.s2, b.s3);
-        sum_b = (float2)(b.s4, b.s5);
-        M_b = (float2)(b.s6, b.s7);
-    }
-    // count = count_a + count_b
-    count = dw_plus_dw(count_a, count_b);
-    // sum = sum_a + sum_b
-    sum = dw_plus_dw(sum_a, sum_b);
-    
-    // M2 = M_a + M_b + (Omega_A*V_B-Omega_B*V_A)² / (Omega_A * Omega_B * Omega_{AB})
-    M =  dw_plus_dw(M_a, M_b);
-    delta = dw_plus_dw(dw_times_dw(count_b, sum_a),
-                      -dw_times_dw(count_a, sum_b));
-    delta2 = dw_times_dw(delta, delta);
-    omega3 = dw_times_dw(count, dw_times_dw(count_a, count_b));            
-    M = dw_plus_dw(M, dw_div_dw(delta2, omega3));
-                                    
-    float8 result = (float8)(min(a.s0, b.s0), max(a.s1, b.s1),
-                             count.s0,        count.s1,
-                             sum.s0,          sum.s1,
-                             M.s0,            M.s1);
-    return result;
-}
-
-/* \brief reduction function associated to the statistics without compensated arithmetics.
- *
- * this is described in:
- * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
- *
- * The float8 used here contain contains:
- * s0: minimum value
- * s1: maximum value
- * s2: count number of valid pixels (major)
- * s3: count number of valid pixels (minor)
- * s4: sum of valid pixels (major)
- * s5: sum of valid pixels (minor)
- * s6: variance*count (major)
- * s7: variance*count (minor)
- *
- */
-
-static inline float8 reduce_statistics_simple(float8 a, float8 b)
-{
-    float sum_a, sum_b, M_a, M_b, count_a, count_b;
-
-    //test on count
-    if (a.s2 == 0.0f)
-    {
-        return b;
-    }
-    else
-    {
-        count_a = a.s2;
-        sum_a = a.s4;
-        M_a = a.s6;
-    }
-    //test on count
-    if (b.s2 == 0.0f)
-    {
-        return a;
-    }
-    else
-    {
-        count_b = b.s2;
-        sum_b = b.s4;
-        M_b = b.s6;
-    }
-    float count = count_a + count_b;
-    float sum = sum_a + sum_b;
-    float delta = sum_a*count_b - sum_b*count_a;
-    float delta2 = delta * delta;
-    float M2 = M_a + M_b + delta2/(count*count_a*count_b);
-    float8 result = (float8)(min(a.s0, b.s0), max(a.s1, b.s1),
-                             count,        0.0f,
-                             sum,          0.0f,
-                             M2,           0.0f);
-    return result;
-}
-
-
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-
-// unpack a double in two floats such as the  sum of the two is the double number
-static inline float2 unpack_double(double inp){
-    float major = (float) inp;
-    float minor = (float) (inp - major);
-    return (float2)(major, minor);  
-}
-
-// pack two floats into a double
-static inline double pack_double(float major, float minor){
-    return (double)major + (double)minor;
-}
-
-/* \brief reduction function associated to the statistics using double precision arithmetics
- *
- * this is described in:
- * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
- *
- * The float8 used here contain contains:
- * s0: minimum value
- * s1: maximum value
- * s2: count number of valid pixels (major)
- * s3: count number of valid pixels (minor)
- * s4: sum of valid pixels (major)
- * s5: sum of valid pixels (minor)
- * s6: variance*count (major)
- * s7: variance*count (minor)
- *
- */
-
-static inline float8 reduce_statistics_double(float8 a, float8 b)
-{
-    double sum_a, sum_b, M_a, M_b, count_a, count_b;
-
-    //test on count
-    if (a.s2 == 0.0)
-    {
-        return b;
-    }
-    else
-    {
-        count_a = pack_double(a.s2, a.s3);
-        sum_a = pack_double(a.s4,a.s5);
-        M_a = pack_double(a.s6, a.s7);
-    }
-    //test on count
-    if (b.s2 == 0.0)
-    {
-        return a;
-    }
-    else
-    {
-        count_b = pack_double(b.s2, b.s3);
-        sum_b = pack_double(b.s4, b.s5);
-        M_b = pack_double(b.s6, b.s7);
-    }
-    double count = count_a + count_b;
-    double sum = sum_a + sum_b;
-    double delta = sum_a*count_b - sum_b*count_a;
-    double delta2 = delta * delta;
-    double M2 = M_a + M_b + delta2/(count*count_a*count_b);
-    float8 result = (float8)((float2)(min(a.s0, b.s0), max(a.s1, b.s1)),
-                                unpack_double(count),
-                                unpack_double( sum),
-                                unpack_double( M2));
-    return result;
-}
-
-#endif
-\ No newline at end of file