6 files changed, 1029 insertions, 3 deletions
diff --git a/silx/resources/opencl/addition.cl b/silx/resources/opencl/addition.cl
index 8ecfd4e..35d7996 100644
--- a/silx/resources/opencl/addition.cl
+++ b/silx/resources/opencl/addition.cl
@@ -27,10 +27,16 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  * OTHER DEALINGS IN THE SOFTWARE.
  */
-__kernel void addition(__global float* a, __global float* b, __global float* res, int N)
+
+
+// "Hello_world" kernel to test if OpenCL is actually working
+kernel void addition(global float* a,
+                     global float* b,
+                     global float* res,
+                     int N)
 {
-    unsigned int i = get_global_id(0);
+    int i = get_global_id(0);
     if( i<N ){
         res[i] = a[i] + b[i];
     }
-}
-\ No newline at end of file
+}
diff --git a/silx/resources/opencl/array_utils.cl b/silx/resources/opencl/array_utils.cl
new file mode 100644
index 0000000..60677dc
--- /dev/null
+++ b/silx/resources/opencl/array_utils.cl
@@ -0,0 +1,33 @@
+/**
+ *  2D Memcpy for float* arrays,
+ * replacing pyopencl "enqueue_copy" which does not return the expected result
+ * when dealing with rectangular buffers.
+ * ALL THE SIZES/OFFSETS ARE SPECIFIED IN PIXELS, NOT IN BYTES.
+ * In the (x, y) convention, x is the fast index (as in CUDA).
+ *
+ * :param dst: destination array
+ * :param src: source array
+ * :param dst_width: width of the dst array
+ * :param src_width: width of the src array
+ * :param dst_offset: tuple with the offset (x, y) in the dst array
+ * :param src_offset: tuple with the offset (x, y) in the src array
+ * :param transfer_shape: shape of the transfer array in the form (x, y)
+ *
+ */
+kernel void cpy2d(
+                    global float* dst,
+                    global float* src,
+                    int dst_width,
+                    int src_width,
+                    int2 dst_offset,
+                    int2 src_offset,
+                    int2 transfer_shape)
+{
+    int gidx = get_global_id(0),
+        gidy = get_global_id(1);
+    if (gidx < transfer_shape.x && gidy < transfer_shape.y)
+    {
+        dst[(dst_offset.y + gidy)*dst_width + (dst_offset.x + gidx)] = src[(src_offset.y + gidy)*src_width + (src_offset.x + gidx)];
+    }
+}
+
diff --git a/silx/resources/opencl/backproj.cl b/silx/resources/opencl/backproj.cl
new file mode 100644
index 0000000..6fadc2c
--- /dev/null
+++ b/silx/resources/opencl/backproj.cl
@@ -0,0 +1,485 @@
+/*
+ *   Project: silx: filtered backprojection
+ *
+ *   Copyright (C) 2016-2017 European Synchrotron Radiation Facility
+ *                           Grenoble, France
+ *
+ *   Principal authors: A. Mirone
+ *                      P. Paleo
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+/*******************************************************************************/
+/************************ GPU VERSION (with textures) **************************/
+/*******************************************************************************/
+
+
+kernel void backproj_kernel(
+    int num_proj,
+    int num_bins,
+    float axis_position,
+    global float *d_SLICE,
+    read_only image2d_t d_sino,
+    float gpu_offset_x,
+    float gpu_offset_y,
+    global float * d_cos_s,  // precalculated cos(theta[i])
+    global float* d_sin_s,   // precalculated sin(theta[i])
+    global float* d_axis_s,  // array of axis positions (n_projs)
+    local float* shared2)    // 768B of local mem
+{
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
+    const int tidx = get_local_id(0); //threadIdx.x;
+    const int bidx = get_group_id(0); //blockIdx.x;
+    const int tidy = get_local_id(1); //threadIdx.y;
+    const int bidy = get_group_id(1); //blockIdx.y;
+
+    //~ local float shared[768];
+    //~ float  * sh_sin  = shared;
+    //~ float  * sh_cos  = shared+256;
+    //~ float  * sh_axis = sh_cos+256;
+
+    local float sh_cos[256];
+    local float sh_sin[256];
+    local float sh_axis[256];
+
+    float pcos, psin;
+    float h0, h1, h2, h3;
+    const float apos_off_x= gpu_offset_x - axis_position ;
+    const float apos_off_y= gpu_offset_y - axis_position ;
+    float acorr05;
+    float res0 = 0, res1 = 0, res2 = 0, res3 = 0;
+
+    const float bx00 = (32 * bidx + 2 * tidx + 0 + apos_off_x  ) ;
+    const float by00 = (32 * bidy + 2 * tidy + 0 + apos_off_y  ) ;
+
+    int read=0;
+    for(int proj=0; proj<num_proj; proj++) {
+        if(proj>=read) {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            int ip = tidy*16+tidx;
+            if( read+ip < num_proj) {
+                sh_cos [ip] = d_cos_s[read+ip] ;
+                sh_sin [ip] = d_sin_s[read+ip] ;
+                sh_axis[ip] = d_axis_s[read+ip] ;
+            }
+            read=read+256; // 256=16*16 block size
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+        pcos = sh_cos[256-read + proj] ;
+        psin = sh_sin[256-read + proj] ;
+
+        acorr05 = sh_axis[256 - read + proj] ;
+
+        h0 = (acorr05 + bx00*pcos - by00*psin);
+        h1 = (acorr05 + (bx00+0)*pcos - (by00+1)*psin);
+        h2 = (acorr05 + (bx00+1)*pcos - (by00+0)*psin);
+        h3 = (acorr05 + (bx00+1)*pcos - (by00+1)*psin);
+
+        if(h0>=0 && h0<num_bins) res0 += read_imagef(d_sino, sampler, (float2) (h0 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h0 +0.5f,proj +0.5f);
+        if(h1>=0 && h1<num_bins) res1 += read_imagef(d_sino, sampler, (float2) (h1 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h1 +0.5f,proj +0.5f);
+        if(h2>=0 && h2<num_bins) res2 += read_imagef(d_sino, sampler, (float2) (h2 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h2 +0.5f,proj +0.5f);
+        if(h3>=0 && h3<num_bins) res3 += read_imagef(d_sino, sampler, (float2) (h3 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h3 +0.5f,proj +0.5f);
+    }
+    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 0] = res0;
+    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 0] = res1;
+    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 1] = res2;
+    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 1] = res3;
+}
+
+
+
+
+
+/*******************************************************************************/
+/********************* CPU VERSION (without textures) **************************/
+/*******************************************************************************/
+
+
+#define CLIP_MAX(x, N) (fmin(fmax(x, 0.0f), (N - 1.0f)))
+
+#define FLOORCEIL_x(x) {\
+    xm = (int) floor(x);\
+    xp = (int) ceil(x);\
+}
+
+#define ADJACENT_PIXELS_VALS(arr, Nx, y, xm, xp) ((float2) (arr[y*Nx+xm], arr[y*Nx+xp]))
+
+//Simple linear interpolator for working on the GPU
+static float linear_interpolation(float2 vals,
+                                  float x,
+                                  int xm,
+                                  int xp)
+{
+    if (xm == xp)
+        return vals.s0;
+    else 
+        return (vals.s0 * (xp - x)) + (vals.s1 * (x - xm));
+}
+
+/**
+ *
+ *  Same kernel as backproj_kernel, but targets the CPU (no texture)
+ *
+**/
+kernel void backproj_cpu_kernel(
+    int num_proj,
+    int num_bins,
+    float axis_position,
+    global float *d_SLICE,
+    global float* d_sino,
+    float gpu_offset_x,
+    float gpu_offset_y,
+    global float * d_cos_s, // precalculated cos(theta[i])
+    global float * d_sin_s, // precalculated sin(theta[i])
+    global float * d_axis_s, // array of axis positions (n_projs)
+    local float* shared2)     // 768B of local mem
+{
+    const int tidx = get_local_id(0); //threadIdx.x;
+    const int bidx = get_group_id(0); //blockIdx.x;
+    const int tidy = get_local_id(1); //threadIdx.y;
+    const int bidy = get_group_id(1); //blockIdx.y;
+
+    local float sh_cos[256];
+    local float sh_sin[256];
+    local float sh_axis[256];
+
+    float pcos, psin;
+    float h0, h1, h2, h3;
+    const float apos_off_x= gpu_offset_x - axis_position ;
+    const float apos_off_y= gpu_offset_y - axis_position ;
+    float acorr05;
+    float res0 = 0, res1 = 0, res2 = 0, res3 = 0;
+
+    const float bx00 = (32 * bidx + 2 * tidx + 0 + apos_off_x  ) ;
+    const float by00 = (32 * bidy + 2 * tidy + 0 + apos_off_y  ) ;
+
+    int read=0;
+    for(int proj=0; proj<num_proj; proj++) {
+        if(proj>=read) {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            int ip = tidy*16+tidx;
+            if( read+ip < num_proj) {
+                sh_cos [ip] = d_cos_s[read+ip] ;
+                sh_sin [ip] = d_sin_s[read+ip] ;
+                sh_axis[ip] = d_axis_s[read+ip] ;
+            }
+            read=read+256; // 256=16*16 block size
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+        pcos = sh_cos[256-read + proj] ;
+        psin = sh_sin[256-read + proj] ;
+
+        acorr05 = sh_axis[256 - read + proj] ;
+
+        h0 = (acorr05 + bx00*pcos - by00*psin);
+        h1 = (acorr05 + (bx00+0)*pcos - (by00+1)*psin);
+        h2 = (acorr05 + (bx00+1)*pcos - (by00+0)*psin);
+        h3 = (acorr05 + (bx00+1)*pcos - (by00+1)*psin);
+	
+	
+	float x;
+	int ym, xm, xp;
+	ym = proj;
+	float2 vals;
+
+	if(h0>=0 && h0<num_bins) { 
+	    x = CLIP_MAX(h0, num_bins); 
+	    FLOORCEIL_x(x);
+	    vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp);
+	    res0 += linear_interpolation(vals, x, xm, xp);
+	}
+        if(h1>=0 && h1<num_bins) {
+	    x = CLIP_MAX(h1, num_bins); 
+	    FLOORCEIL_x(x);
+	    vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp);
+	    res1 += linear_interpolation(vals, x, xm, xp);
+        }
+        if(h2>=0 && h2<num_bins) {
+	    x = CLIP_MAX(h2, num_bins); 
+	    FLOORCEIL_x(x);
+	    vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp);
+	    res2 += linear_interpolation(vals, x, xm, xp);
+        }
+        if(h3>=0 && h3<num_bins) {
+	    x = CLIP_MAX(h3, num_bins); 
+	    FLOORCEIL_x(x);
+	    vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp);
+	    res3 += linear_interpolation(vals, x, xm, xp);
+        }
+    }
+    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 0] = res0;
+    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 0] = res1;
+    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 1] = res2;
+    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 1] = res3;
+}
+
+
+
+
+
+
+
+/*******************************************************************************/
+/************************** OLD STUFF, for tinkering  **************************/
+/*******************************************************************************/
+
+
+
+
+/// arr(xm, ym), arr(xm, yp), arr(xp, yp), arr(xp, ym)
+//~ #define ADJACENT_PIXELS_VALS2(arr, Nx, xm, xp, ym, yp) ((float4) (arr[ym*Nx + xm], arr[yp*Nx + xm], arr[yp*Nx + xp], arr[ym*Nx + xp]))
+
+
+/**  xm, xp, ym, yp  **/
+//~ #define ADJACENT_PIXELS_COORDS(x, y) ((int4)((int) floor(x), (int) ceil(x), (int) floor(y), (int) ceil(y)))
+
+/**
+        (xm, ym)        (xp, ym)
+                 (x, y)
+        (xm, yp)        (xp, yp)
+**/
+/// arr(xm, ym), arr(xm, yp), arr(xp, yp), arr(xp, ym)
+//~ #define ADJACENT_PIXELS_VALS(arr, Nx, coords) ((float4) (arr[coords.s2*Nx + coords.s0], arr[coords.s3*Nx + coords.s0], arr[coords.s3*Nx + coords.s1], arr[coords.s2*Nx + coords.s1]))
+
+
+/**  xm, xp  **/
+//~ #define ADJACENT_PIXELS_COORDS2(x) ((int2)((int) floor(x), (int) ceil(x)))
+
+
+
+/*
+float bilinear_interpolation(
+    float x,  // x position in the image
+    float y,  // y position in the image
+    int Nx,   // image width 
+    int Ny,   // image height
+    int4 adj_coords, 
+    float4 adj_vals
+) {
+    float val;
+    float tol = 0.001f; // CHECKME
+    val = y - adj_coords.s2;
+    if ((x - adj_coords.s0) < tol && (y - adj_coords.s2) < tol) val = adj_vals.s0; 
+    else if ((adj_coords.s1 - x) < tol && (adj_coords.s3 - y) < tol) val = adj_vals.s2; 
+    else {
+        // Mirror - TODO: clamp ?
+        if (adj_coords.s0 < 0) adj_coords.s0 = 0;
+        if (adj_coords.s1 >= Nx) adj_coords.s1 = Nx - 1;
+        if (adj_coords.s2 < 0) adj_coords.s2 = 0;
+        if (adj_coords.s3 >= Ny) adj_coords.s3 = Ny -1;
+	if (adj_coords.s0 >= Nx) adj_coords.s0 = Nx - 1;
+	if (adj_coords.s2 >= Ny) adj_coords.s2 = Ny -1;
+        // Interp
+        val = adj_vals.s1*(adj_coords.s1-x)*(y-adj_coords.s2)
+                    + adj_vals.s2 *(x-adj_coords.s0)*(y-adj_coords.s2)
+                    + adj_vals.s0 *(adj_coords.s1-x)*(adj_coords.s3-y) 
+                    + adj_vals.s3 *(x-adj_coords.s0)*(adj_coords.s3-y); 
+
+    }
+    return val;
+}
+*/
+
+
+/*
+__kernel void backproj_cpu_kernel_good(
+    int num_proj,
+    int num_bins,
+    float axis_position,
+    __global float *d_SLICE,
+    __global float* d_sino,
+    float gpu_offset_x,
+    float gpu_offset_y,
+    __global float * d_cos_s, // precalculated cos(theta[i])
+    __global float * d_sin_s, // precalculated sin(theta[i])
+    __global float * d_axis_s, // array of axis positions (n_projs)
+    __local float* shared2)     // 768B of local mem
+{
+    const int tidx = get_local_id(0); //threadIdx.x;
+    const int bidx = get_group_id(0); //blockIdx.x;
+    const int tidy = get_local_id(1); //threadIdx.y;
+    const int bidy = get_group_id(1); //blockIdx.y;
+
+    //~ __local float shared[768];
+    //~ float  * sh_sin  = shared;
+    //~ float  * sh_cos  = shared+256;
+    //~ float  * sh_axis = sh_cos+256;
+
+    __local float sh_cos[256];
+    __local float sh_sin[256];
+    __local float sh_axis[256];
+
+    float pcos, psin;
+    float h0, h1, h2, h3;
+    const float apos_off_x= gpu_offset_x - axis_position ;
+    const float apos_off_y= gpu_offset_y - axis_position ;
+    float acorr05;
+    float res0 = 0, res1 = 0, res2 = 0, res3 = 0;
+
+    const float bx00 = (32 * bidx + 2 * tidx + 0 + apos_off_x  ) ;
+    const float by00 = (32 * bidy + 2 * tidy + 0 + apos_off_y  ) ;
+
+    int read=0;
+    for(int proj=0; proj<num_proj; proj++) {
+        if(proj>=read) {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            int ip = tidy*16+tidx;
+            if( read+ip < num_proj) {
+                sh_cos [ip] = d_cos_s[read+ip] ;
+                sh_sin [ip] = d_sin_s[read+ip] ;
+                sh_axis[ip] = d_axis_s[read+ip] ;
+            }
+            read=read+256; // 256=16*16 block size
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+        pcos = sh_cos[256-read + proj] ;
+        psin = sh_sin[256-read + proj] ;
+
+        acorr05 = sh_axis[256 - read + proj] ;
+
+        h0 = (acorr05 + bx00*pcos - by00*psin);
+        h1 = (acorr05 + (bx00+0)*pcos - (by00+1)*psin);
+        h2 = (acorr05 + (bx00+1)*pcos - (by00+0)*psin);
+        h3 = (acorr05 + (bx00+1)*pcos - (by00+1)*psin);
+	
+	
+	float x, val;
+	float tol = 0.001f; // CHECKME
+	float y = proj + 0.5f;
+	int ym = (int) floor(y);
+	int yp = (int) ceil(y);
+	int xm, xp;
+	
+	//
+	int i0, i1, j0, j1;
+	float d0, d1, x0, x1, y0, y1;
+	d0 = fmin(fmax(proj+0*0.5f, 0.0f), (num_proj - 1.0f));
+	x0 = floor(d0);
+	x1 = ceil(d0);
+	i0 = (int) x0;
+	i1 = (int) x1;
+
+	if(h0>=0 && h0<num_bins) { 
+            d1 = fmin(fmax(h0+0*0.5f, 0.0f), (num_bins - 1.0f));
+	    y0 = floor(d1);
+	    y1 = ceil(d1);
+	    j0 = (int) y0;
+	    j1 = (int) y1;
+	    
+	    if ((i0 == i1) && (j0 == j1))
+		val = d_sino[i0*num_bins + j0]; //self.data[i0, j0]
+	    else if (i0 == i1)
+		val = (d_sino[i0*num_bins + j0] * (y1 - d1)) + (d_sino[i0*num_bins + j1] * (d1 - y0)); // self.data[i0, j0], self.data[i0, j1]
+	    else if (j0 == j1)
+		val = (d_sino[i0*num_bins + j0] * (x1 - d0)) + (d_sino[i1*num_bins + j0] * (d0 - x0)); // i0, j0  ;  i1, j0
+	    else
+		val = (d_sino[i0*num_bins + j0] * (x1 - d0) * (y1 - d1))  // i0, j0
+		    + (d_sino[i1*num_bins + j0] * (d0 - x0) * (y1 - d1))  // i1, j0
+		    + (d_sino[i0*num_bins + j1] * (x1 - d0) * (d1 - y0))  // i0, j1
+		    + (d_sino[i1*num_bins + j1] * (d0 - x0) * (d1 - y0));  // i1, j1
+
+	    res0 += val;
+	}
+        if(h1>=0 && h1<num_bins) {
+	    //~ int4 coords = ADJACENT_PIXELS_COORDS(h1 +0.5f, proj +0.5f);
+	    //~ res1 += bilinear_interpolation(h1 +0.5f, proj +0.5f, num_bins, num_proj, coords, ADJACENT_PIXELS_VALS(d_sino, num_bins, coords)); //tex2D(texProjes,h1 +0.5f,proj +0.5f);
+            d1 = fmin(fmax(h1+0*0.5f, 0.0f), (num_bins - 1.0f));
+	    y0 = floor(d1);
+	    y1 = ceil(d1);
+	    j0 = (int) y0;
+	    j1 = (int) y1;
+	    
+	    if ((i0 == i1) && (j0 == j1))
+		val = d_sino[i0*num_bins + j0]; //self.data[i0, j0]
+	    else if (i0 == i1)
+		val = (d_sino[i0*num_bins + j0] * (y1 - d1)) + (d_sino[i0*num_bins + j1] * (d1 - y0)); // self.data[i0, j0], self.data[i0, j1]
+	    else if (j0 == j1)
+		val = (d_sino[i0*num_bins + j0] * (x1 - d0)) + (d_sino[i1*num_bins + j0] * (d0 - x0)); // i0, j0  ;  i1, j0
+	    else
+		val = (d_sino[i0*num_bins + j0] * (x1 - d0) * (y1 - d1))  // i0, j0
+		    + (d_sino[i1*num_bins + j0] * (d0 - x0) * (y1 - d1))  // i1, j0
+		    + (d_sino[i0*num_bins + j1] * (x1 - d0) * (d1 - y0))  // i0, j1
+		    + (d_sino[i1*num_bins + j1] * (d0 - x0) * (d1 - y0));  // i1, j1
+
+
+	    res1 += val;
+        }
+        if(h2>=0 && h2<num_bins) {
+	    //~ int4 coords = ADJACENT_PIXELS_COORDS(h2 +0.5f, proj +0.5f);
+	    //~ res2 += 0; //bilinear_interpolation(h2 +0.5f, proj +0.5f, num_bins, num_proj, coords, ADJACENT_PIXELS_VALS(d_sino, num_bins, coords)); //tex2D(texProjes,h2 +0.5f,proj +0.5f);
+            d1 = fmin(fmax(h2+0*0.5f, 0.0f), (num_bins - 1.0f));
+	    y0 = floor(d1);
+	    y1 = ceil(d1);
+	    j0 = (int) y0;
+	    j1 = (int) y1;
+	    
+	    if ((i0 == i1) && (j0 == j1))
+		val = d_sino[i0*num_bins + j0]; //self.data[i0, j0]
+	    else if (i0 == i1)
+		val = (d_sino[i0*num_bins + j0] * (y1 - d1)) + (d_sino[i0*num_bins + j1] * (d1 - y0)); // self.data[i0, j0], self.data[i0, j1]
+	    else if (j0 == j1)
+		val = (d_sino[i0*num_bins + j0] * (x1 - d0)) + (d_sino[i1*num_bins + j0] * (d0 - x0)); // i0, j0  ;  i1, j0
+	    else
+		val = (d_sino[i0*num_bins + j0] * (x1 - d0) * (y1 - d1))  // i0, j0
+		    + (d_sino[i1*num_bins + j0] * (d0 - x0) * (y1 - d1))  // i1, j0
+		    + (d_sino[i0*num_bins + j1] * (x1 - d0) * (d1 - y0))  // i0, j1
+		    + (d_sino[i1*num_bins + j1] * (d0 - x0) * (d1 - y0));  // i1, j1
+
+	    res2+= val;
+        }
+        if(h3>=0 && h3<num_bins) {
+	    //~ int4 coords = ADJACENT_PIXELS_COORDS(h3 +0.5f, proj +0.5f);
+	    //~ res3 += 0; //bilinear_interpolation(h3 +0.5f, proj +0.5f, num_bins, num_proj, coords, ADJACENT_PIXELS_VALS(d_sino, num_bins, coords)); //tex2D(texProjes,h3 +0.5f,proj +0.5f);
+            d1 = fmin(fmax(h3+0*0.5f, 0.0f), (num_bins - 1.0f));
+	    y0 = floor(d1);
+	    y1 = ceil(d1);
+	    j0 = (int) y0;
+	    j1 = (int) y1;
+	    
+	    if ((i0 == i1) && (j0 == j1))
+		val = d_sino[i0*num_bins + j0]; //self.data[i0, j0]
+	    else if (i0 == i1)
+		val = (d_sino[i0*num_bins + j0] * (y1 - d1)) + (d_sino[i0*num_bins + j1] * (d1 - y0)); // self.data[i0, j0], self.data[i0, j1]
+	    else if (j0 == j1)
+		val = (d_sino[i0*num_bins + j0] * (x1 - d0)) + (d_sino[i1*num_bins + j0] * (d0 - x0)); // i0, j0  ;  i1, j0
+	    else
+		val = (d_sino[i0*num_bins + j0] * (x1 - d0) * (y1 - d1))  // i0, j0
+		    + (d_sino[i1*num_bins + j0] * (d0 - x0) * (y1 - d1))  // i1, j0
+		    + (d_sino[i0*num_bins + j1] * (x1 - d0) * (d1 - y0))  // i0, j1
+		    + (d_sino[i1*num_bins + j1] * (d0 - x0) * (d1 - y0));  // i1, j1
+
+	    res3 += val;
+        }
+    }
+    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 0] = res0;
+    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 0] = res1;
+    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 1] = res2;
+    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 1] = res3;
+}
+*/
+
+
+
+
diff --git a/silx/resources/opencl/backproj_helper.cl b/silx/resources/opencl/backproj_helper.cl
new file mode 100644
index 0000000..b1590f8
--- /dev/null
+++ b/silx/resources/opencl/backproj_helper.cl
@@ -0,0 +1,68 @@
+/*
+ *   Project: silx: backprojection helper functions
+ *
+ *   Copyright (C) 2016-2017 European Synchrotron Radiation Facility
+ *                           Grenoble, France
+ *
+ *   Principal authors: P. Paleo
+ *                      J. Kieffer (kieffer@esrf.fr)
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+kernel void mult(   global float2* d_sino,
+                    global float2* d_filter,
+                    int num_bins,
+                    int num_projs)
+{
+    int gid0 = get_global_id(0);
+    int gid1 = get_global_id(1);
+    if (gid0 < num_bins && gid1 < num_projs)
+    {
+        // d_sino[gid1*num_bins+gid0] *= d_filter[gid0];
+        d_sino[gid1*num_bins+gid0].x *= d_filter[gid0].x;
+        d_sino[gid1*num_bins+gid0].y *= d_filter[gid0].x;
+      }
+}
+
+// copy only the real part of the valid data to the real array
+kernel void cpy2d_c2r(
+                      global float* d_sino,
+                      global float2* d_sino_complex,
+                      int num_bins,
+                      int num_projs,
+                      int fft_size)
+{
+    int gid0 = get_global_id(0);
+    int gid1 = get_global_id(1);
+    if (gid0 < num_bins && gid1 < num_projs) {
+        d_sino[gid1*num_bins+gid0] = d_sino_complex[gid1*fft_size+gid0].x;
+    }
+}
+
+
+
+
+
+
+
diff --git a/silx/resources/opencl/linalg.cl b/silx/resources/opencl/linalg.cl
new file mode 100644
index 0000000..82a76eb
--- /dev/null
+++ b/silx/resources/opencl/linalg.cl
@@ -0,0 +1,89 @@
+/*
+ *   Copyright (C) 2017 European Synchrotron Radiation Facility
+ *                           Grenoble, France
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+
+/**
+ *
+ * Compute the spatial gradient of an image.
+ *
+ * slice: input image
+ * slice_grad: output gradient
+ * sizeX: number of columns of the image
+ * sizeY: number of rows of the image
+ *
+ **/
+__kernel void kern_gradient2D(
+    __global float* slice,
+    __global float2* slice_grad,
+    int sizeX,
+    int sizeY)
+{
+
+    uint gidx = get_global_id(0);
+    uint gidy = get_global_id(1);
+    float val_x = 0, val_y = 0;
+
+    if (gidx < sizeX && gidy < sizeY) {
+        if (gidx == sizeX-1) val_y = 0;
+        else val_y = slice[(gidy)*sizeX+gidx+1] - slice[(gidy)*sizeX+gidx];
+        if (gidy == sizeY-1) val_x = 0;
+        else val_x = slice[(gidy+1)*sizeX+gidx] - slice[(gidy)*sizeX+gidx];
+
+        slice_grad[(gidy)*sizeX+gidx].x = val_x;
+        slice_grad[(gidy)*sizeX+gidx].y = val_y;
+    }
+}
+
+/**
+ *
+ * Compute the spatial divergence of an image gradient.
+ *
+ * slice_grad: input gradient-like image
+ * slice: output image
+ * sizeX: number of columns of the input
+ * sizeY: number of rows of the input
+ *
+ **/
+__kernel void kern_divergence2D(
+    __global float2* slice_grad,
+    __global float* slice,
+    int sizeX,
+    int sizeY)
+{
+    uint gidx = get_global_id(0);
+    uint gidy = get_global_id(1);
+    float val_x = 0, val_y = 0;
+
+    if (gidx < sizeX && gidy < sizeY) {
+        if (gidx == 0) val_y = slice_grad[(gidy)*sizeX+gidx].y;
+        else val_y = slice_grad[(gidy)*sizeX+gidx].y - slice_grad[(gidy)*sizeX+gidx-1].y;
+        if (gidy == 0) val_x = slice_grad[(gidy)*sizeX+gidx].x;
+        else val_x = slice_grad[(gidy)*sizeX+gidx].x - slice_grad[(gidy-1)*sizeX+gidx].x;
+        slice[(gidy)*sizeX+gidx] = val_x + val_y;
+    }
+}
+
+
+
+
diff --git a/silx/resources/opencl/proj.cl b/silx/resources/opencl/proj.cl
new file mode 100644
index 0000000..afc58ff
--- /dev/null
+++ b/silx/resources/opencl/proj.cl
@@ -0,0 +1,345 @@
+/*
+ *   Copyright (C) 2017-2017 European Synchrotron Radiation Facility
+ *                           Grenoble, France
+ *
+ * Based on the projector of PyHST2 - https://forge.epn-campus.eu/projects/pyhst2
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+
+/*******************************************************************************/
+/************************ GPU VERSION (with textures) **************************/
+/*******************************************************************************/
+
+
+kernel void  forward_kernel(
+        global float *d_Sino,
+        read_only image2d_t d_slice,
+        int dimslice,
+        int num_bins,
+        global float* angles_per_project ,
+        float axis_position,
+        global float *d_axis_corrections,
+        global int* d_beginPos    ,
+        global int* d_strideJoseph,
+        global int* d_strideLine  ,
+        int num_projections,
+        int  dimrecx,
+        int  dimrecy,
+        float cpu_offset_x,
+        float cpu_offset_y,
+        int josephnoclip,
+        int normalize)
+{
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
+    const int tidx = get_local_id(0);
+    const int bidx = get_group_id(0);
+    const int tidy = get_local_id(1);
+    const int bidy = get_group_id(1);
+    float angle;
+    float cos_angle,sin_angle ;
+
+    local float corrections[16];
+    local int beginPos[16*2];
+    local int strideJoseph[16*2];
+    local int strideLine[16*2];
+
+    // thread will use corrections[tidy]
+    // All are read by first warp
+    int offset, OFFSET;
+    switch(tidy) {
+    case 0:
+        corrections[ tidx ]= d_axis_corrections[ bidy*16+tidx];
+        break;
+    case 1:
+    case 2:
+        offset = 16*(tidy-1);
+        OFFSET = dimrecy*(tidy-1);
+        beginPos    [offset + tidx ]=  d_beginPos[ OFFSET+ bidy*16+tidx]  ;
+        break;
+    case 3:
+    case 4:
+        offset = 16*(tidy-3);
+        OFFSET = dimrecy*(tidy-3);
+        strideJoseph[offset + tidx ]=  d_strideJoseph[OFFSET + bidy*16+tidx]  ;
+        break;
+    case 5:
+    case 6:
+        offset = 16*(tidy-5);
+        OFFSET = dimrecy*(tidy-5);
+        strideLine[offset + tidx ]=  d_strideLine[OFFSET + bidy*16+tidx]  ;
+        break;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    angle = angles_per_project[ bidy*16+tidy ]   ;
+    cos_angle = cos(angle);
+    sin_angle = sin(angle);
+
+    if(fabs(cos_angle) > 0.70710678f ) {
+        if( cos_angle>0) {
+            cos_angle = cos(angle);
+            sin_angle = sin(angle);
+        }
+        else {
+            cos_angle = -cos(angle);
+            sin_angle = -sin(angle);
+        }
+    }
+    else {
+        if( sin_angle>0) {
+            cos_angle =  sin(angle);
+            sin_angle = -cos(angle);
+        }
+        else {
+            cos_angle = -sin(angle);
+            sin_angle =  cos(angle);
+        }
+    }
+    float res=0.0f;
+    float axis_corr = axis_position  + corrections[ tidy ];
+    float axis      = axis_position ;
+    float xpix = ( bidx*16+tidx )-cpu_offset_x;
+    float posx = axis*(1.0f-sin_angle/cos_angle ) +(xpix-(axis_corr) )/cos_angle ;
+
+    float shiftJ = sin_angle/cos_angle;
+    float x1 = min(-sin_angle/cos_angle ,0.f);
+    float x2 = max(-sin_angle/cos_angle ,0.f);
+
+    float  Area;
+    Area=1.0f/cos_angle;
+    int stlA, stlB , stlAJ, stlBJ ;
+    stlA=strideLine[16+tidy];
+    stlB=strideLine[tidy];
+    stlAJ=strideJoseph[16+tidy];
+    stlBJ=strideJoseph[tidy];
+
+    int beginA = beginPos[16+tidy ];
+    int beginB = beginPos[tidy ];
+    float add;
+    int l;
+
+    if(josephnoclip) {
+        for(int j=0; j<dimslice; j++) {  // j: Joseph
+            x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f;
+            x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f;
+            add = read_imagef(d_slice, sampler, (float2) (x1, x2)).x; // add = tex2D(texSlice, x1,x2);
+            res += add;
+            posx += shiftJ;
+        }
+    }
+    else {
+        for(int j=0; j<dimslice; j++) {  // j: Joseph
+            x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f;
+            x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f;
+            l = (x1>=0.0f )*(x1<(dimslice+2))*( x2>=0.0f)*( x2<(dimslice+2) ) ;
+            add = read_imagef(d_slice, sampler, (float2) (x1, x2)).x; // add = tex2D(texSlice, x1,x2);
+            res += add*l;
+            posx += shiftJ;
+        }
+    }
+
+    if((bidy*16 + tidy) < num_projections && (bidx*16 + tidx) < num_bins) {
+        res *= Area;
+        if (normalize)
+            res *= M_PI_F * 0.5f / num_projections;
+        d_Sino[dimrecx*(bidy*16 + tidy) + (bidx*16 + tidx)] = res;
+    }
+}
+
+
+
+/*******************************************************************************/
+/********************* CPU VERSION (without textures) **************************/
+/*******************************************************************************/
+
+
+kernel void  forward_kernel_cpu(
+        global float *d_Sino,
+        global float* d_slice,
+        int dimslice,
+        int num_bins,
+        global float* angles_per_project ,
+        float axis_position,
+        global float *d_axis_corrections,
+        global int* d_beginPos    ,
+        global int* d_strideJoseph,
+        global int* d_strideLine  ,
+        int num_projections,
+        int  dimrecx,
+        int  dimrecy,
+        float cpu_offset_x,
+        float cpu_offset_y,
+        int josephnoclip,
+        int normalize)
+{
+
+    const int tidx = get_local_id(0);
+    const int bidx = get_group_id(0);
+    const int tidy = get_local_id(1);
+    const int bidy = get_group_id(1);
+    float angle;
+    float cos_angle,sin_angle ;
+
+    local float corrections[16];
+    local int beginPos[16*2];
+    local int strideJoseph[16*2];
+    local int strideLine[16*2];
+
+    // thread will use corrections[tidy]
+    // All are read by first warp
+    int offset, OFFSET;
+    switch(tidy) {
+    case 0:
+        corrections[ tidx ]= d_axis_corrections[ bidy*16+tidx];
+        break;
+    case 1:
+    case 2:
+        offset = 16*(tidy-1);
+        OFFSET = dimrecy*(tidy-1);
+        beginPos    [offset + tidx ]=  d_beginPos[ OFFSET+ bidy*16+tidx]  ;
+        break;
+    case 3:
+    case 4:
+        offset = 16*(tidy-3);
+        OFFSET = dimrecy*(tidy-3);
+        strideJoseph[offset + tidx ]=  d_strideJoseph[OFFSET + bidy*16+tidx]  ;
+        break;
+    case 5:
+    case 6:
+        offset = 16*(tidy-5);
+        OFFSET = dimrecy*(tidy-5);
+        strideLine[offset + tidx ]=  d_strideLine[OFFSET + bidy*16+tidx]  ;
+        break;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    angle = angles_per_project[ bidy*16+tidy ]   ;
+    cos_angle = cos(angle);
+    sin_angle = sin(angle);
+
+    if(fabs(cos_angle) > 0.70710678f ) {
+        if( cos_angle>0) {
+            cos_angle = cos(angle);
+            sin_angle = sin(angle);
+        }
+        else {
+            cos_angle = -cos(angle);
+            sin_angle = -sin(angle);
+        }
+    }
+    else {
+        if( sin_angle>0) {
+            cos_angle =  sin(angle);
+            sin_angle = -cos(angle);
+        }
+        else {
+            cos_angle = -sin(angle);
+            sin_angle =  cos(angle);
+        }
+    }
+    float res=0.0f;
+    float axis_corr = axis_position  + corrections[ tidy ];
+    float axis      = axis_position ;
+    float xpix = ( bidx*16+tidx )-cpu_offset_x;
+    float posx = axis*(1.0f-sin_angle/cos_angle ) +(xpix-(axis_corr) )/cos_angle ;
+
+    float shiftJ = sin_angle/cos_angle;
+    float x1 = min(-sin_angle/cos_angle ,0.f);
+    float x2 = max(-sin_angle/cos_angle ,0.f);
+
+    float  Area;
+    Area=1.0f/cos_angle;
+    int stlA, stlB , stlAJ, stlBJ ;
+    stlA=strideLine[16+tidy];
+    stlB=strideLine[tidy];
+    stlAJ=strideJoseph[16+tidy];
+    stlBJ=strideJoseph[tidy];
+
+    int beginA = beginPos[16+tidy ];
+    int beginB = beginPos[tidy ];
+    int l;
+
+    int ym, yp, xm, xp;
+    float yc, xc;
+    float val;
+    if(josephnoclip) {
+        for(int j=0; j<dimslice; j++) {  // j: Joseph
+            x1 = beginA +(posx)*stlA + (j)*stlAJ+1.0f;
+            x2 = beginB +(posx)*stlB + (j)*stlBJ+1.0f;
+            /*
+              Bilinear interpolation
+            */
+            yc = fmin(fmax(x2, 0.0f), ((dimslice+2) - 1.0f)); // y_clipped
+            ym = (int) floor(yc); // y_minus
+            yp = (int) ceil(yc);  // y_plus
+
+            xc = fmin(fmax(x1, 0.0f), ((dimslice+2) - 1.0f));  // x_clipped
+            xm = (int) floor(xc); // x_minus
+            xp = (int) ceil(xc);  // x_plus
+
+            if ((ym == yp) && (xm == xp)) val = d_slice[ym*(dimslice+2) + xm];
+            else if (ym == yp) val = (d_slice[ym*(dimslice+2) + xm] * (xp - xc)) + (d_slice[ym*(dimslice+2) + xp] * (xc - xm));
+            else if (xm == xp) val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc)) + (d_slice[yp*(dimslice+2) + xm] * (yc - ym));
+            else val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc) * (xp - xc))
+                       + (d_slice[yp*(dimslice+2) + xm] * (yc - ym) * (xp - xc))
+                       + (d_slice[ym*(dimslice+2) + xp] * (yp - yc) * (xc - xm))
+                       + (d_slice[yp*(dimslice+2) + xp] * (yc - ym) * (xc - xm));
+            // ----------
+            res += val;
+            posx += shiftJ;
+        }
+    }
+    else {
+        for(int j=0; j<dimslice; j++) {  // j: Joseph
+            x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f;
+            x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f;
+            l = (x1>=0.0f )*(x1<(dimslice+2))*( x2>=0.0f)*( x2<(dimslice+2) ) ;
+            /*
+              Bilinear interpolation
+            */
+            yc = fmin(fmax(x2, 0.0f), ((dimslice+2) - 1.0f)); // y_clipped
+            ym = (int) floor(yc); // y_minus
+            yp = (int) ceil(yc);  // y_plus
+
+            xc = fmin(fmax(x1, 0.0f), ((dimslice+2) - 1.0f));  // x_clipped
+            xm = (int) floor(xc); // x_minus
+            xp = (int) ceil(xc);  // x_plus
+
+            if ((ym == yp) && (xm == xp)) val = d_slice[ym*(dimslice+2) + xm];
+            else if (ym == yp) val = (d_slice[ym*(dimslice+2) + xm] * (xp - xc)) + (d_slice[ym*(dimslice+2) + xp] * (xc - xm));
+            else if (xm == xp) val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc)) + (d_slice[yp*(dimslice+2) + xm] * (yc - ym));
+            else val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc) * (xp - xc))
+                       + (d_slice[yp*(dimslice+2) + xm] * (yc - ym) * (xp - xc))
+                       + (d_slice[ym*(dimslice+2) + xp] * (yp - yc) * (xc - xm))
+                       + (d_slice[yp*(dimslice+2) + xp] * (yc - ym) * (xc - xm));
+            // ----------
+            res += val*l;
+            posx += shiftJ;
+        }
+    }
+
+    if((bidy*16 + tidy) < num_projections && (bidx*16 + tidx) < num_bins) {
+        res *= Area;
+        if (normalize)
+            res *= M_PI_F * 0.5f / num_projections;
+        d_Sino[dimrecx*(bidy*16 + tidy) + (bidx*16 + tidx)] = res;
+    }
+}