diff options
Diffstat (limited to 'silx/resources/opencl')
-rw-r--r-- | silx/resources/opencl/addition.cl | 12 | ||||
-rw-r--r-- | silx/resources/opencl/array_utils.cl | 33 | ||||
-rw-r--r-- | silx/resources/opencl/backproj.cl | 485 | ||||
-rw-r--r-- | silx/resources/opencl/backproj_helper.cl | 68 | ||||
-rw-r--r-- | silx/resources/opencl/linalg.cl | 89 | ||||
-rw-r--r-- | silx/resources/opencl/proj.cl | 345 |
6 files changed, 1029 insertions, 3 deletions
diff --git a/silx/resources/opencl/addition.cl b/silx/resources/opencl/addition.cl index 8ecfd4e..35d7996 100644 --- a/silx/resources/opencl/addition.cl +++ b/silx/resources/opencl/addition.cl @@ -27,10 +27,16 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ -__kernel void addition(__global float* a, __global float* b, __global float* res, int N) + + +// "Hello_world" kernel to test if OpenCL is actually working +kernel void addition(global float* a, + global float* b, + global float* res, + int N) { - unsigned int i = get_global_id(0); + int i = get_global_id(0); if( i<N ){ res[i] = a[i] + b[i]; } -}
\ No newline at end of file +} diff --git a/silx/resources/opencl/array_utils.cl b/silx/resources/opencl/array_utils.cl new file mode 100644 index 0000000..60677dc --- /dev/null +++ b/silx/resources/opencl/array_utils.cl @@ -0,0 +1,33 @@ +/** + * 2D Memcpy for float* arrays, + * replacing pyopencl "enqueue_copy" which does not return the expected result + * when dealing with rectangular buffers. + * ALL THE SIZES/OFFSETS ARE SPECIFIED IN PIXELS, NOT IN BYTES. + * In the (x, y) convention, x is the fast index (as in CUDA). + * + * :param dst: destination array + * :param src: source array + * :param dst_width: width of the dst array + * :param src_width: width of the src array + * :param dst_offset: tuple with the offset (x, y) in the dst array + * :param src_offset: tuple with the offset (x, y) in the src array + * :param transfer_shape: shape of the transfer array in the form (x, y) + * + */ +kernel void cpy2d( + global float* dst, + global float* src, + int dst_width, + int src_width, + int2 dst_offset, + int2 src_offset, + int2 transfer_shape) +{ + int gidx = get_global_id(0), + gidy = get_global_id(1); + if (gidx < transfer_shape.x && gidy < transfer_shape.y) + { + dst[(dst_offset.y + gidy)*dst_width + (dst_offset.x + gidx)] = src[(src_offset.y + gidy)*src_width + (src_offset.x + gidx)]; + } +} + diff --git a/silx/resources/opencl/backproj.cl b/silx/resources/opencl/backproj.cl new file mode 100644 index 0000000..6fadc2c --- /dev/null +++ b/silx/resources/opencl/backproj.cl @@ -0,0 +1,485 @@ +/* + * Project: silx: filtered backprojection + * + * Copyright (C) 2016-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: A. Mirone + * P. Paleo + * + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + + +/*******************************************************************************/ +/************************ GPU VERSION (with textures) **************************/ +/*******************************************************************************/ + + +kernel void backproj_kernel( + int num_proj, + int num_bins, + float axis_position, + global float *d_SLICE, + read_only image2d_t d_sino, + float gpu_offset_x, + float gpu_offset_y, + global float * d_cos_s, // precalculated cos(theta[i]) + global float* d_sin_s, // precalculated sin(theta[i]) + global float* d_axis_s, // array of axis positions (n_projs) + local float* shared2) // 768B of local mem +{ + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; + const int tidx = get_local_id(0); //threadIdx.x; + const int bidx = get_group_id(0); //blockIdx.x; + const int tidy = get_local_id(1); //threadIdx.y; + const int bidy = get_group_id(1); //blockIdx.y; + + //~ local float shared[768]; + //~ float * sh_sin = shared; + //~ float * sh_cos = shared+256; + //~ float * sh_axis = sh_cos+256; + + local float sh_cos[256]; + local float sh_sin[256]; + local float sh_axis[256]; + + float pcos, psin; + float h0, h1, h2, h3; + const float apos_off_x= gpu_offset_x - axis_position ; + const float apos_off_y= gpu_offset_y - axis_position ; + float acorr05; + float res0 = 0, res1 = 0, res2 = 0, res3 = 0; + + const float bx00 = (32 * bidx + 2 * tidx + 0 + apos_off_x ) ; + const float by00 = (32 * bidy + 2 * tidy + 0 + apos_off_y ) ; + + int read=0; + for(int proj=0; proj<num_proj; proj++) { + if(proj>=read) { + barrier(CLK_LOCAL_MEM_FENCE); + int ip = tidy*16+tidx; + if( read+ip < num_proj) { + sh_cos [ip] = d_cos_s[read+ip] ; + sh_sin [ip] = d_sin_s[read+ip] ; + sh_axis[ip] = d_axis_s[read+ip] ; + } + read=read+256; // 256=16*16 block size + barrier(CLK_LOCAL_MEM_FENCE); + } + pcos = sh_cos[256-read + proj] ; + psin = sh_sin[256-read + proj] ; + + acorr05 = sh_axis[256 - read + proj] ; + + h0 = (acorr05 + bx00*pcos - by00*psin); + h1 = (acorr05 + (bx00+0)*pcos - (by00+1)*psin); + h2 = (acorr05 + (bx00+1)*pcos - (by00+0)*psin); + h3 = (acorr05 + (bx00+1)*pcos - (by00+1)*psin); + + if(h0>=0 && h0<num_bins) res0 += read_imagef(d_sino, sampler, (float2) (h0 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h0 +0.5f,proj +0.5f); + if(h1>=0 && h1<num_bins) res1 += read_imagef(d_sino, sampler, (float2) (h1 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h1 +0.5f,proj +0.5f); + if(h2>=0 && h2<num_bins) res2 += read_imagef(d_sino, sampler, (float2) (h2 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h2 +0.5f,proj +0.5f); + if(h3>=0 && h3<num_bins) res3 += read_imagef(d_sino, sampler, (float2) (h3 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h3 +0.5f,proj +0.5f); + } + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 0] = res0; + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 0] = res1; + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 1] = res2; + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 1] = res3; +} + + + + + +/*******************************************************************************/ +/********************* CPU VERSION (without textures) **************************/ +/*******************************************************************************/ + + +#define CLIP_MAX(x, N) (fmin(fmax(x, 0.0f), (N - 1.0f))) + +#define FLOORCEIL_x(x) {\ + xm = (int) floor(x);\ + xp = (int) ceil(x);\ +} + +#define ADJACENT_PIXELS_VALS(arr, Nx, y, xm, xp) ((float2) (arr[y*Nx+xm], arr[y*Nx+xp])) + +//Simple linear interpolator for working on the GPU +static float linear_interpolation(float2 vals, + float x, + int xm, + int xp) +{ + if (xm == xp) + return vals.s0; + else + return (vals.s0 * (xp - x)) + (vals.s1 * (x - xm)); +} + +/** + * + * Same kernel as backproj_kernel, but targets the CPU (no texture) + * +**/ +kernel void backproj_cpu_kernel( + int num_proj, + int num_bins, + float axis_position, + global float *d_SLICE, + global float* d_sino, + float gpu_offset_x, + float gpu_offset_y, + global float * d_cos_s, // precalculated cos(theta[i]) + global float * d_sin_s, // precalculated sin(theta[i]) + global float * d_axis_s, // array of axis positions (n_projs) + local float* shared2) // 768B of local mem +{ + const int tidx = get_local_id(0); //threadIdx.x; + const int bidx = get_group_id(0); //blockIdx.x; + const int tidy = get_local_id(1); //threadIdx.y; + const int bidy = get_group_id(1); //blockIdx.y; + + local float sh_cos[256]; + local float sh_sin[256]; + local float sh_axis[256]; + + float pcos, psin; + float h0, h1, h2, h3; + const float apos_off_x= gpu_offset_x - axis_position ; + const float apos_off_y= gpu_offset_y - axis_position ; + float acorr05; + float res0 = 0, res1 = 0, res2 = 0, res3 = 0; + + const float bx00 = (32 * bidx + 2 * tidx + 0 + apos_off_x ) ; + const float by00 = (32 * bidy + 2 * tidy + 0 + apos_off_y ) ; + + int read=0; + for(int proj=0; proj<num_proj; proj++) { + if(proj>=read) { + barrier(CLK_LOCAL_MEM_FENCE); + int ip = tidy*16+tidx; + if( read+ip < num_proj) { + sh_cos [ip] = d_cos_s[read+ip] ; + sh_sin [ip] = d_sin_s[read+ip] ; + sh_axis[ip] = d_axis_s[read+ip] ; + } + read=read+256; // 256=16*16 block size + barrier(CLK_LOCAL_MEM_FENCE); + } + pcos = sh_cos[256-read + proj] ; + psin = sh_sin[256-read + proj] ; + + acorr05 = sh_axis[256 - read + proj] ; + + h0 = (acorr05 + bx00*pcos - by00*psin); + h1 = (acorr05 + (bx00+0)*pcos - (by00+1)*psin); + h2 = (acorr05 + (bx00+1)*pcos - (by00+0)*psin); + h3 = (acorr05 + (bx00+1)*pcos - (by00+1)*psin); + + + float x; + int ym, xm, xp; + ym = proj; + float2 vals; + + if(h0>=0 && h0<num_bins) { + x = CLIP_MAX(h0, num_bins); + FLOORCEIL_x(x); + vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp); + res0 += linear_interpolation(vals, x, xm, xp); + } + if(h1>=0 && h1<num_bins) { + x = CLIP_MAX(h1, num_bins); + FLOORCEIL_x(x); + vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp); + res1 += linear_interpolation(vals, x, xm, xp); + } + if(h2>=0 && h2<num_bins) { + x = CLIP_MAX(h2, num_bins); + FLOORCEIL_x(x); + vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp); + res2 += linear_interpolation(vals, x, xm, xp); + } + if(h3>=0 && h3<num_bins) { + x = CLIP_MAX(h3, num_bins); + FLOORCEIL_x(x); + vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp); + res3 += linear_interpolation(vals, x, xm, xp); + } + } + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 0] = res0; + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 0] = res1; + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 1] = res2; + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 1] = res3; +} + + + + + + + +/*******************************************************************************/ +/************************** OLD STUFF, for tinkering **************************/ +/*******************************************************************************/ + + + + +/// arr(xm, ym), arr(xm, yp), arr(xp, yp), arr(xp, ym) +//~ #define ADJACENT_PIXELS_VALS2(arr, Nx, xm, xp, ym, yp) ((float4) (arr[ym*Nx + xm], arr[yp*Nx + xm], arr[yp*Nx + xp], arr[ym*Nx + xp])) + + +/** xm, xp, ym, yp **/ +//~ #define ADJACENT_PIXELS_COORDS(x, y) ((int4)((int) floor(x), (int) ceil(x), (int) floor(y), (int) ceil(y))) + +/** + (xm, ym) (xp, ym) + (x, y) + (xm, yp) (xp, yp) +**/ +/// arr(xm, ym), arr(xm, yp), arr(xp, yp), arr(xp, ym) +//~ #define ADJACENT_PIXELS_VALS(arr, Nx, coords) ((float4) (arr[coords.s2*Nx + coords.s0], arr[coords.s3*Nx + coords.s0], arr[coords.s3*Nx + coords.s1], arr[coords.s2*Nx + coords.s1])) + + +/** xm, xp **/ +//~ #define ADJACENT_PIXELS_COORDS2(x) ((int2)((int) floor(x), (int) ceil(x))) + + + +/* +float bilinear_interpolation( + float x, // x position in the image + float y, // y position in the image + int Nx, // image width + int Ny, // image height + int4 adj_coords, + float4 adj_vals +) { + float val; + float tol = 0.001f; // CHECKME + val = y - adj_coords.s2; + if ((x - adj_coords.s0) < tol && (y - adj_coords.s2) < tol) val = adj_vals.s0; + else if ((adj_coords.s1 - x) < tol && (adj_coords.s3 - y) < tol) val = adj_vals.s2; + else { + // Mirror - TODO: clamp ? + if (adj_coords.s0 < 0) adj_coords.s0 = 0; + if (adj_coords.s1 >= Nx) adj_coords.s1 = Nx - 1; + if (adj_coords.s2 < 0) adj_coords.s2 = 0; + if (adj_coords.s3 >= Ny) adj_coords.s3 = Ny -1; + if (adj_coords.s0 >= Nx) adj_coords.s0 = Nx - 1; + if (adj_coords.s2 >= Ny) adj_coords.s2 = Ny -1; + // Interp + val = adj_vals.s1*(adj_coords.s1-x)*(y-adj_coords.s2) + + adj_vals.s2 *(x-adj_coords.s0)*(y-adj_coords.s2) + + adj_vals.s0 *(adj_coords.s1-x)*(adj_coords.s3-y) + + adj_vals.s3 *(x-adj_coords.s0)*(adj_coords.s3-y); + + } + return val; +} +*/ + + +/* +__kernel void backproj_cpu_kernel_good( + int num_proj, + int num_bins, + float axis_position, + __global float *d_SLICE, + __global float* d_sino, + float gpu_offset_x, + float gpu_offset_y, + __global float * d_cos_s, // precalculated cos(theta[i]) + __global float * d_sin_s, // precalculated sin(theta[i]) + __global float * d_axis_s, // array of axis positions (n_projs) + __local float* shared2) // 768B of local mem +{ + const int tidx = get_local_id(0); //threadIdx.x; + const int bidx = get_group_id(0); //blockIdx.x; + const int tidy = get_local_id(1); //threadIdx.y; + const int bidy = get_group_id(1); //blockIdx.y; + + //~ __local float shared[768]; + //~ float * sh_sin = shared; + //~ float * sh_cos = shared+256; + //~ float * sh_axis = sh_cos+256; + + __local float sh_cos[256]; + __local float sh_sin[256]; + __local float sh_axis[256]; + + float pcos, psin; + float h0, h1, h2, h3; + const float apos_off_x= gpu_offset_x - axis_position ; + const float apos_off_y= gpu_offset_y - axis_position ; + float acorr05; + float res0 = 0, res1 = 0, res2 = 0, res3 = 0; + + const float bx00 = (32 * bidx + 2 * tidx + 0 + apos_off_x ) ; + const float by00 = (32 * bidy + 2 * tidy + 0 + apos_off_y ) ; + + int read=0; + for(int proj=0; proj<num_proj; proj++) { + if(proj>=read) { + barrier(CLK_LOCAL_MEM_FENCE); + int ip = tidy*16+tidx; + if( read+ip < num_proj) { + sh_cos [ip] = d_cos_s[read+ip] ; + sh_sin [ip] = d_sin_s[read+ip] ; + sh_axis[ip] = d_axis_s[read+ip] ; + } + read=read+256; // 256=16*16 block size + barrier(CLK_LOCAL_MEM_FENCE); + } + pcos = sh_cos[256-read + proj] ; + psin = sh_sin[256-read + proj] ; + + acorr05 = sh_axis[256 - read + proj] ; + + h0 = (acorr05 + bx00*pcos - by00*psin); + h1 = (acorr05 + (bx00+0)*pcos - (by00+1)*psin); + h2 = (acorr05 + (bx00+1)*pcos - (by00+0)*psin); + h3 = (acorr05 + (bx00+1)*pcos - (by00+1)*psin); + + + float x, val; + float tol = 0.001f; // CHECKME + float y = proj + 0.5f; + int ym = (int) floor(y); + int yp = (int) ceil(y); + int xm, xp; + + // + int i0, i1, j0, j1; + float d0, d1, x0, x1, y0, y1; + d0 = fmin(fmax(proj+0*0.5f, 0.0f), (num_proj - 1.0f)); + x0 = floor(d0); + x1 = ceil(d0); + i0 = (int) x0; + i1 = (int) x1; + + if(h0>=0 && h0<num_bins) { + d1 = fmin(fmax(h0+0*0.5f, 0.0f), (num_bins - 1.0f)); + y0 = floor(d1); + y1 = ceil(d1); + j0 = (int) y0; + j1 = (int) y1; + + if ((i0 == i1) && (j0 == j1)) + val = d_sino[i0*num_bins + j0]; //self.data[i0, j0] + else if (i0 == i1) + val = (d_sino[i0*num_bins + j0] * (y1 - d1)) + (d_sino[i0*num_bins + j1] * (d1 - y0)); // self.data[i0, j0], self.data[i0, j1] + else if (j0 == j1) + val = (d_sino[i0*num_bins + j0] * (x1 - d0)) + (d_sino[i1*num_bins + j0] * (d0 - x0)); // i0, j0 ; i1, j0 + else + val = (d_sino[i0*num_bins + j0] * (x1 - d0) * (y1 - d1)) // i0, j0 + + (d_sino[i1*num_bins + j0] * (d0 - x0) * (y1 - d1)) // i1, j0 + + (d_sino[i0*num_bins + j1] * (x1 - d0) * (d1 - y0)) // i0, j1 + + (d_sino[i1*num_bins + j1] * (d0 - x0) * (d1 - y0)); // i1, j1 + + res0 += val; + } + if(h1>=0 && h1<num_bins) { + //~ int4 coords = ADJACENT_PIXELS_COORDS(h1 +0.5f, proj +0.5f); + //~ res1 += bilinear_interpolation(h1 +0.5f, proj +0.5f, num_bins, num_proj, coords, ADJACENT_PIXELS_VALS(d_sino, num_bins, coords)); //tex2D(texProjes,h1 +0.5f,proj +0.5f); + d1 = fmin(fmax(h1+0*0.5f, 0.0f), (num_bins - 1.0f)); + y0 = floor(d1); + y1 = ceil(d1); + j0 = (int) y0; + j1 = (int) y1; + + if ((i0 == i1) && (j0 == j1)) + val = d_sino[i0*num_bins + j0]; //self.data[i0, j0] + else if (i0 == i1) + val = (d_sino[i0*num_bins + j0] * (y1 - d1)) + (d_sino[i0*num_bins + j1] * (d1 - y0)); // self.data[i0, j0], self.data[i0, j1] + else if (j0 == j1) + val = (d_sino[i0*num_bins + j0] * (x1 - d0)) + (d_sino[i1*num_bins + j0] * (d0 - x0)); // i0, j0 ; i1, j0 + else + val = (d_sino[i0*num_bins + j0] * (x1 - d0) * (y1 - d1)) // i0, j0 + + (d_sino[i1*num_bins + j0] * (d0 - x0) * (y1 - d1)) // i1, j0 + + (d_sino[i0*num_bins + j1] * (x1 - d0) * (d1 - y0)) // i0, j1 + + (d_sino[i1*num_bins + j1] * (d0 - x0) * (d1 - y0)); // i1, j1 + + + res1 += val; + } + if(h2>=0 && h2<num_bins) { + //~ int4 coords = ADJACENT_PIXELS_COORDS(h2 +0.5f, proj +0.5f); + //~ res2 += 0; //bilinear_interpolation(h2 +0.5f, proj +0.5f, num_bins, num_proj, coords, ADJACENT_PIXELS_VALS(d_sino, num_bins, coords)); //tex2D(texProjes,h2 +0.5f,proj +0.5f); + d1 = fmin(fmax(h2+0*0.5f, 0.0f), (num_bins - 1.0f)); + y0 = floor(d1); + y1 = ceil(d1); + j0 = (int) y0; + j1 = (int) y1; + + if ((i0 == i1) && (j0 == j1)) + val = d_sino[i0*num_bins + j0]; //self.data[i0, j0] + else if (i0 == i1) + val = (d_sino[i0*num_bins + j0] * (y1 - d1)) + (d_sino[i0*num_bins + j1] * (d1 - y0)); // self.data[i0, j0], self.data[i0, j1] + else if (j0 == j1) + val = (d_sino[i0*num_bins + j0] * (x1 - d0)) + (d_sino[i1*num_bins + j0] * (d0 - x0)); // i0, j0 ; i1, j0 + else + val = (d_sino[i0*num_bins + j0] * (x1 - d0) * (y1 - d1)) // i0, j0 + + (d_sino[i1*num_bins + j0] * (d0 - x0) * (y1 - d1)) // i1, j0 + + (d_sino[i0*num_bins + j1] * (x1 - d0) * (d1 - y0)) // i0, j1 + + (d_sino[i1*num_bins + j1] * (d0 - x0) * (d1 - y0)); // i1, j1 + + res2+= val; + } + if(h3>=0 && h3<num_bins) { + //~ int4 coords = ADJACENT_PIXELS_COORDS(h3 +0.5f, proj +0.5f); + //~ res3 += 0; //bilinear_interpolation(h3 +0.5f, proj +0.5f, num_bins, num_proj, coords, ADJACENT_PIXELS_VALS(d_sino, num_bins, coords)); //tex2D(texProjes,h3 +0.5f,proj +0.5f); + d1 = fmin(fmax(h3+0*0.5f, 0.0f), (num_bins - 1.0f)); + y0 = floor(d1); + y1 = ceil(d1); + j0 = (int) y0; + j1 = (int) y1; + + if ((i0 == i1) && (j0 == j1)) + val = d_sino[i0*num_bins + j0]; //self.data[i0, j0] + else if (i0 == i1) + val = (d_sino[i0*num_bins + j0] * (y1 - d1)) + (d_sino[i0*num_bins + j1] * (d1 - y0)); // self.data[i0, j0], self.data[i0, j1] + else if (j0 == j1) + val = (d_sino[i0*num_bins + j0] * (x1 - d0)) + (d_sino[i1*num_bins + j0] * (d0 - x0)); // i0, j0 ; i1, j0 + else + val = (d_sino[i0*num_bins + j0] * (x1 - d0) * (y1 - d1)) // i0, j0 + + (d_sino[i1*num_bins + j0] * (d0 - x0) * (y1 - d1)) // i1, j0 + + (d_sino[i0*num_bins + j1] * (x1 - d0) * (d1 - y0)) // i0, j1 + + (d_sino[i1*num_bins + j1] * (d0 - x0) * (d1 - y0)); // i1, j1 + + res3 += val; + } + } + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 0] = res0; + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 0] = res1; + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 1] = res2; + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 1] = res3; +} +*/ + + + + diff --git a/silx/resources/opencl/backproj_helper.cl b/silx/resources/opencl/backproj_helper.cl new file mode 100644 index 0000000..b1590f8 --- /dev/null +++ b/silx/resources/opencl/backproj_helper.cl @@ -0,0 +1,68 @@ +/* + * Project: silx: backprojection helper functions + * + * Copyright (C) 2016-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: P. Paleo + * J. Kieffer (kieffer@esrf.fr) + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + + +kernel void mult( global float2* d_sino, + global float2* d_filter, + int num_bins, + int num_projs) +{ + int gid0 = get_global_id(0); + int gid1 = get_global_id(1); + if (gid0 < num_bins && gid1 < num_projs) + { + // d_sino[gid1*num_bins+gid0] *= d_filter[gid0]; + d_sino[gid1*num_bins+gid0].x *= d_filter[gid0].x; + d_sino[gid1*num_bins+gid0].y *= d_filter[gid0].x; + } +} + +// copy only the real part of the valid data to the real array +kernel void cpy2d_c2r( + global float* d_sino, + global float2* d_sino_complex, + int num_bins, + int num_projs, + int fft_size) +{ + int gid0 = get_global_id(0); + int gid1 = get_global_id(1); + if (gid0 < num_bins && gid1 < num_projs) { + d_sino[gid1*num_bins+gid0] = d_sino_complex[gid1*fft_size+gid0].x; + } +} + + + + + + + diff --git a/silx/resources/opencl/linalg.cl b/silx/resources/opencl/linalg.cl new file mode 100644 index 0000000..82a76eb --- /dev/null +++ b/silx/resources/opencl/linalg.cl @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +/** + * + * Compute the spatial gradient of an image. + * + * slice: input image + * slice_grad: output gradient + * sizeX: number of columns of the image + * sizeY: number of rows of the image + * + **/ +__kernel void kern_gradient2D( + __global float* slice, + __global float2* slice_grad, + int sizeX, + int sizeY) +{ + + uint gidx = get_global_id(0); + uint gidy = get_global_id(1); + float val_x = 0, val_y = 0; + + if (gidx < sizeX && gidy < sizeY) { + if (gidx == sizeX-1) val_y = 0; + else val_y = slice[(gidy)*sizeX+gidx+1] - slice[(gidy)*sizeX+gidx]; + if (gidy == sizeY-1) val_x = 0; + else val_x = slice[(gidy+1)*sizeX+gidx] - slice[(gidy)*sizeX+gidx]; + + slice_grad[(gidy)*sizeX+gidx].x = val_x; + slice_grad[(gidy)*sizeX+gidx].y = val_y; + } +} + +/** + * + * Compute the spatial divergence of an image gradient. + * + * slice_grad: input gradient-like image + * slice: output image + * sizeX: number of columns of the input + * sizeY: number of rows of the input + * + **/ +__kernel void kern_divergence2D( + __global float2* slice_grad, + __global float* slice, + int sizeX, + int sizeY) +{ + uint gidx = get_global_id(0); + uint gidy = get_global_id(1); + float val_x = 0, val_y = 0; + + if (gidx < sizeX && gidy < sizeY) { + if (gidx == 0) val_y = slice_grad[(gidy)*sizeX+gidx].y; + else val_y = slice_grad[(gidy)*sizeX+gidx].y - slice_grad[(gidy)*sizeX+gidx-1].y; + if (gidy == 0) val_x = slice_grad[(gidy)*sizeX+gidx].x; + else val_x = slice_grad[(gidy)*sizeX+gidx].x - slice_grad[(gidy-1)*sizeX+gidx].x; + slice[(gidy)*sizeX+gidx] = val_x + val_y; + } +} + + + + diff --git a/silx/resources/opencl/proj.cl b/silx/resources/opencl/proj.cl new file mode 100644 index 0000000..afc58ff --- /dev/null +++ b/silx/resources/opencl/proj.cl @@ -0,0 +1,345 @@ +/* + * Copyright (C) 2017-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Based on the projector of PyHST2 - https://forge.epn-campus.eu/projects/pyhst2 + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +/*******************************************************************************/ +/************************ GPU VERSION (with textures) **************************/ +/*******************************************************************************/ + + +kernel void forward_kernel( + global float *d_Sino, + read_only image2d_t d_slice, + int dimslice, + int num_bins, + global float* angles_per_project , + float axis_position, + global float *d_axis_corrections, + global int* d_beginPos , + global int* d_strideJoseph, + global int* d_strideLine , + int num_projections, + int dimrecx, + int dimrecy, + float cpu_offset_x, + float cpu_offset_y, + int josephnoclip, + int normalize) +{ + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; + const int tidx = get_local_id(0); + const int bidx = get_group_id(0); + const int tidy = get_local_id(1); + const int bidy = get_group_id(1); + float angle; + float cos_angle,sin_angle ; + + local float corrections[16]; + local int beginPos[16*2]; + local int strideJoseph[16*2]; + local int strideLine[16*2]; + + // thread will use corrections[tidy] + // All are read by first warp + int offset, OFFSET; + switch(tidy) { + case 0: + corrections[ tidx ]= d_axis_corrections[ bidy*16+tidx]; + break; + case 1: + case 2: + offset = 16*(tidy-1); + OFFSET = dimrecy*(tidy-1); + beginPos [offset + tidx ]= d_beginPos[ OFFSET+ bidy*16+tidx] ; + break; + case 3: + case 4: + offset = 16*(tidy-3); + OFFSET = dimrecy*(tidy-3); + strideJoseph[offset + tidx ]= d_strideJoseph[OFFSET + bidy*16+tidx] ; + break; + case 5: + case 6: + offset = 16*(tidy-5); + OFFSET = dimrecy*(tidy-5); + strideLine[offset + tidx ]= d_strideLine[OFFSET + bidy*16+tidx] ; + break; + } + barrier(CLK_LOCAL_MEM_FENCE); + + angle = angles_per_project[ bidy*16+tidy ] ; + cos_angle = cos(angle); + sin_angle = sin(angle); + + if(fabs(cos_angle) > 0.70710678f ) { + if( cos_angle>0) { + cos_angle = cos(angle); + sin_angle = sin(angle); + } + else { + cos_angle = -cos(angle); + sin_angle = -sin(angle); + } + } + else { + if( sin_angle>0) { + cos_angle = sin(angle); + sin_angle = -cos(angle); + } + else { + cos_angle = -sin(angle); + sin_angle = cos(angle); + } + } + float res=0.0f; + float axis_corr = axis_position + corrections[ tidy ]; + float axis = axis_position ; + float xpix = ( bidx*16+tidx )-cpu_offset_x; + float posx = axis*(1.0f-sin_angle/cos_angle ) +(xpix-(axis_corr) )/cos_angle ; + + float shiftJ = sin_angle/cos_angle; + float x1 = min(-sin_angle/cos_angle ,0.f); + float x2 = max(-sin_angle/cos_angle ,0.f); + + float Area; + Area=1.0f/cos_angle; + int stlA, stlB , stlAJ, stlBJ ; + stlA=strideLine[16+tidy]; + stlB=strideLine[tidy]; + stlAJ=strideJoseph[16+tidy]; + stlBJ=strideJoseph[tidy]; + + int beginA = beginPos[16+tidy ]; + int beginB = beginPos[tidy ]; + float add; + int l; + + if(josephnoclip) { + for(int j=0; j<dimslice; j++) { // j: Joseph + x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f; + x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f; + add = read_imagef(d_slice, sampler, (float2) (x1, x2)).x; // add = tex2D(texSlice, x1,x2); + res += add; + posx += shiftJ; + } + } + else { + for(int j=0; j<dimslice; j++) { // j: Joseph + x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f; + x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f; + l = (x1>=0.0f )*(x1<(dimslice+2))*( x2>=0.0f)*( x2<(dimslice+2) ) ; + add = read_imagef(d_slice, sampler, (float2) (x1, x2)).x; // add = tex2D(texSlice, x1,x2); + res += add*l; + posx += shiftJ; + } + } + + if((bidy*16 + tidy) < num_projections && (bidx*16 + tidx) < num_bins) { + res *= Area; + if (normalize) + res *= M_PI_F * 0.5f / num_projections; + d_Sino[dimrecx*(bidy*16 + tidy) + (bidx*16 + tidx)] = res; + } +} + + + +/*******************************************************************************/ +/********************* CPU VERSION (without textures) **************************/ +/*******************************************************************************/ + + +kernel void forward_kernel_cpu( + global float *d_Sino, + global float* d_slice, + int dimslice, + int num_bins, + global float* angles_per_project , + float axis_position, + global float *d_axis_corrections, + global int* d_beginPos , + global int* d_strideJoseph, + global int* d_strideLine , + int num_projections, + int dimrecx, + int dimrecy, + float cpu_offset_x, + float cpu_offset_y, + int josephnoclip, + int normalize) +{ + + const int tidx = get_local_id(0); + const int bidx = get_group_id(0); + const int tidy = get_local_id(1); + const int bidy = get_group_id(1); + float angle; + float cos_angle,sin_angle ; + + local float corrections[16]; + local int beginPos[16*2]; + local int strideJoseph[16*2]; + local int strideLine[16*2]; + + // thread will use corrections[tidy] + // All are read by first warp + int offset, OFFSET; + switch(tidy) { + case 0: + corrections[ tidx ]= d_axis_corrections[ bidy*16+tidx]; + break; + case 1: + case 2: + offset = 16*(tidy-1); + OFFSET = dimrecy*(tidy-1); + beginPos [offset + tidx ]= d_beginPos[ OFFSET+ bidy*16+tidx] ; + break; + case 3: + case 4: + offset = 16*(tidy-3); + OFFSET = dimrecy*(tidy-3); + strideJoseph[offset + tidx ]= d_strideJoseph[OFFSET + bidy*16+tidx] ; + break; + case 5: + case 6: + offset = 16*(tidy-5); + OFFSET = dimrecy*(tidy-5); + strideLine[offset + tidx ]= d_strideLine[OFFSET + bidy*16+tidx] ; + break; + } + barrier(CLK_LOCAL_MEM_FENCE); + + angle = angles_per_project[ bidy*16+tidy ] ; + cos_angle = cos(angle); + sin_angle = sin(angle); + + if(fabs(cos_angle) > 0.70710678f ) { + if( cos_angle>0) { + cos_angle = cos(angle); + sin_angle = sin(angle); + } + else { + cos_angle = -cos(angle); + sin_angle = -sin(angle); + } + } + else { + if( sin_angle>0) { + cos_angle = sin(angle); + sin_angle = -cos(angle); + } + else { + cos_angle = -sin(angle); + sin_angle = cos(angle); + } + } + float res=0.0f; + float axis_corr = axis_position + corrections[ tidy ]; + float axis = axis_position ; + float xpix = ( bidx*16+tidx )-cpu_offset_x; + float posx = axis*(1.0f-sin_angle/cos_angle ) +(xpix-(axis_corr) )/cos_angle ; + + float shiftJ = sin_angle/cos_angle; + float x1 = min(-sin_angle/cos_angle ,0.f); + float x2 = max(-sin_angle/cos_angle ,0.f); + + float Area; + Area=1.0f/cos_angle; + int stlA, stlB , stlAJ, stlBJ ; + stlA=strideLine[16+tidy]; + stlB=strideLine[tidy]; + stlAJ=strideJoseph[16+tidy]; + stlBJ=strideJoseph[tidy]; + + int beginA = beginPos[16+tidy ]; + int beginB = beginPos[tidy ]; + int l; + + int ym, yp, xm, xp; + float yc, xc; + float val; + if(josephnoclip) { + for(int j=0; j<dimslice; j++) { // j: Joseph + x1 = beginA +(posx)*stlA + (j)*stlAJ+1.0f; + x2 = beginB +(posx)*stlB + (j)*stlBJ+1.0f; + /* + Bilinear interpolation + */ + yc = fmin(fmax(x2, 0.0f), ((dimslice+2) - 1.0f)); // y_clipped + ym = (int) floor(yc); // y_minus + yp = (int) ceil(yc); // y_plus + + xc = fmin(fmax(x1, 0.0f), ((dimslice+2) - 1.0f)); // x_clipped + xm = (int) floor(xc); // x_minus + xp = (int) ceil(xc); // x_plus + + if ((ym == yp) && (xm == xp)) val = d_slice[ym*(dimslice+2) + xm]; + else if (ym == yp) val = (d_slice[ym*(dimslice+2) + xm] * (xp - xc)) + (d_slice[ym*(dimslice+2) + xp] * (xc - xm)); + else if (xm == xp) val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc)) + (d_slice[yp*(dimslice+2) + xm] * (yc - ym)); + else val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc) * (xp - xc)) + + (d_slice[yp*(dimslice+2) + xm] * (yc - ym) * (xp - xc)) + + (d_slice[ym*(dimslice+2) + xp] * (yp - yc) * (xc - xm)) + + (d_slice[yp*(dimslice+2) + xp] * (yc - ym) * (xc - xm)); + // ---------- + res += val; + posx += shiftJ; + } + } + else { + for(int j=0; j<dimslice; j++) { // j: Joseph + x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f; + x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f; + l = (x1>=0.0f )*(x1<(dimslice+2))*( x2>=0.0f)*( x2<(dimslice+2) ) ; + /* + Bilinear interpolation + */ + yc = fmin(fmax(x2, 0.0f), ((dimslice+2) - 1.0f)); // y_clipped + ym = (int) floor(yc); // y_minus + yp = (int) ceil(yc); // y_plus + + xc = fmin(fmax(x1, 0.0f), ((dimslice+2) - 1.0f)); // x_clipped + xm = (int) floor(xc); // x_minus + xp = (int) ceil(xc); // x_plus + + if ((ym == yp) && (xm == xp)) val = d_slice[ym*(dimslice+2) + xm]; + else if (ym == yp) val = (d_slice[ym*(dimslice+2) + xm] * (xp - xc)) + (d_slice[ym*(dimslice+2) + xp] * (xc - xm)); + else if (xm == xp) val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc)) + (d_slice[yp*(dimslice+2) + xm] * (yc - ym)); + else val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc) * (xp - xc)) + + (d_slice[yp*(dimslice+2) + xm] * (yc - ym) * (xp - xc)) + + (d_slice[ym*(dimslice+2) + xp] * (yp - yc) * (xc - xm)) + + (d_slice[yp*(dimslice+2) + xp] * (yc - ym) * (xc - xm)); + // ---------- + res += val*l; + posx += shiftJ; + } + } + + if((bidy*16 + tidy) < num_projections && (bidx*16 + tidx) < num_bins) { + res *= Area; + if (normalize) + res *= M_PI_F * 0.5f / num_projections; + d_Sino[dimrecx*(bidy*16 + tidy) + (bidx*16 + tidx)] = res; + } +} |