diff options
Diffstat (limited to 'silx/resources/opencl')
-rw-r--r-- | silx/resources/opencl/addition.cl | 42 | ||||
-rw-r--r-- | silx/resources/opencl/array_utils.cl | 73 | ||||
-rw-r--r-- | silx/resources/opencl/backproj.cl | 232 | ||||
-rw-r--r-- | silx/resources/opencl/backproj_helper.cl | 68 | ||||
-rw-r--r-- | silx/resources/opencl/bitonic.cl | 569 | ||||
-rw-r--r-- | silx/resources/opencl/codec/byte_offset.cl | 235 | ||||
-rw-r--r-- | silx/resources/opencl/convolution.cl | 312 | ||||
-rw-r--r-- | silx/resources/opencl/convolution_textures.cl | 374 | ||||
-rw-r--r-- | silx/resources/opencl/doubleword.cl | 115 | ||||
-rw-r--r-- | silx/resources/opencl/image/cast.cl | 181 | ||||
-rw-r--r-- | silx/resources/opencl/image/histogram.cl | 178 | ||||
-rw-r--r-- | silx/resources/opencl/image/map.cl | 85 | ||||
-rw-r--r-- | silx/resources/opencl/image/max_min.cl | 207 | ||||
-rw-r--r-- | silx/resources/opencl/kahan.cl | 143 | ||||
-rw-r--r-- | silx/resources/opencl/linalg.cl | 88 | ||||
-rw-r--r-- | silx/resources/opencl/medfilt.cl | 141 | ||||
-rw-r--r-- | silx/resources/opencl/preprocess.cl | 567 | ||||
-rw-r--r-- | silx/resources/opencl/proj.cl | 345 | ||||
-rw-r--r-- | silx/resources/opencl/sparse.cl | 94 | ||||
-rw-r--r-- | silx/resources/opencl/statistics.cl | 283 |
20 files changed, 0 insertions, 4332 deletions
diff --git a/silx/resources/opencl/addition.cl b/silx/resources/opencl/addition.cl deleted file mode 100644 index 35d7996..0000000 --- a/silx/resources/opencl/addition.cl +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Project: SIFT: An algorithm for image alignement - * - * Copyright (C) 2013-2017 European Synchrotron Radiation Facility - * Grenoble, France - * - * Principal authors: J. Kieffer (kieffer@esrf.fr) - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - - -// "Hello_world" kernel to test if OpenCL is actually working -kernel void addition(global float* a, - global float* b, - global float* res, - int N) -{ - int i = get_global_id(0); - if( i<N ){ - res[i] = a[i] + b[i]; - } -} diff --git a/silx/resources/opencl/array_utils.cl b/silx/resources/opencl/array_utils.cl deleted file mode 100644 index 6f78921..0000000 --- a/silx/resources/opencl/array_utils.cl +++ /dev/null @@ -1,73 +0,0 @@ -/** - * 2D Memcpy for float* arrays, - * replacing pyopencl "enqueue_copy" which does not work for rectangular copies. - * ALL THE SIZES/OFFSETS ARE SPECIFIED IN PIXELS, NOT IN BYTES. - * In the (x, y) convention, x is the fast index (as in CUDA). - * - * :param dst: destination array - * :param src: source array - * :param dst_width: width of the dst array - * :param src_width: width of the src array - * :param dst_offset: tuple with the offset (x, y) in the dst array - * :param src_offset: tuple with the offset (x, y) in the src array - * :param transfer_shape: shape of the transfer array in the form (x, y) - * - */ -kernel void cpy2d( - global float* dst, - global float* src, - int dst_width, - int src_width, - int2 dst_offset, - int2 src_offset, - int2 transfer_shape) -{ - int gidx = get_global_id(0), gidy = get_global_id(1); - if (gidx < transfer_shape.x && gidy < transfer_shape.y) { - dst[(dst_offset.y + gidy)*dst_width + (dst_offset.x + gidx)] = src[(src_offset.y + gidy)*src_width + (src_offset.x + gidx)]; - } -} - - -// Looks like cfloat_t and cfloat_mul are not working, yet specified in -// pyopencl documentation. Here we are using float2 as in all available examples -// #include <pyopencl-complex.h> -// typedef cfloat_t complex; - -static inline float2 complex_mul(float2 a, float2 b) { - float2 res = (float2) (0, 0); - res.x = a.x * b.x - a.y * b.y; - res.y = a.y * b.x + a.x * b.y; - return res; -} - -// arr2D *= arr1D (line by line, i.e along fast dim) -kernel void inplace_complex_mul_2Dby1D( - global float2* arr2D, - global float2* arr1D, - int width, - int height) -{ - int x = get_global_id(0); - int y = get_global_id(1); - if ((x >= width) || (y >= height)) return; - int i = y*width + x; - arr2D[i] = complex_mul(arr2D[i], arr1D[x]); -} - - -// arr3D *= arr1D (along fast dim) -kernel void inplace_complex_mul_3Dby1D( - global float2* arr3D, - global float2* arr1D, - int width, - int height, - int depth) -{ - int x = get_global_id(0); - int y = get_global_id(1); - int z = get_global_id(2); - if ((x >= width) || (y >= height) || (z >= depth)) return; - int i = (z*height + y)*width + x; - arr3D[i] = complex_mul(arr3D[i], arr1D[x]); -} diff --git a/silx/resources/opencl/backproj.cl b/silx/resources/opencl/backproj.cl deleted file mode 100644 index da15131..0000000 --- a/silx/resources/opencl/backproj.cl +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Project: silx: filtered backprojection - * - * Copyright (C) 2016-2017 European Synchrotron Radiation Facility - * Grenoble, France - * - * Principal authors: A. Mirone - * P. Paleo - * - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - - -/*******************************************************************************/ -/************************ GPU VERSION (with textures) **************************/ -/*******************************************************************************/ - -#ifndef DONT_USE_TEXTURES -kernel void backproj_kernel( - int num_proj, - int num_bins, - float axis_position, - global float *d_SLICE, - read_only image2d_t d_sino, - float gpu_offset_x, - float gpu_offset_y, - global float * d_cos_s, // precalculated cos(theta[i]) - global float* d_sin_s, // precalculated sin(theta[i]) - global float* d_axis_s, // array of axis positions (n_projs) - local float* shared2) // 768B of local mem -{ - const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; - const int tidx = get_local_id(0); //threadIdx.x; - const int bidx = get_group_id(0); //blockIdx.x; - const int tidy = get_local_id(1); //threadIdx.y; - const int bidy = get_group_id(1); //blockIdx.y; - - local float sh_cos[256]; - local float sh_sin[256]; - local float sh_axis[256]; - - float pcos, psin; - float h0, h1, h2, h3; - const float apos_off_x= gpu_offset_x - axis_position ; - const float apos_off_y= gpu_offset_y - axis_position ; - float acorr05; - float res0 = 0, res1 = 0, res2 = 0, res3 = 0; - - const float bx00 = (32 * bidx + 2 * tidx + 0 + apos_off_x ) ; - const float by00 = (32 * bidy + 2 * tidy + 0 + apos_off_y ) ; - - int read=0; - for(int proj=0; proj<num_proj; proj++) { - if(proj>=read) { - barrier(CLK_LOCAL_MEM_FENCE); - int ip = tidy*16+tidx; - if( read+ip < num_proj) { - sh_cos [ip] = d_cos_s[read+ip] ; - sh_sin [ip] = d_sin_s[read+ip] ; - sh_axis[ip] = d_axis_s[read+ip] ; - } - read=read+256; // 256=16*16 block size - barrier(CLK_LOCAL_MEM_FENCE); - } - pcos = sh_cos[256-read + proj] ; - psin = sh_sin[256-read + proj] ; - - acorr05 = sh_axis[256 - read + proj] ; - - h0 = (acorr05 + bx00*pcos - by00*psin); - h1 = (acorr05 + (bx00+0)*pcos - (by00+1)*psin); - h2 = (acorr05 + (bx00+1)*pcos - (by00+0)*psin); - h3 = (acorr05 + (bx00+1)*pcos - (by00+1)*psin); - - if(h0>=0 && h0<num_bins) res0 += read_imagef(d_sino, sampler, (float2) (h0 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h0 +0.5f,proj +0.5f); - if(h1>=0 && h1<num_bins) res1 += read_imagef(d_sino, sampler, (float2) (h1 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h1 +0.5f,proj +0.5f); - if(h2>=0 && h2<num_bins) res2 += read_imagef(d_sino, sampler, (float2) (h2 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h2 +0.5f,proj +0.5f); - if(h3>=0 && h3<num_bins) res3 += read_imagef(d_sino, sampler, (float2) (h3 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h3 +0.5f,proj +0.5f); - } - d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 0] = res0; - d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 0] = res1; - d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 1] = res2; - d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 1] = res3; -} -#endif - - - - -/*******************************************************************************/ -/********************* CPU VERSION (without textures) **************************/ -/*******************************************************************************/ - - -#define CLIP_MAX(x, N) (fmin(fmax(x, 0.0f), (N - 1.0f))) - -#define FLOORCEIL_x(x) {\ - xm = (int) floor(x);\ - xp = (int) ceil(x);\ -} - -#define ADJACENT_PIXELS_VALS(arr, Nx, y, xm, xp) ((float2) (arr[y*Nx+xm], arr[y*Nx+xp])) - -//Simple linear interpolator for working on the GPU -static float linear_interpolation(float2 vals, - float x, - int xm, - int xp) -{ - if (xm == xp) - return vals.s0; - else - return (vals.s0 * (xp - x)) + (vals.s1 * (x - xm)); -} - -/** - * - * Same kernel as backproj_kernel, but targets the CPU (no texture) - * -**/ -kernel void backproj_cpu_kernel( - int num_proj, - int num_bins, - float axis_position, - global float *d_SLICE, - global float* d_sino, - float gpu_offset_x, - float gpu_offset_y, - global float * d_cos_s, // precalculated cos(theta[i]) - global float * d_sin_s, // precalculated sin(theta[i]) - global float * d_axis_s, // array of axis positions (n_projs) - local float* shared2) // 768B of local mem -{ - const int tidx = get_local_id(0); //threadIdx.x; - const int bidx = get_group_id(0); //blockIdx.x; - const int tidy = get_local_id(1); //threadIdx.y; - const int bidy = get_group_id(1); //blockIdx.y; - - local float sh_cos[256]; - local float sh_sin[256]; - local float sh_axis[256]; - - float pcos, psin; - float h0, h1, h2, h3; - const float apos_off_x= gpu_offset_x - axis_position ; - const float apos_off_y= gpu_offset_y - axis_position ; - float acorr05; - float res0 = 0, res1 = 0, res2 = 0, res3 = 0; - - const float bx00 = (32 * bidx + 2 * tidx + 0 + apos_off_x ) ; - const float by00 = (32 * bidy + 2 * tidy + 0 + apos_off_y ) ; - - int read=0; - for(int proj=0; proj<num_proj; proj++) { - if(proj>=read) { - barrier(CLK_LOCAL_MEM_FENCE); - int ip = tidy*16+tidx; - if( read+ip < num_proj) { - sh_cos [ip] = d_cos_s[read+ip] ; - sh_sin [ip] = d_sin_s[read+ip] ; - sh_axis[ip] = d_axis_s[read+ip] ; - } - read=read+256; // 256=16*16 block size - barrier(CLK_LOCAL_MEM_FENCE); - } - pcos = sh_cos[256-read + proj] ; - psin = sh_sin[256-read + proj] ; - - acorr05 = sh_axis[256 - read + proj] ; - - h0 = (acorr05 + bx00*pcos - by00*psin); - h1 = (acorr05 + (bx00+0)*pcos - (by00+1)*psin); - h2 = (acorr05 + (bx00+1)*pcos - (by00+0)*psin); - h3 = (acorr05 + (bx00+1)*pcos - (by00+1)*psin); - - - float x; - int ym, xm, xp; - ym = proj; - float2 vals; - - if(h0>=0 && h0<num_bins) { - x = CLIP_MAX(h0, num_bins); - FLOORCEIL_x(x); - vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp); - res0 += linear_interpolation(vals, x, xm, xp); - } - if(h1>=0 && h1<num_bins) { - x = CLIP_MAX(h1, num_bins); - FLOORCEIL_x(x); - vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp); - res1 += linear_interpolation(vals, x, xm, xp); - } - if(h2>=0 && h2<num_bins) { - x = CLIP_MAX(h2, num_bins); - FLOORCEIL_x(x); - vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp); - res2 += linear_interpolation(vals, x, xm, xp); - } - if(h3>=0 && h3<num_bins) { - x = CLIP_MAX(h3, num_bins); - FLOORCEIL_x(x); - vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp); - res3 += linear_interpolation(vals, x, xm, xp); - } - } - d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 0] = res0; - d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 0] = res1; - d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 1] = res2; - d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 1] = res3; -} - diff --git a/silx/resources/opencl/backproj_helper.cl b/silx/resources/opencl/backproj_helper.cl deleted file mode 100644 index b1590f8..0000000 --- a/silx/resources/opencl/backproj_helper.cl +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Project: silx: backprojection helper functions - * - * Copyright (C) 2016-2017 European Synchrotron Radiation Facility - * Grenoble, France - * - * Principal authors: P. Paleo - * J. Kieffer (kieffer@esrf.fr) - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - - -kernel void mult( global float2* d_sino, - global float2* d_filter, - int num_bins, - int num_projs) -{ - int gid0 = get_global_id(0); - int gid1 = get_global_id(1); - if (gid0 < num_bins && gid1 < num_projs) - { - // d_sino[gid1*num_bins+gid0] *= d_filter[gid0]; - d_sino[gid1*num_bins+gid0].x *= d_filter[gid0].x; - d_sino[gid1*num_bins+gid0].y *= d_filter[gid0].x; - } -} - -// copy only the real part of the valid data to the real array -kernel void cpy2d_c2r( - global float* d_sino, - global float2* d_sino_complex, - int num_bins, - int num_projs, - int fft_size) -{ - int gid0 = get_global_id(0); - int gid1 = get_global_id(1); - if (gid0 < num_bins && gid1 < num_projs) { - d_sino[gid1*num_bins+gid0] = d_sino_complex[gid1*fft_size+gid0].x; - } -} - - - - - - - diff --git a/silx/resources/opencl/bitonic.cl b/silx/resources/opencl/bitonic.cl deleted file mode 100644 index 4096ce8..0000000 --- a/silx/resources/opencl/bitonic.cl +++ /dev/null @@ -1,569 +0,0 @@ -/*############################################################################ -# Sort elements within a vector by Matthew Scarpino, -# Taken from his book "OpenCL in Action", -# November 2011 ISBN 9781617290176 -# Original license for the code: "public domain" -# -# Originally this code is public domain. The MIT license has been added -# by J. Kieffer (jerome.kieffer@esrf.eu) to provide a disclaimer. -# J. Kieffer does not claim authorship of this code developed by . -# -# Copyright (c) 2011 Matthew Scarpino -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. -# -#############################################################################*/ - -#define VECTOR_SORT_BOOK(input, dir) \ - comp = abs(input > shuffle(input, mask2)) ^ dir; \ - input = shuffle(input, comp * 2 + add2); \ - comp = abs(input > shuffle(input, mask1)) ^ dir; \ - input = shuffle(input, comp + add1); \ - - -#define VECTOR_SWAP_BOOK(in1, in2, dir) \ - input1 = in1; input2 = in2; \ - comp = (abs(input1 > input2) ^ dir) * 4 + add3; \ - in1 = shuffle2(input1, input2, comp); \ - in2 = shuffle2(input2, input1, comp); \ - - -// The _FILE extension correspond to the formula found in the "OpenCL in Action" supplementary files -#define VECTOR_SORT_FILE(input, dir)\ - comp = (input < shuffle(input, mask2)) ^ dir;\ - input = shuffle(input, as_uint4(comp * 2 + add2));\ - comp = (input < shuffle(input, mask1)) ^ dir;\ - input = shuffle(input, as_uint4(comp + add1));\ - - -#define VECTOR_SWAP_FILE(input1, input2, dir)\ - temp = input1;\ - comp = ((input1 < input2) ^ dir) * 4 + add3;\ - input1 = shuffle2(input1, input2, as_uint4(comp));\ - input2 = shuffle2(input2, temp, as_uint4(comp));\ - - - -// Functions to be called from an actual kernel. - -static float8 my_sort_file(uint local_id, uint group_id, uint local_size, - float8 input, local float4 *l_data) -{ - float4 input1, input2, temp; - float8 output; - - int dir; - uint id, size, stride; - int4 comp; - - uint4 mask1 = (uint4)(1, 0, 3, 2); - uint4 mask2 = (uint4)(2, 3, 0, 1); - uint4 mask3 = (uint4)(3, 2, 1, 0); - - int4 add1 = (int4)(1, 1, 3, 3); - int4 add2 = (int4)(2, 3, 2, 3); - int4 add3 = (int4)(1, 2, 2, 3); - - // retrieve input data - input1 = (float4)(input.s0, input.s1, input.s2, input.s3); - input2 = (float4)(input.s4, input.s5, input.s6, input.s7); - - // Find global address - id = local_id * 2; - - /* Sort input 1 - ascending */ - comp = input1 < shuffle(input1, mask1); - input1 = shuffle(input1, as_uint4(comp + add1)); - comp = input1 < shuffle(input1, mask2); - input1 = shuffle(input1, as_uint4(comp * 2 + add2)); - comp = input1 < shuffle(input1, mask3); - input1 = shuffle(input1, as_uint4(comp + add3)); - - /* Sort input 2 - descending */ - comp = input2 > shuffle(input2, mask1); - input2 = shuffle(input2, as_uint4(comp + add1)); - comp = input2 > shuffle(input2, mask2); - input2 = shuffle(input2, as_uint4(comp * 2 + add2)); - comp = input2 > shuffle(input2, mask3); - input2 = shuffle(input2, as_uint4(comp + add3)); - - /* Swap corresponding elements of input 1 and 2 */ - add3 = (int4)(4, 5, 6, 7); - dir = - (int) (local_id % 2); - temp = input1; - comp = ((input1 < input2) ^ dir) * 4 + add3; - input1 = shuffle2(input1, input2, as_uint4(comp)); - input2 = shuffle2(input2, temp, as_uint4(comp)); - - /* Sort data and store in local memory */ - VECTOR_SORT_FILE(input1, dir); - VECTOR_SORT_FILE(input2, dir); - l_data[id] = input1; - l_data[id+1] = input2; - barrier(CLK_LOCAL_MEM_FENCE); - - /* Create bitonic set */ - for(size = 2; size < local_size; size <<= 1) { - dir = - (int) (local_id/size & 1) ; - - for(stride = size; stride > 1; stride >>= 1) { - barrier(CLK_LOCAL_MEM_FENCE); - id = local_id + (local_id/stride)*stride; - VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) - } - - barrier(CLK_LOCAL_MEM_FENCE); - id = local_id * 2; - input1 = l_data[id]; - input2 = l_data[id+1]; - temp = input1; - comp = ((input1 < input2) ^ dir) * 4 + add3; - input1 = shuffle2(input1, input2, as_uint4(comp)); - input2 = shuffle2(input2, temp, as_uint4(comp)); - VECTOR_SORT_FILE(input1, dir); - VECTOR_SORT_FILE(input2, dir); - l_data[id] = input1; - l_data[id+1] = input2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - /* Perform bitonic merge */ - dir = - (int) (group_id % 2); - for(stride = local_size; stride > 1; stride >>= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - id = local_id + (local_id/stride)*stride; - VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) - } - barrier(CLK_LOCAL_MEM_FENCE); - - /* Perform final sort */ - id = local_id * 2; - input1 = l_data[id]; - input2 = l_data[id+1]; - temp = input1; - comp = ((input1 < input2) ^ dir) * 4 + add3; - input1 = shuffle2(input1, input2, as_uint4(comp)); - input2 = shuffle2(input2, temp, as_uint4(comp)); - VECTOR_SORT_FILE(input1, dir); - VECTOR_SORT_FILE(input2, dir); - - // setup output and return it - output = (float8)(input1, input2); - return output; -} - -static float8 my_sort_book(uint local_id, uint group_id, uint local_size, - float8 input, local float4 *l_data) -{ - float4 input1, input2, temp; - float8 output; - uint4 comp, swap, mask1, mask2, add1, add2, add3; - uint id, dir, size, stride; - mask1 = (uint4)(1, 0, 3, 2); - swap = (uint4)(0, 0, 1, 1); - add1 = (uint4)(0, 0, 2, 2); - mask2 = (uint4)(2, 3, 0, 1); - add2 = (uint4)(0, 1, 0, 1); - add3 = (uint4)(0, 1, 2, 3); - - // retrieve input data - input1 = (float4)(input.s0, input.s1, input.s2, input.s3); - input2 = (float4)(input.s4, input.s5, input.s6, input.s7); - - // Find global address - id = local_id * 2; - - //Sort first vector - - comp = abs(input1 > shuffle(input1, mask1)); - input1 = shuffle(input1, comp ^ swap + add1); - comp = abs(input1 > shuffle(input1, mask2)); - input1 = shuffle(input1, comp * 2 + add2); - comp = abs(input1 > shuffle(input1, mask1)); - input1 = shuffle(input1, comp + add1); - - //Sort second vector - comp = abs(input2 < shuffle(input2, mask1)); - input2 = shuffle(input2, comp ^ swap + add1); - comp = abs(input2 < shuffle(input2, mask2)); - input2 = shuffle(input2, comp * 2 + add2); - comp = abs(input2 < shuffle(input2, mask1)); - input2 = shuffle(input2, comp + add1); - - // Swap elements - dir = local_id % 2; - temp = input1; - comp = (abs(input1 > input2) ^ dir) * 4 + add3; - input1 = shuffle2(input1, input2, comp); - input2 = shuffle2(input2, temp, comp); - VECTOR_SORT_BOOK(input1, dir); - VECTOR_SORT_BOOK(input2, dir); - l_data[id] = input1; - l_data[id+1] = input2; - - // Perform upper stages - for(size = 2; size < local_size; size <<= 1) - { - dir = local_id/size & 1; - - //Perform lower stages - for(stride = size; stride > 1; stride >>= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - id = local_id + (local_id/stride)*stride; - VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) - } - barrier(CLK_LOCAL_MEM_FENCE); - - id = local_id * 2; - input1 = l_data[id]; - input2 = l_data[id+1]; - temp = input1; - comp = (abs(input1 > input2) ^ dir) * 4 + add3; - input1 = shuffle2(input1, input2, comp); - input2 = shuffle2(input2, temp, comp); - VECTOR_SORT_BOOK(input1, dir); - VECTOR_SORT_BOOK(input2, dir); - l_data[id] = input1; - l_data[id+1] = input2; - } - dir = group_id % 2; - - // Perform bitonic merge - for(stride = local_size; stride > 1; stride >>= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - id = local_id + (local_id/stride)*stride; - VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) - } - barrier(CLK_LOCAL_MEM_FENCE); - - id = local_id * 2; - input1 = l_data[id]; input2 = l_data[id+1]; - temp = input1; - comp = (abs(input1 > input2) ^ dir) * 4 + add3; - input1 = shuffle2(input1, input2, comp); - input2 = shuffle2(input2, temp, comp); - VECTOR_SORT_BOOK(input1, dir); - VECTOR_SORT_BOOK(input2, dir); - - // setup output and return it - output = (float8)(input1, input2); - return output; -} - - - -////////////// -// Kernels -////////////// - -// Perform the sort on the whole array -// dim0: wg=number_of_element/8 - -kernel void bsort_all(global float4 *g_data, - local float4 *l_data) -{ - float4 input1, input2; - float8 input, output; - uint id, global_start; - // Find global address - id = get_local_id(0) * 2; - global_start = get_group_id(0) * get_local_size(0) * 2 + id; - - input1 = g_data[global_start]; - input2 = g_data[global_start+1]; - input = (float8)(input1, input2); - output = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0), - input, l_data); - input1 = (float4) (output.s0, output.s1, output.s2, output.s3); - input2 = (float4) (output.s4, output.s5, output.s6, output.s7); - g_data[global_start] = input1; - g_data[global_start+1] = input2; -} - - -// Perform the sort along the horizontal axis of a 2D image -// dim0 = y: wg=1 -// dim1 = x: wg=number_of_element/8 -kernel void bsort_horizontal(global float *g_data, - local float4 *l_data) -{ - float8 input, output; - uint id, global_start, offset; - - // Find global address - offset = get_global_size(1)*get_global_id(0)*8; - id = get_local_id(1) * 8; - global_start = offset + get_group_id(1) * get_local_size(1) * 8 + id; - - input = (float8)(g_data[global_start ], - g_data[global_start + 1], - g_data[global_start + 2], - g_data[global_start + 3], - g_data[global_start + 4], - g_data[global_start + 5], - g_data[global_start + 6], - g_data[global_start + 7]); - - output = my_sort_file(get_local_id(1), get_group_id(1), get_local_size(1), - input, l_data); - - g_data[global_start ] = output.s0; - g_data[global_start + 1] = output.s1; - g_data[global_start + 2] = output.s2; - g_data[global_start + 3] = output.s3; - g_data[global_start + 4] = output.s4; - g_data[global_start + 5] = output.s5; - g_data[global_start + 6] = output.s6; - g_data[global_start + 7] = output.s7; -} - - -// Perform the sort along the vertical axis -// dim0 = y: wg=number_of_element/8 -// dim1 = x: wg=1 -// check if transposing +bsort_horizontal is not more efficient ? - -kernel void bsort_vertical(global float *g_data, - local float4 *l_data) -{ - // we need to read 8 float position along the vertical axis - float8 input, output; - uint id, global_start, padding; - - // Find global address - padding = get_global_size(1); - id = get_local_id(0) * 8 * padding + get_global_id(1); - global_start = get_group_id(0) * get_local_size(0) * 8 * padding + id; - - input = (float8)(g_data[global_start ], - g_data[global_start + padding ], - g_data[global_start + 2*padding], - g_data[global_start + 3*padding], - g_data[global_start + 4*padding], - g_data[global_start + 5*padding], - g_data[global_start + 6*padding], - g_data[global_start + 7*padding]); - - output = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0), - input, l_data); - g_data[global_start ] = output.s0; - g_data[global_start + padding ] = output.s1; - g_data[global_start + 2*padding ] = output.s2; - g_data[global_start + 3*padding ] = output.s3; - g_data[global_start + 4*padding ] = output.s4; - g_data[global_start + 5*padding ] = output.s5; - g_data[global_start + 6*padding ] = output.s6; - g_data[global_start + 7*padding ] = output.s7; -} - - -//Tested working reference kernel frm the book. This only works under Linux -kernel void bsort_book(global float4 *g_data, - local float4 *l_data) { - float4 input1, input2, temp; - uint4 comp, swap, mask1, mask2, add1, add2, add3; - uint id, dir, global_start, size, stride; - mask1 = (uint4)(1, 0, 3, 2); - swap = (uint4)(0, 0, 1, 1); - add1 = (uint4)(0, 0, 2, 2); - mask2 = (uint4)(2, 3, 0, 1); - add2 = (uint4)(0, 1, 0, 1); - add3 = (uint4)(0, 1, 2, 3); - - // Find global address - id = get_local_id(0) * 2; - global_start = get_group_id(0) * get_local_size(0) * 2 + id; - - //Sort first vector - input1 = g_data[global_start]; - input2 = g_data[global_start+1]; - comp = abs(input1 > shuffle(input1, mask1)); - input1 = shuffle(input1, comp ^ swap + add1); - comp = abs(input1 > shuffle(input1, mask2)); - input1 = shuffle(input1, comp * 2 + add2); - comp = abs(input1 > shuffle(input1, mask1)); - input1 = shuffle(input1, comp + add1); - - //Sort second vector - comp = abs(input2 < shuffle(input2, mask1)); - input2 = shuffle(input2, comp ^ swap + add1); - comp = abs(input2 < shuffle(input2, mask2)); - input2 = shuffle(input2, comp * 2 + add2); - comp = abs(input2 < shuffle(input2, mask1)); - input2 = shuffle(input2, comp + add1); - - // Swap elements - dir = get_local_id(0) % 2; - temp = input1; - comp = (abs(input1 > input2) ^ dir) * 4 + add3; - input1 = shuffle2(input1, input2, comp); - input2 = shuffle2(input2, temp, comp); - VECTOR_SORT_BOOK(input1, dir); - VECTOR_SORT_BOOK(input2, dir); - l_data[id] = input1; - l_data[id+1] = input2; - - // Perform upper stages - for(size = 2; size < get_local_size(0); size <<= 1) { - dir = get_local_id(0)/size & 1; - - //Perform lower stages - for(stride = size; stride > 1; stride >>= 1) { - barrier(CLK_LOCAL_MEM_FENCE); - id = get_local_id(0) + - (get_local_id(0)/stride)*stride; - VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) - } - barrier(CLK_LOCAL_MEM_FENCE); - - id = get_local_id(0) * 2; - input1 = l_data[id]; - input2 = l_data[id+1]; - temp = input1; - comp = (abs(input1 > input2) ^ dir) * 4 + add3; - input1 = shuffle2(input1, input2, comp); - input2 = shuffle2(input2, temp, comp); - VECTOR_SORT_BOOK(input1, dir); - VECTOR_SORT_BOOK(input2, dir); - l_data[id] = input1; - l_data[id+1] = input2; - } - dir = get_group_id(0) % 2; - // Perform bitonic merge - for(stride = get_local_size(0); stride > 1; stride >>= 1) { - barrier(CLK_LOCAL_MEM_FENCE); - id = get_local_id(0) + - (get_local_id(0)/stride)*stride; - VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) - } - barrier(CLK_LOCAL_MEM_FENCE); - - id = get_local_id(0) * 2; - input1 = l_data[id]; input2 = l_data[id+1]; - temp = input1; - comp = (abs(input1 > input2) ^ dir) * 4 + add3; - input1 = shuffle2(input1, input2, comp); - input2 = shuffle2(input2, temp, comp); - VECTOR_SORT_BOOK(input1, dir); - VECTOR_SORT_BOOK(input2, dir); - g_data[global_start] = input1; - g_data[global_start+1] = input2; - } - -//Tested working reference kernel from the addition files. This only works under any operating system -/* Perform initial sort */ -kernel void bsort_file(global float4 *g_data, local float4 *l_data) { - - int dir; - uint id, global_start, size, stride; - float4 input1, input2, temp; - int4 comp; - - uint4 mask1 = (uint4)(1, 0, 3, 2); - uint4 mask2 = (uint4)(2, 3, 0, 1); - uint4 mask3 = (uint4)(3, 2, 1, 0); - - int4 add1 = (int4)(1, 1, 3, 3); - int4 add2 = (int4)(2, 3, 2, 3); - int4 add3 = (int4)(1, 2, 2, 3); - - id = get_local_id(0) * 2; - global_start = get_group_id(0) * get_local_size(0) * 2 + id; - - input1 = g_data[global_start]; - input2 = g_data[global_start+1]; - - /* Sort input 1 - ascending */ - comp = input1 < shuffle(input1, mask1); - input1 = shuffle(input1, as_uint4(comp + add1)); - comp = input1 < shuffle(input1, mask2); - input1 = shuffle(input1, as_uint4(comp * 2 + add2)); - comp = input1 < shuffle(input1, mask3); - input1 = shuffle(input1, as_uint4(comp + add3)); - - /* Sort input 2 - descending */ - comp = input2 > shuffle(input2, mask1); - input2 = shuffle(input2, as_uint4(comp + add1)); - comp = input2 > shuffle(input2, mask2); - input2 = shuffle(input2, as_uint4(comp * 2 + add2)); - comp = input2 > shuffle(input2, mask3); - input2 = shuffle(input2, as_uint4(comp + add3)); - - /* Swap corresponding elements of input 1 and 2 */ - add3 = (int4)(4, 5, 6, 7); - dir = - (int)(get_local_id(0) % 2); - temp = input1; - comp = ((input1 < input2) ^ dir) * 4 + add3; - input1 = shuffle2(input1, input2, as_uint4(comp)); - input2 = shuffle2(input2, temp, as_uint4(comp)); - - /* Sort data and store in local memory */ - VECTOR_SORT_FILE(input1, dir); - VECTOR_SORT_FILE(input2, dir); - l_data[id] = input1; - l_data[id+1] = input2; - - /* Create bitonic set */ - for(size = 2; size < get_local_size(0); size <<= 1) { - dir = - (int)(get_local_id(0)/size & 1); - - for(stride = size; stride > 1; stride >>= 1) { - barrier(CLK_LOCAL_MEM_FENCE); - id = get_local_id(0) + (get_local_id(0)/stride)*stride; - VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) - } - - barrier(CLK_LOCAL_MEM_FENCE); - id = get_local_id(0) * 2; - input1 = l_data[id]; input2 = l_data[id+1]; - temp = input1; - comp = ((input1 < input2) ^ dir) * 4 + add3; - input1 = shuffle2(input1, input2, as_uint4(comp)); - input2 = shuffle2(input2, temp, as_uint4(comp)); - VECTOR_SORT_FILE(input1, dir); - VECTOR_SORT_FILE(input2, dir); - l_data[id] = input1; - l_data[id+1] = input2; - } - - /* Perform bitonic merge */ - dir = - (int)(get_group_id(0) % 2); - for(stride = get_local_size(0); stride > 1; stride >>= 1) { - barrier(CLK_LOCAL_MEM_FENCE); - id = get_local_id(0) + (get_local_id(0)/stride)*stride; - VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) - } - barrier(CLK_LOCAL_MEM_FENCE); - - /* Perform final sort */ - id = get_local_id(0) * 2; - input1 = l_data[id]; input2 = l_data[id+1]; - temp = input1; - comp = ((input1 < input2) ^ dir) * 4 + add3; - input1 = shuffle2(input1, input2, as_uint4(comp)); - input2 = shuffle2(input2, temp, as_uint4(comp)); - VECTOR_SORT_FILE(input1, dir); - VECTOR_SORT_FILE(input2, dir); - g_data[global_start] = input1; - g_data[global_start+1] = input2; -} - diff --git a/silx/resources/opencl/codec/byte_offset.cl b/silx/resources/opencl/codec/byte_offset.cl deleted file mode 100644 index 56a24c4..0000000 --- a/silx/resources/opencl/codec/byte_offset.cl +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Project: SILX: A data analysis tool-kit - * - * Copyright (C) 2017 European Synchrotron Radiation Facility - * Grenoble, France - * - * Principal authors: J. Kieffer (kieffer@esrf.fr) - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/* To decompress CBF byte-offset compressed in parallel on GPU one needs to: - * - Set all values in mask and exception counter to zero. - * - Mark regions with exceptions and set values without exception. - * This generates the values (zeros for exceptions), the exception mask, - * counts the number of exception region and provides a start position for - * each exception. - * - Treat exceptions. For this, one thread in a workgoup treats a complete - * masked region in a serial fashion. All regions are treated in parallel. - * Values written at this stage are marked in the mask with -1. - * - Double scan: inclusive cum sum for values, exclusive cum sum to generate - * indices in output array. Values with mask = 1 are considered as 0. - * - Compact and copy output by removing duplicated values in exceptions. - */ - -kernel void mark_exceptions(global char* raw, - int size, - int full_size, - global int* mask, - global int* values, - global int* cnt, - global int* exc) -{ - int gid; - gid = get_global_id(0); - if (gid<size) - { - int value, position; - value = raw[gid]; - if (value == -128) - { - int maxi; - values[gid] = 0; - position = atomic_inc(cnt); - exc[position] = gid; - maxi = size - 1; - mask[gid] = 1; - mask[min(maxi, gid+1)] = 1; - mask[min(maxi, gid+2)] = 1; - - if (((int) raw[min(gid+1, maxi)] == 0) && - ((int) raw[min(gid+2, maxi)] == -128)) - { - mask[min(maxi, gid+3)] = 1; - mask[min(maxi, gid+4)] = 1; - mask[min(maxi, gid+5)] = 1; - mask[min(maxi, gid+6)] = 1; - } - } - else - { // treat simple data - - values[gid] = value; - } - } - else if (gid<full_size) - { - mask[gid]=1; - values[gid] = 0; - } -} - -//run with WG=1, as may as exceptions -kernel void treat_exceptions(global char* raw, //raw compressed stream - int size, //size of the raw compressed stream - global int* mask, //tells if the value is masked - global int* exc, //array storing the position of the start of exception zones - global int* values)// stores decompressed values. -{ - int gid = get_global_id(0); - int inp_pos = exc[gid]; - if ((inp_pos<=0) || ((int)mask[inp_pos - 1] == 0)) - { - int value, is_masked, next_value, inc; - is_masked = (mask[inp_pos] != 0); - while ((is_masked) && (inp_pos<size)) - { - value = (int) raw[inp_pos]; - if (value == -128) - { // this correspond to 16 bits exception - uchar low_byte = raw[inp_pos+1]; - char high_byte = raw[inp_pos+2] ; - next_value = high_byte<<8 | low_byte; - if (next_value == -32768) - { // this correspond to 32 bits exception - uchar low_byte1 = raw[inp_pos+3], - low_byte2 = raw[inp_pos+4], - low_byte3 = raw[inp_pos+5]; - char high_byte4 = raw[inp_pos+6] ; - value = high_byte4<<24 | low_byte3<<16 | low_byte2<<8 | low_byte1; - inc = 7; - } - else - { - value = next_value; - inc = 3; - } - } - else - { - inc = 1; - } - values[inp_pos] = value; - mask[inp_pos] = -1; // mark the processed data as valid in the mask - inp_pos += inc; - is_masked = (mask[inp_pos] != 0); - } - } -} - -// copy the values of the elements to definitive position -kernel void copy_result_int(global int* values, - global int* indexes, - int in_size, - int out_size, - global int* output - ) -{ - int gid = get_global_id(0); - if (gid < in_size) - { - int current = max(indexes[gid], 0), - next = (gid >= (in_size - 1)) ? in_size + 1 : indexes[gid + 1]; - //we keep always the last element - if ((current <= out_size) && (current < next)) - { - output[current] = values[gid]; - } - } -} - -// copy the values of the elements to definitive position -kernel void copy_result_float(global int* values, - global int* indexes, - int in_size, - int out_size, - global float* output - ) -{ - int gid = get_global_id(0); - if (gid<in_size) - { - int current = max(indexes[gid], 0), - next = (gid >= (in_size - 1)) ? in_size + 1 : indexes[gid + 1]; - if ((current < out_size) && (current < next)) - { - output[current] = (float) values[gid]; - } - } -} - - -// combined memset for all arrays used for Byte Offset decompression -kernel void byte_offset_memset(global char* raw, - global int* mask, - global int* index, - global int* result, - int full_size, - int actual_size - ) -{ - int gid = get_global_id(0); - if (gid < full_size) - { - raw[gid] = 0; - index[gid] = 0; - result[gid] = 0; - if (gid<actual_size) - { - mask[gid] = 0; - } - else - { - mask[gid] = 1; - } - - } -} - - -//Simple memset kernel for char arrays -kernel void fill_char_mem(global char* ary, - int size, - char pattern, - int start_at) -{ - int gid = get_global_id(0); - if ((gid >= start_at) && (gid < size)) - { - ary[gid] = pattern; - } -} - -//Simple memset kernel for int arrays -kernel void fill_int_mem(global int* ary, - int size, - int pattern, - int start_at) -{ - int gid = get_global_id(0); - if ((gid >= start_at) && (gid < size)) - { - ary[gid] = pattern; - } -} - diff --git a/silx/resources/opencl/convolution.cl b/silx/resources/opencl/convolution.cl deleted file mode 100644 index 629b8fc..0000000 --- a/silx/resources/opencl/convolution.cl +++ /dev/null @@ -1,312 +0,0 @@ -#define MAX_CONST_SIZE 16384 - -/******************************************************************************/ -/**************************** Macros ******************************************/ -/******************************************************************************/ - -// Get the center index of the filter, -// and the "half-Left" and "half-Right" lengths. -// In the case of an even-sized filter, the center is shifted to the left. -#define GET_CENTER_HL(hlen){\ - if (hlen & 1) {\ - c = hlen/2;\ - hL = c;\ - hR = c;\ - }\ - else {\ - c = hlen/2 - 1;\ - hL = c;\ - hR = c+1;\ - }\ -}\ - -// Boundary handling modes -#define CONV_MODE_REFLECT 0 // cba|abcd|dcb -#define CONV_MODE_NEAREST 1 // aaa|abcd|ddd -#define CONV_MODE_WRAP 2 // bcd|abcd|abc -#define CONV_MODE_CONSTANT 3 // 000|abcd|000 -#ifndef USED_CONV_MODE - #define USED_CONV_MODE CONV_MODE_NEAREST -#endif - -#define CONV_PERIODIC_IDX_X int idx_x = gidx - c + jx; if (idx_x < 0) idx_x += Nx; if (idx_x >= Nx) idx_x -= Nx; -#define CONV_PERIODIC_IDX_Y int idx_y = gidy - c + jy; if (idx_y < 0) idx_y += Ny; if (idx_y >= Ny) idx_y -= Ny; -#define CONV_PERIODIC_IDX_Z int idx_z = gidz - c + jz; if (idx_z < 0) idx_z += Nz; if (idx_z >= Nz) idx_z -= Nz; - -#define CONV_NEAREST_IDX_X int idx_x = clamp((int) (gidx - c + jx), 0, Nx-1); -#define CONV_NEAREST_IDX_Y int idx_y = clamp((int) (gidy - c + jy), 0, Ny-1); -#define CONV_NEAREST_IDX_Z int idx_z = clamp((int) (gidz - c + jz), 0, Nz-1); - -#define CONV_REFLECT_IDX_X int idx_x = gidx - c + jx; if (idx_x < 0) idx_x = -idx_x-1; if (idx_x >= Nx) idx_x = Nx-(idx_x-(Nx-1)); -#define CONV_REFLECT_IDX_Y int idx_y = gidy - c + jy; if (idx_y < 0) idx_y = -idx_y-1; if (idx_y >= Ny) idx_y = Ny-(idx_y-(Ny-1)); -#define CONV_REFLECT_IDX_Z int idx_z = gidz - c + jz; if (idx_z < 0) idx_z = -idx_z-1; if (idx_z >= Nz) idx_z = Nz-(idx_z-(Nz-1)); - - -#if USED_CONV_MODE == CONV_MODE_REFLECT - #define CONV_IDX_X CONV_REFLECT_IDX_X - #define CONV_IDX_Y CONV_REFLECT_IDX_Y - #define CONV_IDX_Z CONV_REFLECT_IDX_Z -#elif USED_CONV_MODE == CONV_MODE_NEAREST - #define CONV_IDX_X CONV_NEAREST_IDX_X - #define CONV_IDX_Y CONV_NEAREST_IDX_Y - #define CONV_IDX_Z CONV_NEAREST_IDX_Z -#elif USED_CONV_MODE == CONV_MODE_WRAP - #define CONV_IDX_X CONV_PERIODIC_IDX_X - #define CONV_IDX_Y CONV_PERIODIC_IDX_Y - #define CONV_IDX_Z CONV_PERIODIC_IDX_Z -#elif USED_CONV_MODE == CONV_MODE_CONSTANT - #error "constant not implemented yet" -#else - #error "Unknown convolution mode" -#endif - - - -// Image access patterns -#define READ_IMAGE_1D_X input[(gidz*Ny + gidy)*Nx + idx_x] -#define READ_IMAGE_1D_Y input[(gidz*Ny + idx_y)*Nx + gidx] -#define READ_IMAGE_1D_Z input[(idx_z*Ny + gidy)*Nx + gidx] - -#define READ_IMAGE_2D_XY input[(gidz*Ny + idx_y)*Nx + idx_x] -#define READ_IMAGE_2D_XZ input[(idx_z*Ny + gidy)*Nx + idx_x] -#define READ_IMAGE_2D_YZ input[(idx_z*Ny + idx_y)*Nx + gidx] - -#define READ_IMAGE_3D_XYZ input[(idx_z*Ny + idx_y)*Nx + idx_x] - - - -/******************************************************************************/ -/**************************** 1D Convolution **********************************/ -/******************************************************************************/ - - -// Convolution with 1D kernel along axis "X" (fast dimension) -// Works for batched 1D on 2D and batched 2D on 3D, along axis "X". -__kernel void convol_1D_X( - const __global float * input, - __global float * output, - __global float * filter, - int L, // filter size - int Nx, // input/output number of columns - int Ny, // input/output number of rows - int Nz // input/output depth -) -{ - uint gidx = get_global_id(0); - uint gidy = get_global_id(1); - uint gidz = get_global_id(2); - if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; - - int c, hL, hR; - GET_CENTER_HL(L); - float sum = 0.0f; - - for (int jx = 0; jx <= hR+hL; jx++) { - CONV_IDX_X; // Get index "x" - sum += READ_IMAGE_1D_X * filter[L-1 - jx]; - } - output[(gidz*Ny + gidy)*Nx + gidx] = sum; -} - - -// Convolution with 1D kernel along axis "Y" -// Works for batched 1D on 2D and batched 2D on 3D, along axis "Y". -__kernel void convol_1D_Y( - const __global float * input, - __global float * output, - __global float * filter, - int L, // filter size - int Nx, // input/output number of columns - int Ny, // input/output number of rows - int Nz // input/output depth -) -{ - uint gidx = get_global_id(0); - uint gidy = get_global_id(1); - uint gidz = get_global_id(2); - if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; - - int c, hL, hR; - GET_CENTER_HL(L); - float sum = 0.0f; - - for (int jy = 0; jy <= hR+hL; jy++) { - CONV_IDX_Y; // Get index "y" - sum += READ_IMAGE_1D_Y * filter[L-1 - jy]; - } - output[(gidz*Ny + gidy)*Nx + gidx] = sum; -} - - -// Convolution with 1D kernel along axis "Z" -// Works for batched 1D on 2D and batched 2D on 3D, along axis "Z". -__kernel void convol_1D_Z( - const __global float * input, - __global float * output, - __global float * filter, - int L, // filter size - int Nx, // input/output number of columns - int Ny, // input/output number of rows - int Nz // input/output depth -) -{ - uint gidx = get_global_id(0); - uint gidy = get_global_id(1); - uint gidz = get_global_id(2); - if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; - - int c, hL, hR; - GET_CENTER_HL(L); - float sum = 0.0f; - - for (int jz = 0; jz <= hR+hL; jz++) { - CONV_IDX_Z; // Get index "z" - sum += READ_IMAGE_1D_Z * filter[L-1 - jz]; - } - output[(gidz*Ny + gidy)*Nx + gidx] = sum; -} - - -/******************************************************************************/ -/**************************** 2D Convolution **********************************/ -/******************************************************************************/ - -// Convolution with 2D kernel -// Works for batched 2D on 3D. -__kernel void convol_2D_XY( - const __global float * input, - __global float * output, - __global float * filter, - int Lx, // filter number of columns, - int Ly, // filter number of rows, - int Nx, // input/output number of columns - int Ny, // input/output number of rows - int Nz // input/output depth -) -{ - uint gidx = get_global_id(0); - uint gidy = get_global_id(1); - uint gidz = get_global_id(2); - if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; - - int c, hL, hR; - GET_CENTER_HL(Lx); - float sum = 0.0f; - - for (int jy = 0; jy <= hR+hL; jy++) { - CONV_IDX_Y; // Get index "y" - for (int jx = 0; jx <= hR+hL; jx++) { - CONV_IDX_X; // Get index "x" - sum += READ_IMAGE_2D_XY * filter[(Ly-1-jy)*Lx + (Lx-1 - jx)]; - } - } - output[(gidz*Ny + gidy)*Nx + gidx] = sum; -} - - -// Convolution with 2D kernel -// Works for batched 2D on 3D. -__kernel void convol_2D_XZ( - const __global float * input, - __global float * output, - __global float * filter, - int Lx, // filter number of columns, - int Lz, // filter number of rows, - int Nx, // input/output number of columns - int Ny, // input/output number of rows - int Nz // input/output depth -) -{ - uint gidx = get_global_id(0); - uint gidy = get_global_id(1); - uint gidz = get_global_id(2); - if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; - - int c, hL, hR; - GET_CENTER_HL(Lx); - float sum = 0.0f; - - for (int jz = 0; jz <= hR+hL; jz++) { - CONV_IDX_Z; // Get index "z" - for (int jx = 0; jx <= hR+hL; jx++) { - CONV_IDX_X; // Get index "x" - sum += READ_IMAGE_2D_XZ * filter[(Lz-1-jz)*Lx + (Lx-1 - jx)]; - } - } - output[(gidz*Ny + gidy)*Nx + gidx] = sum; -} - - -// Convolution with 2D kernel -// Works for batched 2D on 3D. -__kernel void convol_2D_YZ( - const __global float * input, - __global float * output, - __global float * filter, - int Ly, // filter number of columns, - int Lz, // filter number of rows, - int Nx, // input/output number of columns - int Ny, // input/output number of rows - int Nz // input/output depth -) -{ - uint gidx = get_global_id(0); - uint gidy = get_global_id(1); - uint gidz = get_global_id(2); - if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; - - int c, hL, hR; - GET_CENTER_HL(Ly); - float sum = 0.0f; - - for (int jz = 0; jz <= hR+hL; jz++) { - CONV_IDX_Z; // Get index "z" - for (int jy = 0; jy <= hR+hL; jy++) { - CONV_IDX_Y; // Get index "y" - sum += READ_IMAGE_2D_YZ * filter[(Lz-1-jz)*Ly + (Ly-1 - jy)]; - } - } - output[(gidz*Ny + gidy)*Nx + gidx] = sum; -} - - - -/******************************************************************************/ -/**************************** 3D Convolution **********************************/ -/******************************************************************************/ - -// Convolution with 3D kernel -__kernel void convol_3D_XYZ( - const __global float * input, - __global float * output, - __global float * filter, - int Lx, // filter number of columns, - int Ly, // filter number of rows, - int Lz, // filter number of rows, - int Nx, // input/output number of columns - int Ny, // input/output number of rows - int Nz // input/output depth -) -{ - uint gidx = get_global_id(0); - uint gidy = get_global_id(1); - uint gidz = get_global_id(2); - if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; - - int c, hL, hR; - GET_CENTER_HL(Lx); - float sum = 0.0f; - - for (int jz = 0; jz <= hR+hL; jz++) { - CONV_IDX_Z; // Get index "z" - for (int jy = 0; jy <= hR+hL; jy++) { - CONV_IDX_Y; // Get index "y" - for (int jx = 0; jx <= hR+hL; jx++) { - CONV_IDX_X; // Get index "x" - sum += READ_IMAGE_3D_XYZ * filter[((Lz-1-jz)*Ly + (Ly-1-jy))*Lx + (Lx-1 - jx)]; - } - } - } - output[(gidz*Ny + gidy)*Nx + gidx] = sum; -} - diff --git a/silx/resources/opencl/convolution_textures.cl b/silx/resources/opencl/convolution_textures.cl deleted file mode 100644 index 517a67c..0000000 --- a/silx/resources/opencl/convolution_textures.cl +++ /dev/null @@ -1,374 +0,0 @@ -/******************************************************************************/ -/**************************** Macros ******************************************/ -/******************************************************************************/ - -// Error handling -#ifndef IMAGE_DIMS - #error "IMAGE_DIMS must be defined" -#endif -#ifndef FILTER_DIMS - #error "FILTER_DIMS must be defined" -#endif -#if FILTER_DIMS > IMAGE_DIMS - #error "Filter cannot have more dimensions than image" -#endif - -// Boundary handling modes -#define CONV_MODE_REFLECT 0 // CLK_ADDRESS_MIRRORED_REPEAT : cba|abcd|dcb -#define CONV_MODE_NEAREST 1 // CLK_ADDRESS_CLAMP_TO_EDGE : aaa|abcd|ddd -#define CONV_MODE_WRAP 2 // CLK_ADDRESS_REPEAT : bcd|abcd|abc -#define CONV_MODE_CONSTANT 3 // CLK_ADDRESS_CLAMP : 000|abcd|000 -#ifndef USED_CONV_MODE - #define USED_CONV_MODE CONV_MODE_NEAREST -#endif -#if USED_CONV_MODE == CONV_MODE_REFLECT - #define CLK_BOUNDARY CLK_ADDRESS_MIRRORED_REPEAT - #define CLK_COORDS CLK_NORMALIZED_COORDS_TRUE - #define USE_NORM_COORDS -#elif USED_CONV_MODE == CONV_MODE_NEAREST - #define CLK_BOUNDARY CLK_ADDRESS_CLAMP_TO_EDGE - #define CLK_COORDS CLK_NORMALIZED_COORDS_FALSE -#elif USED_CONV_MODE == CONV_MODE_WRAP - #define CLK_BOUNDARY CLK_ADDRESS_REPEAT - #define CLK_COORDS CLK_NORMALIZED_COORDS_TRUE - #define USE_NORM_COORDS -#elif USED_CONV_MODE == CONV_MODE_CONSTANT - #define CLK_BOUNDARY CLK_ADDRESS_CLAMP - #define CLK_COORDS CLK_NORMALIZED_COORDS_FALSE -#else - #error "Unknown convolution mode" -#endif - - - -// Convolution index for filter -#define FILTER_INDEX(j) (Lx - 1 - j) - -// Filter access patterns -#define READ_FILTER_1D(j) read_imagef(filter, (int2) (FILTER_INDEX(j), 0)).x; -#define READ_FILTER_2D(jx, jy) read_imagef(filter, (int2) (FILTER_INDEX(jx), FILTER_INDEX(jy))).x; -#define READ_FILTER_3D(jx, jy, jz) read_imagef(filter, (int4) (FILTER_INDEX(jx), FILTER_INDEX(jy), FILTER_INDEX(jz), 0)).x; - - -// Convolution index for image -#ifdef USE_NORM_COORDS - #define IMAGE_INDEX_X (gidx*1.0f +0.5f - c + jx)/Nx - #define IMAGE_INDEX_Y (gidy*1.0f +0.5f - c + jy)/Ny - #define IMAGE_INDEX_Z (gidz*1.0f +0.5f - c + jz)/Nz - #define RET_TYPE_1 float - #define RET_TYPE_2 float2 - #define RET_TYPE_4 float4 - #define C_ZERO 0.5f - #define GIDX (gidx*1.0f + 0.5f)/Nx - #define GIDY (gidy*1.0f + 0.5f)/Ny - #define GIDZ (gidz*1.0f + 0.5f)/Nz -#else - #define IMAGE_INDEX_X (gidx - c + jx) - #define IMAGE_INDEX_Y (gidy - c + jy) - #define IMAGE_INDEX_Z (gidz - c + jz) - #define RET_TYPE_1 int - #define RET_TYPE_2 int2 - #define RET_TYPE_4 int4 - #define C_ZERO 0 - #define GIDX gidx - #define GIDY gidy - #define GIDZ gidz -#endif - -static const sampler_t sampler = CLK_COORDS | CLK_BOUNDARY | CLK_FILTER_NEAREST; - -// Image access patterns -#define READ_IMAGE_1D read_imagef(input, sampler, (RET_TYPE_2) (IMAGE_INDEX_X, C_ZERO)).x - -#define READ_IMAGE_2D_X read_imagef(input, sampler, (RET_TYPE_2) (IMAGE_INDEX_X , GIDY)).x -#define READ_IMAGE_2D_Y read_imagef(input, sampler, (RET_TYPE_2) (GIDX, IMAGE_INDEX_Y)).x -#define READ_IMAGE_2D_XY read_imagef(input, sampler, (RET_TYPE_2) (IMAGE_INDEX_X, IMAGE_INDEX_Y)).x - -#define READ_IMAGE_3D_X read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, GIDY, GIDZ, C_ZERO)).x -#define READ_IMAGE_3D_Y read_imagef(input, sampler, (RET_TYPE_4) (GIDX, IMAGE_INDEX_Y, GIDZ, C_ZERO)).x -#define READ_IMAGE_3D_Z read_imagef(input, sampler, (RET_TYPE_4) (GIDX, GIDY, IMAGE_INDEX_Z, C_ZERO)).x -#define READ_IMAGE_3D_XY read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, IMAGE_INDEX_Y, GIDZ, C_ZERO)).x -#define READ_IMAGE_3D_XZ read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, GIDY, IMAGE_INDEX_Z, C_ZERO)).x -#define READ_IMAGE_3D_YZ read_imagef(input, sampler, (RET_TYPE_4) (GIDX, IMAGE_INDEX_Y, IMAGE_INDEX_Z, C_ZERO)).x -#define READ_IMAGE_3D_XYZ read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, IMAGE_INDEX_Y, IMAGE_INDEX_Z, C_ZERO)).x - - -// NOTE: pyopencl and OpenCL < 1.2 do not support image1d_t -#if FILTER_DIMS == 1 - #define FILTER_TYPE image2d_t - #define READ_FILTER_VAL(j) READ_FILTER_1D(j) -#elif FILTER_DIMS == 2 - #define FILTER_TYPE image2d_t - #define READ_FILTER_VAL(jx, jy) READ_FILTER_2D(jx, jy) -#elif FILTER_DIMS == 3 - #define FILTER_TYPE image3d_t - #define READ_FILTER_VAL(jx, jy, jz) READ_FILTER_3D(jx, jy, jz) -#endif - -#if IMAGE_DIMS == 1 - #define IMAGE_TYPE image2d_t - #define READ_IMAGE_X READ_IMAGE_1D -#elif IMAGE_DIMS == 2 - #define IMAGE_TYPE image2d_t - #define READ_IMAGE_X READ_IMAGE_2D_X - #define READ_IMAGE_Y READ_IMAGE_2D_Y - #define READ_IMAGE_XY READ_IMAGE_2D_XY -#elif IMAGE_DIMS == 3 - #define IMAGE_TYPE image3d_t - #define READ_IMAGE_X READ_IMAGE_3D_X - #define READ_IMAGE_Y READ_IMAGE_3D_Y - #define READ_IMAGE_Z READ_IMAGE_3D_Z - #define READ_IMAGE_XY READ_IMAGE_3D_XY - #define READ_IMAGE_XZ READ_IMAGE_3D_XZ - #define READ_IMAGE_YZ READ_IMAGE_3D_YZ - #define READ_IMAGE_XYZ READ_IMAGE_3D_XYZ -#endif - - -// Get the center index of the filter, -// and the "half-Left" and "half-Right" lengths. -// In the case of an even-sized filter, the center is shifted to the left. -#define GET_CENTER_HL(hlen){\ - if (hlen & 1) {\ - c = hlen/2;\ - hL = c;\ - hR = c;\ - }\ - else {\ - c = hlen/2 - 1;\ - hL = c;\ - hR = c+1;\ - }\ -}\ - - - -/******************************************************************************/ -/**************************** 1D Convolution **********************************/ -/******************************************************************************/ - -#if FILTER_DIMS == 1 -// Convolution with 1D kernel along axis "X" (fast dimension) -// Works for batched 1D on 2D and batched 2D on 3D, along axis "X". -__kernel void convol_1D_X_tex( - read_only IMAGE_TYPE input, - __global float * output, - read_only FILTER_TYPE filter, - int Lx, // filter size - int Nx, // input/output number of columns - int Ny, // input/output number of rows - int Nz // input/output depth -) -{ - uint gidx = get_global_id(0); - uint gidy = get_global_id(1); - uint gidz = get_global_id(2); - if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; - - int c, hL, hR; - GET_CENTER_HL(Lx); - float sum = 0.0f; - - for (int jx = 0; jx <= hR+hL; jx++) { - sum += READ_IMAGE_X * READ_FILTER_VAL(jx); - } - output[(gidz*Ny + gidy)*Nx + gidx] = sum; -} - - -#if IMAGE_DIMS >= 2 -// Convolution with 1D kernel along axis "Y" -// Works for batched 1D on 2D and batched 2D on 3D, along axis "Y". -__kernel void convol_1D_Y_tex( - read_only IMAGE_TYPE input, - __global float * output, - read_only FILTER_TYPE filter, - int Lx, // filter size - int Nx, // input/output number of columns - int Ny, // input/output number of rows - int Nz // input/output depth -) -{ - uint gidx = get_global_id(0); - uint gidy = get_global_id(1); - uint gidz = get_global_id(2); - if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; - - int c, hL, hR; - GET_CENTER_HL(Lx); - float sum = 0.0f; - - for (int jy = 0; jy <= hR+hL; jy++) { - sum += READ_IMAGE_Y * READ_FILTER_VAL(jy); - } - output[(gidz*Ny + gidy)*Nx + gidx] = sum; -} -#endif - -#if IMAGE_DIMS == 3 -// Convolution with 1D kernel along axis "Z" -// Works for batched 1D on 2D and batched 2D on 3D, along axis "Z". -__kernel void convol_1D_Z_tex( - read_only IMAGE_TYPE input, - __global float * output, - read_only FILTER_TYPE filter, - int Lx, // filter size - int Nx, // input/output number of columns - int Ny, // input/output number of rows - int Nz // input/output depth -) -{ - uint gidx = get_global_id(0); - uint gidy = get_global_id(1); - uint gidz = get_global_id(2); - if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; - - int c, hL, hR; - GET_CENTER_HL(Lx); - float sum = 0.0f; - - for (int jz = 0; jz <= hR+hL; jz++) { - sum += READ_IMAGE_Z * READ_FILTER_VAL(jz); - } - output[(gidz*Ny + gidy)*Nx + gidx] = sum; -} -#endif -#endif - -/******************************************************************************/ -/**************************** 2D Convolution **********************************/ -/******************************************************************************/ - -#if IMAGE_DIMS >= 2 && FILTER_DIMS == 2 -// Convolution with 2D kernel -// Works for batched 2D on 3D. -__kernel void convol_2D_XY_tex( - read_only IMAGE_TYPE input, - __global float * output, - read_only FILTER_TYPE filter, - int Lx, // filter number of columns, - int Ly, // filter number of rows, - int Nx, // input/output number of columns - int Ny, // input/output number of rows - int Nz // input/output depth -) -{ - uint gidx = get_global_id(0); - uint gidy = get_global_id(1); - uint gidz = get_global_id(2); - if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; - - int c, hL, hR; - GET_CENTER_HL(Lx); - float sum = 0.0f; - - for (int jy = 0; jy <= hR+hL; jy++) { - for (int jx = 0; jx <= hR+hL; jx++) { - sum += READ_IMAGE_XY * READ_FILTER_VAL(jx, jy); - } - } - output[(gidz*Ny + gidy)*Nx + gidx] = sum; -} -#endif - -#if IMAGE_DIMS == 3 && FILTER_DIMS == 2 -// Convolution with 2D kernel -// Works for batched 2D on 3D. -__kernel void convol_2D_XZ_tex( - read_only IMAGE_TYPE input, - __global float * output, - read_only FILTER_TYPE filter, - int Lx, // filter number of columns, - int Lz, // filter number of rows, - int Nx, // input/output number of columns - int Ny, // input/output number of rows - int Nz // input/output depth -) -{ - uint gidx = get_global_id(0); - uint gidy = get_global_id(1); - uint gidz = get_global_id(2); - if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; - - int c, hL, hR; - GET_CENTER_HL(Lx); - float sum = 0.0f; - - for (int jz = 0; jz <= hR+hL; jz++) { - for (int jx = 0; jx <= hR+hL; jx++) { - sum += READ_IMAGE_XZ * READ_FILTER_VAL(jx, jz); - } - } - output[(gidz*Ny + gidy)*Nx + gidx] = sum; -} - - -// Convolution with 2D kernel -// Works for batched 2D on 3D. -__kernel void convol_2D_YZ_tex( - read_only IMAGE_TYPE input, - __global float * output, - read_only FILTER_TYPE filter, - int Lx, // filter number of columns, - int Lz, // filter number of rows, - int Nx, // input/output number of columns - int Ny, // input/output number of rows - int Nz // input/output depth -) -{ - uint gidx = get_global_id(0); - uint gidy = get_global_id(1); - uint gidz = get_global_id(2); - if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; - - int c, hL, hR; - GET_CENTER_HL(Lx); - float sum = 0.0f; - - for (int jz = 0; jz <= hR+hL; jz++) { - for (int jy = 0; jy <= hR+hL; jy++) { - sum += READ_IMAGE_YZ * READ_FILTER_VAL(jy, jz); - } - } - output[(gidz*Ny + gidy)*Nx + gidx] = sum; -} -#endif - - -/******************************************************************************/ -/**************************** 3D Convolution **********************************/ -/******************************************************************************/ - -#if IMAGE_DIMS == 3 && FILTER_DIMS == 3 -// Convolution with 3D kernel -__kernel void convol_3D_XYZ_tex( - read_only IMAGE_TYPE input, - __global float * output, - read_only FILTER_TYPE filter, - int Lx, // filter number of columns, - int Ly, // filter number of rows, - int Lz, // filter number of rows, - int Nx, // input/output number of columns - int Ny, // input/output number of rows - int Nz // input/output depth -) -{ - uint gidx = get_global_id(0); - uint gidy = get_global_id(1); - uint gidz = get_global_id(2); - if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; - - int c, hL, hR; - GET_CENTER_HL(Lx); - float sum = 0.0f; - - for (int jz = 0; jz <= hR+hL; jz++) { - for (int jy = 0; jy <= hR+hL; jy++) { - for (int jx = 0; jx <= hR+hL; jx++) { - sum += READ_IMAGE_XYZ * READ_FILTER_VAL(jx, jy, jz); - } - } - } - output[(gidz*Ny + gidy)*Nx + gidx] = sum; -} -#endif diff --git a/silx/resources/opencl/doubleword.cl b/silx/resources/opencl/doubleword.cl deleted file mode 100644 index a0ebfda..0000000 --- a/silx/resources/opencl/doubleword.cl +++ /dev/null @@ -1,115 +0,0 @@ -/* - * OpenCL library for double word floating point calculation using compensated arithmetics - * - * The theoritical basis can be found in Valentina Popescu's PhD thesis: - * Towards fast and certified multi-precision libraries - * Reference LYSEN036 - * http://www.theses.fr/2017LYSEN036 - * All page number and equation number are refering to this document. - * - * The precision of the calculation (bounds) is provided in ULP (smallest possible mantissa) - * and come from the table 2.2 (page 68 of the thesis). - * The number of equivalent FLOP is taken from the table 2.3 (page 69 the thesis). - * Note that FLOP are not all equal: a division is much more expensive than an addition. - */ - -//This library can be expanded to double-double by redefining fp, fp2 and one to double, double2 and 1.0. -#ifdef DOUBLEDOUBLE -#define fp double -#define fp2 double2 -#define one 1.0 -#else -#define fp float -#define fp2 float2 -#define one 1.0f -#endif - -/* Nota: i386 computer use x87 registers which are larger than the 32bits precision - * which can invalidate the error compensation mechanism. - * - * We use the trick to declare some variable "volatile" to enforce the actual - * precision reduction of those variables. -*/ - -#ifndef X87_VOLATILE -# define X87_VOLATILE -#endif - -//Algorithm 1, p23, theorem 1.1.12. Requires e_x > e_y, valid if |x| > |y| -inline fp2 fast_fp_plus_fp(fp x, fp y){ - X87_VOLATILE fp s = x + y; - X87_VOLATILE fp z = s - x; - fp e = y - z; - return (fp2)(s, e); -} - -//Algorithm 2, p24, same as fast_fp_plus_fp without the condition on e_x and e_y -inline fp2 fp_plus_fp(fp x, fp y){ - X87_VOLATILE fp s = x + y; - X87_VOLATILE fp xp = s - y; - X87_VOLATILE fp yp = s - xp; - X87_VOLATILE fp dx = x - xp; - X87_VOLATILE fp dy = y - yp; - return (fp2)(s, dx+dy); -} - -//Algorithm 3, p24: multiplication with a FMA -inline fp2 fp_times_fp(fp x, fp y){ - fp p = x * y; - fp e = fma(x, y, -p); - return (fp2)(p, e); -} - -//Algorithm 7, p38: Addition of a FP to a DW. 10flop bounds:2u²+5u³ -inline fp2 dw_plus_fp(fp2 x, fp y){ - fp2 s = fp_plus_fp(x.s0, y); - X87_VOLATILE fp v = x.s1 + s.s1; - return fast_fp_plus_fp(s.s0, v); -} - -//Algorithm 9, p40: addition of two DW: 20flop bounds:3u²+13u³ -inline fp2 dw_plus_dw(fp2 x, fp2 y){ - fp2 s = fp_plus_fp(x.s0, y.s0); - fp2 t = fp_plus_fp(x.s1, y.s1); - fp2 v = fast_fp_plus_fp(s.s0, s.s1 + t.s0); - return fast_fp_plus_fp(v.s0, t.s1 + v.s1); -} - -//Algorithm 12, p49: Multiplication FP*DW: 6flops bounds:2u² -inline fp2 dw_times_fp(fp2 x, fp y){ - fp2 c = fp_times_fp(x.s0, y); - return fast_fp_plus_fp(c.s0, fma(x.s1, y, c.s1)); -} - -//Algorithm 14, p52: Multiplication DW*DW, 8 flops bounds:6u² -inline fp2 dw_times_dw(fp2 x, fp2 y){ - fp2 c = fp_times_fp(x.s0, y.s0); - X87_VOLATILE fp l = fma(x.s1, y.s0, x.s0 * y.s1); - return fast_fp_plus_fp(c.s0, c.s1 + l); -} - -//Algorithm 17, p55: Division DW / FP, 10flops bounds: 3.5u² -inline fp2 dw_div_fp(fp2 x, fp y){ - X87_VOLATILE fp th = x.s0 / y; - fp2 pi = fp_times_fp(th, y); - fp2 d = x - pi; - X87_VOLATILE fp delta = d.s0 + d.s1; - X87_VOLATILE fp tl = delta/y; - return fast_fp_plus_fp(th, tl); -} - -//Derived from algorithm 20, p64: Inversion 1/ DW, 22 flops -inline fp2 inv_dw(fp2 y){ - X87_VOLATILE fp th = one/y.s0; - X87_VOLATILE fp rh = fma(-y.s0, th, one); - X87_VOLATILE fp rl = -y.s1 * th; - fp2 e = fast_fp_plus_fp(rh, rl); - fp2 delta = dw_times_fp(e, th); - return dw_plus_fp(delta, th); -} - -//Algorithm 20, p64: Division DW / DW, 30 flops: bounds:9.8u² -inline fp2 dw_div_dw(fp2 x, fp2 y){ - return dw_times_dw(x, inv_dw(y)); -} - diff --git a/silx/resources/opencl/image/cast.cl b/silx/resources/opencl/image/cast.cl deleted file mode 100644 index 9e23a82..0000000 --- a/silx/resources/opencl/image/cast.cl +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Project: SILX: Alogorithms for image processing - * - * Copyright (C) 2013-2017 European Synchrotron Radiation Facility - * Grenoble, France - * - * Principal authors: J. Kieffer (kieffer@esrf.fr) - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#ifndef NB_COLOR - #define NB_COLOR 1 -#endif - - -/** - * \brief Cast values of an array of uint8 into a float output array. - * - * :param array_input: Pointer to global memory with the input data as unsigned8 array - * :param array_float: Pointer to global memory with the output data as float array - * :param width: Width of the image - * :param height: Height of the image - */ -kernel void u8_to_float( global unsigned char *array_input, - global float *array_float, - const int width, - const int height) -{ - //Global memory guard for padding - if ((get_global_id(0) < width) && (get_global_id(1)<height)) - { - int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1)); - for (int c=0; c<NB_COLOR; c++) - { - array_float[i + c] = (float) array_input[i + c]; - } //end loop over colors - } //end test in image -} //end kernel - -/** - * \brief Cast values of an array of uint8 into a float output array. - * - * :param array_input: Pointer to global memory with the input data as signed8 array - * :param array_float: Pointer to global memory with the output data as float array - * :param width: Width of the image - * :param height: Height of the image - */ -kernel void s8_to_float( global char *array_input, - global float *array_float, - const int width, - const int height) -{ - //Global memory guard for padding - if ((get_global_id(0) < width) && (get_global_id(1)<height)) - { - int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1)); - for (int c=0; c<NB_COLOR; c++) - { - array_float[i + c] = (float) array_input[i + c]; - } //end loop over colors - } //end test in image -} //end kernel - - -/** - * \brief cast values of an array of uint16 into a float output array. - * - * :param array_input: Pointer to global memory with the input data as unsigned16 array - * :param array_float: Pointer to global memory with the output data as float array - * :param width: Width of the image - * :param height: Height of the image - */ -kernel void u16_to_float(global unsigned short *array_input, - global float *array_float, - const int width, - const int height) -{ - //Global memory guard for padding - if ((get_global_id(0) < width) && (get_global_id(1)<height)) - { - int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1)); - for (int c=0; c<NB_COLOR; c++) - { - array_float[i + c] = (float) array_input[i + c]; - } //end loop over colors - } //end test in image -} //end kernel - -/** - * \brief cast values of an array of int16 into a float output array. - * - * :param array_input: Pointer to global memory with the input data as signed16 array - * :param array_float: Pointer to global memory with the output data as float array - * :param width: Width of the image - * :param height: Height of the image - */ -kernel void s16_to_float(global short *array_input, - global float *array_float, - const int width, - const int height) -{ - //Global memory guard for padding - if ((get_global_id(0) < width) && (get_global_id(1)<height)) - { - int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1)); - for (int c=0; c<NB_COLOR; c++) - { - array_float[i + c] = (float) array_input[i + c]; - } //end loop over colors - } //end test in image -}//end kernel - -/** - * \brief cast values of an array of uint32 into a float output array. - * - * :param array_input: Pointer to global memory with the input data as unsigned32 array - * :param array_float: Pointer to global memory with the output data as float array - * :param width: Width of the image - * :param height: Height of the image - */ -kernel void u32_to_float(global unsigned int *array_input, - global float *array_float, - const int width, - const int height) -{ - //Global memory guard for padding - if ((get_global_id(0) < width) && (get_global_id(1)<height)) - { - int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1)); - for (int c=0; c<NB_COLOR; c++) - { - array_float[i + c] = (float) array_input[i + c]; - } //end loop over colors - } //end test in image -}//end kernel - - -/** - * \brief convert values of an array of int32 into a float output array. - * - * :param array_input: Pointer to global memory with the data in int - * :param array_float: Pointer to global memory with the data in float - * :param width: Width of the image - * :param height: Height of the image - */ -kernel void s32_to_float(global int *array_input, - global float *array_float, - const int width, - const int height) -{ - //Global memory guard for padding - if ((get_global_id(0) < width) && (get_global_id(1)<height)) - { - int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1)); - for (int c=0; c<NB_COLOR; c++) - { - array_float[i + c] = (float) array_input[i + c]; - } //end loop over colors - } //end test in image -}//end kernel - diff --git a/silx/resources/opencl/image/histogram.cl b/silx/resources/opencl/image/histogram.cl deleted file mode 100644 index 7fb1485..0000000 --- a/silx/resources/opencl/image/histogram.cl +++ /dev/null @@ -1,178 +0,0 @@ -//CL// - -/* - * Project: SILX: Alogorithms for image processing - * - * Copyright (C) 2013-2018 European Synchrotron Radiation Facility - * Grenoble, France - * - * Principal authors: J. Kieffer (kieffer@esrf.fr) - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/* Single kernel histogram for float array. - * - * Can perform histograming in log-scale (using the arcsinh) - -Parameters: - - data: buffer with the image content in float (input) - - data_size: input - - mini: Lower bound of the first bin - - maxi: upper bouns of the last bin - - map_operation: Type of pre-processor to use. if if set to !=0, use log scale - - hist: resulting histogram (ouptut) - - hist_size: number of bins - - tmp_hist: temporary storage of size hist_size*num_groups - - processed: temporary storage of size 1 initialially set to 0 - - local_hist: local storage of size hist_size - - -Grid information: - * use the largest WG size. If it is larger than the number of bins (hist_size), - take the power of 2 immediately above - *Schedule as many WG as the device has compute engines. No need to schuedule more, - it is just a waist of memory - * The histogram should fit in shared (local) memory and tmp_hist can be large! - -Assumes: - hist and local_hist have size hist_size - edges has size hist_size+2 - tmp_hist has size hist_size*num_groups - processed is of size one and initially set to 0 - -*/ - - -static float preprocess(float value, int code) -{ - //This function can be modified to have more scales - if (code!=0) - { - return asinh(value); - } - else - { - return value; - } -} - -kernel void histogram(global float *data, - int data_size, - float mini, - float maxi, - int map_operation, - global int *hist, - global float *edges, - int hist_size, - global int *tmp_hist, - global int *processed, - local int *local_hist) -{ - // each thread - int lid = get_local_id(0); - int ws = get_local_size(0); - int nb_engine = get_num_groups(0); - int engine_id = get_group_id(0); - - // memset the local_hist array - for (int i=0; i<hist_size; i+=ws) - { - int j = i + lid; - if (j<hist_size) - { - local_hist[j] = 0; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Process the local data and accumulate in shared memory - int bloc_size = (int) ceil((float)data_size/(float)nb_engine); - int start = bloc_size * engine_id; - int stop = min(start + bloc_size, data_size); - float vmini = preprocess(mini, map_operation); - float vscale = (float)hist_size/(preprocess(maxi, map_operation) -vmini); - for (int i = start; i<stop; i+=ws) - { - int address = i + lid; - if (address < stop) - { - float value = data[address]; - if ((!isnan(value)) && (value>=mini) && (value<=maxi)) - { - float vvalue = (preprocess(value, map_operation)-vmini)*vscale; - int idx = clamp((int) vvalue, 0, hist_size - 1); - atomic_inc(&local_hist[idx]); - } - } - } - barrier(CLK_LOCAL_MEM_FENCE); - - //Now copy result into the right place and reset the first value of the shared array - for (int i=0; i<hist_size; i+=ws) - { - int j = i + lid; - if (j<hist_size) - { - tmp_hist[hist_size * engine_id + j] = local_hist[j]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - - local_hist[0] = 0; - - barrier(CLK_LOCAL_MEM_FENCE); - - //Increment the system wide shared variable processed and share the result with the workgroup - if (lid == 0) - { - local_hist[0] = atomic_inc(processed); - } - barrier(CLK_LOCAL_MEM_FENCE); - - // If we are the engine last to work, perform the concatenation of all results - - if ((local_hist[0] + 1) == nb_engine) - { - for (int i=0; i<hist_size; i+=ws) - { - int j = i + lid; - int lsum = 0; - if (j<hist_size) - { - for (int k=0; k<nb_engine; k++) - { - lsum += tmp_hist[hist_size * k + j]; - } - hist[j] = lsum; - edges[j] = vmini + j/vscale; - } - } - // Finally reset the counter - if (lid == 0) - { - processed[0] = 0; - edges[hist_size] = vmini + hist_size/vscale;; - } - - } -} diff --git a/silx/resources/opencl/image/map.cl b/silx/resources/opencl/image/map.cl deleted file mode 100644 index 804a5a1..0000000 --- a/silx/resources/opencl/image/map.cl +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Project: SILX: Data analysis library - * kernel for maximum and minimum calculation - * - * - * Copyright (C) 2013-2017 European Synchrotron Radiation Facility - * Grenoble, France - * - * Principal authors: J. Kieffer (kieffer@esrf.fr) - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - * - */ - -#ifndef NB_COLOR - #define NB_COLOR 1 -#endif - - -/** - * \brief Linear scale of signal in an image (maybe multi-color) - * - * :param image: contains the input image - * :param output: contains the output image after scaling - * :param max_min_input: 2-floats containing the maximum and the minimum value of image - * :param minimum: float scalar with the minimum of the output - * :param maximum: float scalar with the maximum of the output - * :param width: integer, number of columns the matrices - * :param height: integer, number of lines of the matrices - * - * - */ - - -kernel void normalize_image(global float* image, - global float* output, - int width, - int height, - global float* max_min_input, - float minimum, - float maximum - ) -{ - //Global memory guard for padding - if((get_global_id(0) < width) && (get_global_id(1)<height)) - { - int idx = NB_COLOR* (get_global_id(0) + width * get_global_id(1)); - float mini_in, maxi_in, scale; - maxi_in = max_min_input[0]; - mini_in = max_min_input[1]; - if (maxi_in == mini_in) - { - scale = NAN; - } - else - { - scale = (maximum - minimum) / (maxi_in - mini_in); - } - - for (int c=0; c<NB_COLOR; c++) - { - output[idx+c] = fma(scale, image[idx+c]-mini_in, minimum); - } // end color loop - }//end if in IMAGE -}//end kernel diff --git a/silx/resources/opencl/image/max_min.cl b/silx/resources/opencl/image/max_min.cl deleted file mode 100644 index 246cd48..0000000 --- a/silx/resources/opencl/image/max_min.cl +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Project: SILX: Data analysis library - * kernel for maximum and minimum calculation - * - * - * Copyright (C) 2013-2017 European Synchrotron Radiation Facility - * Grenoble, France - * - * Principal authors: J. Kieffer (kieffer@esrf.fr) - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - * - */ - -#ifndef NB_COLOR - #define NB_COLOR 1 -#endif - - - -#define REDUCE(a, b) ((float2) (fmax(a.x, b.x), fmin(a.y, b.y))) - -static float2 read_and_map(int idx, - global float* data) -{ - idx *= NB_COLOR; - float tmp = data[idx]; - float2 res = (float2) (tmp, tmp); - if (NB_COLOR > 1) - { - for (int c=1; c<NB_COLOR; c++) - { - tmp = data[idx + c]; - res = (float2) (fmax(res.x, tmp), fmin(res.y, tmp)); - } - } - return res; -} - - -/** - * \brief max_min_global_stage1: Look for the maximum an the minimum of an array. stage1 - * - * optimal workgroup size: 2^n greater than sqrt(size), limited to 512 - * optimal total item size: (workgroup size)^2 - * if size >total item size: adjust seq_count. - * - * :param data: Float pointer to global memory storing the vector of data. - * :param out: Float2 pointer to global memory storing the temporary results (workgroup size) - * :param seq_count: how many blocksize each thread should read - * :param size: size of the problem (number of element in the image - * :param l_data Shared memory: 2 float per thread in workgroup - * -**/ - - -kernel void max_min_reduction_stage1( global const float *data, - global float2 *out, - int size, - local float2 *l_data)// local storage 2 float per thread -{ - int group_size = get_local_size(0); - int lid = get_local_id(0); - float2 acc; - int big_block = group_size * get_num_groups(0); - int i = lid + group_size * get_group_id(0); - - if (lid<size) - acc = read_and_map(lid, data); - else - acc = read_and_map(0, data); - - // Linear pre-reduction stage 0 - - while (i<size){ - acc = REDUCE(acc, read_and_map(i, data)); - i += big_block; - } - - // parallel reduction stage 1 - - l_data[lid] = acc; - barrier(CLK_LOCAL_MEM_FENCE); - for (int block=group_size/2; block>1; block/=2) - { - if ((lid < block) && ((lid + block)<group_size)){ - l_data[lid] = REDUCE(l_data[lid], l_data[lid + block]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (lid == 0) - { - if (group_size > 1) - { - acc = REDUCE(l_data[0], l_data[1]); - } - else - { - acc = l_data[0]; - } - out[get_group_id(0)] = acc; - } -} - - -/** - * \brief global_max_min: Look for the maximum an the minimum of an array. - * - * - * - * :param data2: Float2 pointer to global memory storing the vector of pre-reduced data (workgroup size). - * :param maximum: Float pointer to global memory storing the maximum value - * :param minumum: Float pointer to global memory storing the minimum value - * :param l_data Shared memory: 2 float per thread in workgroup - * -**/ - -kernel void max_min_reduction_stage2( - global const float2 *data2, - global float2 *maxmin, - local float2 *l_data)// local storage 2 float per thread -{ - int lid = get_local_id(0); - int group_size = get_local_size(0); - float2 acc = (float2)(-1.0f, -1.0f); - if (lid<=group_size) - { - l_data[lid] = data2[lid]; - } - else - { - l_data[lid] = acc; - } - - // parallel reduction stage 2 - - - barrier(CLK_LOCAL_MEM_FENCE); - for (int block=group_size/2; block>1; block/=2) - { - if ((lid < block) && ((lid + block)<group_size)) - { - l_data[lid] = REDUCE(l_data[lid], l_data[lid + block]); - } - barrier(CLK_LOCAL_MEM_FENCE); - - } - - if (lid == 0 ) - { - if ( group_size > 1) - { - acc = REDUCE(l_data[0], l_data[1]); - } - else - { - acc = l_data[0]; - } - maxmin[0] = acc; - } -} - -/*This is the serial version of the min_max kernel. - * - * It has to be launched with WG=1 and only 1 WG has to be launched ! - * - * :param data: Float pointer to global memory storing the vector of data. - * :param size: size of the - * :param maximum: Float pointer to global memory storing the maximum value - * :param minumum: Float pointer to global memory storing the minimum value - * - * - */ -kernel void max_min_serial( - global const float *data, - unsigned int size, - global float2 *maxmin - ) -{ - float2 acc = read_and_map(0, data); - for (int i=1; i<size; i++) - { - acc = REDUCE(acc, read_and_map(i, data)); - } - - maxmin[0] = acc; -} diff --git a/silx/resources/opencl/kahan.cl b/silx/resources/opencl/kahan.cl deleted file mode 100644 index c23d84d..0000000 --- a/silx/resources/opencl/kahan.cl +++ /dev/null @@ -1,143 +0,0 @@ -/* - * OpenCL library for 32-bits floating point calculation using compensated arithmetics - */ - -/* Nota: i386 computer use x87 registers which are larger than the 32bits precision - * which can invalidate the error compensation mechanism. - * - * We use the trick to declare some variable "volatile" to enforce the actual - * precision reduction of those variables. -*/ - -#ifndef X87_VOLATILE -# define X87_VOLATILE -#endif - -// calculate acc.s0+value with error compensation -// see https://en.wikipedia.org/wiki/Kahan_summation_algorithm -// unlike wikipedia, here the exact value = acc.s0 + acc.s1 (performed in higher precision) - -static inline float2 kahan_sum(float2 acc, float value) -{ - if (value) - { - float sum = acc.s0; - float err = acc.s1; - if (fabs(value) > fabs(sum)) - { - float tmp = sum; - sum = value; - value = tmp; - } - - float cor = value + err; - X87_VOLATILE float target = sum + cor; - err = cor - (target - sum); - return (float2)(target, err); - } - else - { - return acc; - } -} - -// calculate a*b with error compensation -// see https://hal.archives-ouvertes.fr/hal-01367769/document -static inline float2 comp_prod(float a, float b) -{ - X87_VOLATILE float x = a * b; - float y = fma(a, b, -x); - return (float2)(x, y); -} - -// calculate a + b with error compensation -static inline float2 compensated_sum(float2 a, float2 b) -{ - float err = a.s1 + b.s1; - float first = a.s0; - float second = b.s0; - if (fabs(second) > fabs(first)) - {//swap first and second - float tmp = first; - first = second; - second = tmp; - } - float cor = second + err; - X87_VOLATILE float target = first + cor; - err = cor - (target - first); - return (float2)(target, err); -} - -// calculate a * b with error compensation -static inline float2 compensated_mul(float2 a, float2 b) -{ - float2 tmp; - tmp = kahan_sum((float2)(a.s0*b.s0, a.s0*b.s1), a.s1*b.s0); - tmp = kahan_sum(tmp, a.s1*b.s1); - return tmp; -} - -// calculate 1/a with error compensation (Needs validation) -static inline float2 compensated_inv(float2 a) -{ - float main = a.s0; - float err = a.s1; - return (float2)(1.0f/main, -err/(main*main)); -} - -// calculate a/b with error compensation (Needs validation) -static inline float2 compensated_div(float2 a, float2 b) -{ - float ah = a.s0; - float al = a.s1; - float bh = b.s0; - float bl = b.s1; - float bl_over_bh = bl/bh; - return kahan_sum(kahan_sum(a, -a.s1 * bl_over_bh), -a.s0 * bl_over_bh) / bh; -} - - -// calculate a.b where a and b are float2 -static inline float2 comp_dot2(float2 a, float2 b) -{ - return compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)); -} - -// calculate a.b where a and b are float3 -static inline float2 comp_dot3(float3 a, float3 b) -{ - return compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)), comp_prod(a.s2, b.s2)); -} - -// calculate a.b where a and b are float4 -static inline float2 comp_dot4(float4 a, float4 b) -{ - return compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)), - compensated_sum(comp_prod(a.s2, b.s2), comp_prod(a.s3, b.s3))); -} - -// calculate a.b where a and b are float8 -static inline float2 comp_dot8(float8 a, float8 b) -{ - return compensated_sum( - compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)), - compensated_sum(comp_prod(a.s2, b.s2), comp_prod(a.s3, b.s3))), - compensated_sum(compensated_sum(comp_prod(a.s4, b.s4), comp_prod(a.s5, b.s5)), - compensated_sum(comp_prod(a.s6, b.s6), comp_prod(a.s7, b.s7)))); -} - -// calculate a.b where a and b are float8 -static inline float2 comp_dot16(float16 a, float16 b) -{ - return compensated_sum( - compensated_sum( - compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)), - compensated_sum(comp_prod(a.s2, b.s2), comp_prod(a.s3, b.s3))), - compensated_sum(compensated_sum(comp_prod(a.s4, b.s4), comp_prod(a.s5, b.s5)), - compensated_sum(comp_prod(a.s6, b.s6), comp_prod(a.s7, b.s7)))), - compensated_sum( - compensated_sum(compensated_sum(comp_prod(a.s8, b.s8), comp_prod(a.s9, b.s9)), - compensated_sum(comp_prod(a.sa, b.sa), comp_prod(a.sb, b.sb))), - compensated_sum(compensated_sum(comp_prod(a.sc, b.sc), comp_prod(a.sd, b.sd)), - compensated_sum(comp_prod(a.se, b.se), comp_prod(a.sf, b.sf))))); -} diff --git a/silx/resources/opencl/linalg.cl b/silx/resources/opencl/linalg.cl deleted file mode 100644 index 8710528..0000000 --- a/silx/resources/opencl/linalg.cl +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (C) 2017 European Synchrotron Radiation Facility - * Grenoble, France - * - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - */ - -/** - * - * Compute the spatial gradient of an image. - * - * slice: input image - * slice_grad: output gradient - * sizeX: number of columns of the image - * sizeY: number of rows of the image - * - **/ -kernel void kern_gradient2D( - global float* slice, - global float2* slice_grad, - int sizeX, - int sizeY) -{ - - int gidx = (int) get_global_id(0); - int gidy = (int) get_global_id(1); - - if ((gidx < sizeX) && (gidy < sizeY)) - { - // Note the direction inconstancy ! (JK 07/2018) - - float val_y = (gidx == (sizeX-1))? 0: slice[gidy*sizeX+gidx+1] - slice[gidy*sizeX+gidx]; - float val_x = (gidy == (sizeY-1))? 0: slice[(gidy+1)*sizeX+gidx] - slice[(gidy)*sizeX+gidx]; - - slice_grad[gidy*sizeX+gidx].x = val_x; - slice_grad[gidy*sizeX+gidx].y = val_y; - } -} - -/** - * - * Compute the spatial divergence of an image gradient. - * - * slice_grad: input gradient-like image - * slice: output image - * sizeX: number of columns of the input - * sizeY: number of rows of the input - * - **/ -kernel void kern_divergence2D( - global float2* slice_grad, - global float* slice, - int sizeX, - int sizeY) -{ - int gidx = (int) get_global_id(0); - int gidy = (int) get_global_id(1); - - if (gidx < sizeX && gidy < sizeY) - { - float val_x, val_y; - val_y = (gidx == 0)? - slice_grad[(gidy)*sizeX+gidx].y : - slice_grad[(gidy)*sizeX+gidx].y - slice_grad[(gidy)*sizeX+gidx-1].y; - val_x = (gidy == 0)? - slice_grad[(gidy)*sizeX+gidx].x: - slice_grad[(gidy)*sizeX+gidx].x - slice_grad[(gidy-1)*sizeX+gidx].x; - slice[gidy*sizeX+gidx] = val_x + val_y; - } -} diff --git a/silx/resources/opencl/medfilt.cl b/silx/resources/opencl/medfilt.cl deleted file mode 100644 index 0680230..0000000 --- a/silx/resources/opencl/medfilt.cl +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Project: Azimuthal regroupping OpenCL kernel for PyFAI. - * Median filter for 1D, 2D and 3D datasets, only 2D for now - * - * - * Copyright (C) 2017-2017 European Synchrotron Radiation Facility - * Grenoble, France - * - * Principal authors: J. Kieffer (kieffer@esrf.fr) - * Last revision: 07/02/2017 - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - */ - -/* - * Needs to be concatenated with bitonic.cl prior to compilation -*/ - -/* - * Perform the 2D median filtering of an image 2D image - * - * dim0 => wg=number_of_element in the tile /8 - * dim1 = x: wg=1 - - * - * Actually the workgoup size is a bit more complicated: - * if window size = 1,3,5,7: WG1=8 - * if window size = 9,11,13,15: WG1=32 - * if window size = 17, ...,21: WG1=64 - * - * More Generally the workgroup size must be: at least: kfs2*(kfs1+7)/8 - * Each thread treats 8 values aligned vertically, this allows (almost) - * coalesced reading between threads in one line of the tile. - * - * Later on it will be more efficient to re-use the same tile - * and slide it vertically by one line. - * The additionnal need for shared memory will be kfs2 floats and a float8 as register. - * - * Theoritically, it should be possible to handle up to windows-size 83x83 - */ -__kernel void medfilt2d(__global float *image, // input image - __global float *result, // output array - __local float4 *l_data,// local storage 4x the number of threads - int khs1, // Kernel half-size along dim1 (nb lines) - int khs2, // Kernel half-size along dim2 (nb columns) - int height, // Image size along dim1 (nb lines) - int width) // Image size along dim2 (nb columns) -{ - int threadid = get_local_id(0); - //int wg = get_local_size(0); - int x = get_global_id(1); - - if (x < width) - { - union - { - float ary[8]; - float8 vec; - } output, input; - input.vec = (float8)(MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT); - int kfs1 = 2 * khs1 + 1; //definition of kernel full size - int kfs2 = 2 * khs2 + 1; - int nbands = (kfs1 + 7) / 8; // 8 elements per thread, aligned vertically in 1 column - for (int y=0; y<height; y++) - { - //Select only the active threads, some may remain inactive - int nb_threads = (nbands * kfs2); - int band_nr = threadid / kfs2; - int band_id = threadid % kfs2; - int pos_x = clamp((int)(x + band_id - khs2), (int) 0, (int) width-1); - int max_vec = clamp(kfs1 - 8 * band_nr, 0, 8); - if (y == 0) - { - for (int i=0; i<max_vec; i++) - { - if (threadid<nb_threads) - { - int pos_y = clamp((int)(y + 8 * band_nr + i - khs1), (int) 0, (int) height-1); - input.ary[i] = image[pos_x + width * pos_y]; - } - } - } - else - { - //store storage.s0 to some shared memory to retrieve it from another thread. - l_data[threadid].s0 = input.vec.s0; - - //Offset to the bottom - input.vec = (float8)(input.vec.s1, - input.vec.s2, - input.vec.s3, - input.vec.s4, - input.vec.s5, - input.vec.s6, - input.vec.s7, - MAXFLOAT); - - barrier(CLK_LOCAL_MEM_FENCE); - - int read_from = threadid + kfs2; - if (read_from < nb_threads) - input.vec.s7 = l_data[read_from].s0; - else if (threadid < nb_threads) //we are on the last band - { - int pos_y = clamp((int)(y + 8 * band_nr + max_vec - 1 - khs1), (int) 0, (int) height-1); - input.ary[max_vec - 1] = image[pos_x + width * pos_y]; - } - - } - - //This function is defined in bitonic.cl - output.vec = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0), - input.vec, l_data); - - size_t target = (kfs1 * kfs2) / 2; - if (threadid == (target / 8)) - { - result[y * width + x] = output.ary[target % 8]; - } - - } - } -} - diff --git a/silx/resources/opencl/preprocess.cl b/silx/resources/opencl/preprocess.cl deleted file mode 100644 index de35c48..0000000 --- a/silx/resources/opencl/preprocess.cl +++ /dev/null @@ -1,567 +0,0 @@ -/* - * Project: Azimuthal regroupping OpenCL kernel for PyFAI. - * Preprocessing program - * - * - * Copyright (C) 2012-2017 European Synchrotron Radiation Facility - * Grenoble, France - * - * Principal authors: J. Kieffer (kieffer@esrf.fr) - * Last revision: 19/01/2017 - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -/** - * \file - * - * \brief OpenCL kernels for image array casting, array mem-setting and normalizing - * - * Constant to be provided at build time: - * NIMAGE: size of the image - * - */ - -#include "for_eclipse.h" - -/** - * \brief cast values of an array of int8 into a float output array. - * - * - array_s8: Pointer to global memory with the input data as signed8 array - * - array_float: Pointer to global memory with the output data as float array - */ -__kernel void -s8_to_float(__global char *array_s8, - __global float *array_float -) -{ - int i = get_global_id(0); - //Global memory guard for padding - if (i < NIMAGE) - array_float[i] = (float)array_s8[i]; -} - - -/** - * \brief cast values of an array of uint8 into a float output array. - * - * - array_u8: Pointer to global memory with the input data as unsigned8 array - * - array_float: Pointer to global memory with the output data as float array - */ -__kernel void -u8_to_float(__global unsigned char *array_u8, - __global float *array_float -) -{ - int i = get_global_id(0); - //Global memory guard for padding - if (i < NIMAGE) - array_float[i] = (float)array_u8[i]; -} - - -/** - * \brief cast values of an array of int16 into a float output array. - * - * - array_s16: Pointer to global memory with the input data as signed16 array - * - array_float: Pointer to global memory with the output data as float array - */ -__kernel void -s16_to_float(__global short *array_s16, - __global float *array_float -) -{ - int i = get_global_id(0); - //Global memory guard for padding - if (i < NIMAGE) - array_float[i] = (float)array_s16[i]; -} - - -/** - * \brief cast values of an array of uint16 into a float output array. - * - * - array_u16: Pointer to global memory with the input data as unsigned16 array - * - array_float: Pointer to global memory with the output data as float array - */ -__kernel void -u16_to_float(__global unsigned short *array_u16, - __global float *array_float -) -{ - int i = get_global_id(0); - //Global memory guard for padding - if (i < NIMAGE) - array_float[i] = (float)array_u16[i]; -} - -/** - * \brief cast values of an array of uint32 into a float output array. - * - * - array_u32: Pointer to global memory with the input data as unsigned32 array - * - array_float: Pointer to global memory with the output data as float array - */ -__kernel void -u32_to_float(__global unsigned int *array_u32, - __global float *array_float -) -{ - int i = get_global_id(0); - //Global memory guard for padding - if (i < NIMAGE) - array_float[i] = (float)array_u32[i]; -} - -/** - * \brief convert values of an array of int32 into a float output array. - * - * - array_int: Pointer to global memory with the data as unsigned32 array - * - array_float: Pointer to global memory with the data float array - */ -__kernel void -s32_to_float(__global int *array_int, - __global float *array_float -) -{ - int i = get_global_id(0); - //Global memory guard for padding - if (i < NIMAGE) - array_float[i] = (float)(array_int[i]); -} - - -/** - * Functions to be called from an actual kernel. - * \brief Performs the normalization of input image by dark subtraction, - * variance is propagated to second member of the float3 - * flatfield, solid angle, polarization and absorption are stored in - * third member of the float3 returned. - * - * Invalid/Dummy pixels will all have the third-member set to 0, i.e. no weight - * - * - image Float pointer to global memory storing the input image. - * - do_dark Bool/int: shall dark-current correction be applied ? - * - dark Float pointer to global memory storing the dark image. - * - do_flat Bool/int: shall flat-field correction be applied ? - * - flat Float pointer to global memory storing the flat image. - * - do_solidangle Bool/int: shall flat-field correction be applied ? - * - solidangle Float pointer to global memory storing the solid angle of each pixel. - * - do_polarization Bool/int: shall polarization correction be applied ? - * - polarization Float pointer to global memory storing the polarization of each pixel. - * - do_absorption Bool/int: shall absorption correction be applied ? - * - absorption Float pointer to global memory storing the effective absoption of each pixel. - * - do_mask perform mask correction ? - * - mask Bool/char pointer to mask array - * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored - * - dummy Float: value for bad pixels - * - delta_dummy Float: precision for bad pixel value - * - normalization_factor : divide the input by this value - * -**/ - -static float3 _preproc3(const __global float *image, - const __global float *variance, - const char do_dark, - const __global float *dark, - const char do_dark_variance, - const __global float *dark_variance, - const char do_flat, - const __global float *flat, - const char do_solidangle, - const __global float *solidangle, - const char do_polarization, - const __global float *polarization, - const char do_absorption, - const __global float *absorption, - const char do_mask, - const __global char *mask, - const char do_dummy, - const float dummy, - const float delta_dummy, - const float normalization_factor) -{ - size_t i= get_global_id(0); - float3 result = (float3)(0.0, 0.0, 0.0); - if (i < NIMAGE) - { - if ((!do_mask) || (!mask[i])) - { - result.s0 = image[i]; - if (variance != 0) - result.s1 = variance[i]; - result.s2 = normalization_factor; - if ( (!do_dummy) - ||((delta_dummy != 0.0f) && (fabs(result.s0-dummy) > delta_dummy)) - ||((delta_dummy == 0.0f) && (result.s0 != dummy))) - { - if (do_dark) - result.s0 -= dark[i]; - if (do_dark_variance) - result.s1 += dark_variance[i]; - if (do_flat) - { - float one_flat = flat[i]; - if ( (!do_dummy) - ||((delta_dummy != 0.0f) && (fabs(one_flat-dummy) > delta_dummy)) - ||((delta_dummy == 0.0f) && (one_flat != dummy))) - result.s2 *= one_flat; - else - result.s2 = 0.0f; - } - if (do_solidangle) - result.s2 *= solidangle[i]; - if (do_polarization) - result.s2 *= polarization[i]; - if (do_absorption) - result.s2 *= absorption[i]; - if (isnan(result.s0) || isnan(result.s1) || isnan(result.s2) || (result.s2 == 0.0f)) - result = (float3)(0.0, 0.0, 0.0); - } - else - { - result = (float3)(0.0, 0.0, 0.0); - }//end if do_dummy - } // end if mask - };//end if NIMAGE - return result; -};//end function - - -/** - * \brief Performs the normalization of input image by dark subtraction, - * flatfield, solid angle, polarization and absorption division. - * - * Intensities of images are corrected by: - * - dark (read-out) noise subtraction - * - Solid angle correction (division) - * - polarization correction (division) - * - flat fiels correction (division) - * Corrections are made in place unless the pixel is dummy. - * Dummy pixels are left untouched so that they remain dummy - * - * - image Float pointer to global memory storing the input image. - * - do_dark Bool/int: shall dark-current correction be applied ? - * - dark Float pointer to global memory storing the dark image. - * - do_flat Bool/int: shall flat-field correction be applied ? - * - flat Float pointer to global memory storing the flat image. - * - do_solidangle Bool/int: shall flat-field correction be applied ? - * - solidangle Float pointer to global memory storing the solid angle of each pixel. - * - do_polarization Bool/int: shall polarization correction be applied ? - * - polarization Float pointer to global memory storing the polarization of each pixel. - * - do_absorption Bool/int: shall absorption correction be applied ? - * - absorption Float pointer to global memory storing the effective absoption of each pixel. - * - do_mask perform mask correction ? - * - mask Bool/char pointer to mask array - * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored - * - dummy Float: value for bad pixels - * - delta_dummy Float: precision for bad pixel value - * - normalization_factor : divide the input by this value - * -**/ - -__kernel void -corrections(const __global float *image, - const char do_dark, - const __global float *dark, - const char do_flat, - const __global float *flat, - const char do_solidangle, - const __global float *solidangle, - const char do_polarization, - const __global float *polarization, - const char do_absorption, - const __global float *absorption, - const char do_mask, - const __global char *mask, - const char do_dummy, - const float dummy, - const float delta_dummy, - const float normalization_factor, - __global float *output - ) -{ - size_t i= get_global_id(0); - float3 result = (float3)(0.0, 0.0, 0.0); - if (i < NIMAGE) - { - result = _preproc3(image, - 0, - do_dark, - dark, - 0, - 0, - do_flat, - flat, - do_solidangle, - solidangle, - do_polarization, - polarization, - do_absorption, - absorption, - do_mask, - mask, - do_dummy, - dummy, - delta_dummy, - normalization_factor); - if (result.s2 != 0.0f) - output[i] = result.s0 / result.s2; - else - output[i] = dummy; - };//end if NIMAGE - -};//end kernel - - -/** - * \brief Performs Normalization of input image with float2 output (num,denom) - * - * Intensities of images are corrected by: - * - dark (read-out) noise subtraction for the data - * - Solid angle correction (denominator) - * - polarization correction (denominator) - * - flat fiels correction (denominator) - * - * Corrections are made out of place. - * Dummy pixels set both the numerator and denominator to 0 - * - * - image Float pointer to global memory storing the input image. - * - do_dark Bool/int: shall dark-current correction be applied ? - * - dark Float pointer to global memory storing the dark image. - * - do_flat Bool/int: shall flat-field correction be applied ? - * - flat Float pointer to global memory storing the flat image. - * - do_solidangle Bool/int: shall flat-field correction be applied ? - * - solidangle Float pointer to global memory storing the solid angle of each pixel. - * - do_polarization Bool/int: shall flat-field correction be applied ? - * - polarization Float pointer to global memory storing the polarization of each pixel. - * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored - * - dummy Float: value for bad pixels - * - delta_dummy Float: precision for bad pixel value - * - normalization_factor : divide the input by this value - * - * -**/ -__kernel void -corrections2(const __global float *image, - const char do_dark, - const __global float *dark, - const char do_flat, - const __global float *flat, - const char do_solidangle, - const __global float *solidangle, - const char do_polarization, - const __global float *polarization, - const char do_absorption, - const __global float *absorption, - const char do_mask, - const __global char *mask, - const char do_dummy, - const float dummy, - const float delta_dummy, - const float normalization_factor, - __global float2 *output - ) -{ - size_t i= get_global_id(0); - float3 result = (float3)(0.0, 0.0, 0.0); - if (i < NIMAGE) - { - result = _preproc3(image, - 0, - do_dark, - dark, - 0, - 0, - do_flat, - flat, - do_solidangle, - solidangle, - do_polarization, - polarization, - do_absorption, - absorption, - do_mask, - mask, - do_dummy, - dummy, - delta_dummy, - normalization_factor); - output[i] = (float2)(result.s0, result.s2); - };//end if NIMAGE -};//end kernel - -/** - * \brief Performs Normalization of input image with float3 output (signal, variance, normalization) assuming poissonian signal - * - * Intensities of images are corrected by: - * - dark (read-out) noise subtraction for the data - * - Solid angle correction (denominator) - * - polarization correction (denominator) - * - flat fiels correction (denominator) - * - * Corrections are made out of place. - * Dummy pixels set both the numerator and denominator to 0 - * - * - image Float pointer to global memory storing the input image. - * - do_dark Bool/int: shall dark-current correction be applied ? - * - dark Float pointer to global memory storing the dark image. - * - do_flat Bool/int: shall flat-field correction be applied ? - * - flat Float pointer to global memory storing the flat image. - * - do_solidangle Bool/int: shall flat-field correction be applied ? - * - solidangle Float pointer to global memory storing the solid angle of each pixel. - * - do_polarization Bool/int: shall flat-field correction be applied ? - * - polarization Float pointer to global memory storing the polarization of each pixel. - * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored - * - dummy Float: value for bad pixels - * - delta_dummy Float: precision for bad pixel value - * - normalization_factor : divide the input by this value - * - * -**/ -__kernel void -corrections3Poisson( const __global float *image, - const char do_dark, - const __global float *dark, - const char do_flat, - const __global float *flat, - const char do_solidangle, - const __global float *solidangle, - const char do_polarization, - const __global float *polarization, - const char do_absorption, - const __global float *absorption, - const char do_mask, - const __global char *mask, - const char do_dummy, - const float dummy, - const float delta_dummy, - const float normalization_factor, - __global float3 *output - ) -{ - size_t i= get_global_id(0); - float3 result = (float3)(0.0, 0.0, 0.0); - if (i < NIMAGE) - { - result = _preproc3(image, - image, - do_dark, - dark, - do_dark, - dark, - do_flat, - flat, - do_solidangle, - solidangle, - do_polarization, - polarization, - do_absorption, - absorption, - do_mask, - mask, - do_dummy, - dummy, - delta_dummy, - normalization_factor); - output[i] = result; - };//end if NIMAGE -};//end kernel - - -/** - * \brief Performs Normalization of input image with float3 output (signal, variance, normalization) - * - * Intensities of images are corrected by: - * - dark (read-out) noise subtraction for the data - * - Solid angle correction (division) - * - polarization correction (division) - * - flat fiels correction (division) - * Corrections are made in place unless the pixel is dummy. - * Dummy pixels are left untouched so that they remain dummy - * - * - image Float pointer to global memory storing the input image. - * - do_dark Bool/int: shall dark-current correction be applied ? - * - dark Float pointer to global memory storing the dark image. - * - do_flat Bool/int: shall flat-field correction be applied ? - * - flat Float pointer to global memory storing the flat image. - * - do_solidangle Bool/int: shall flat-field correction be applied ? - * - solidangle Float pointer to global memory storing the solid angle of each pixel. - * - do_polarization Bool/int: shall flat-field correction be applied ? - * - polarization Float pointer to global memory storing the polarization of each pixel. - * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored - * - dummy Float: value for bad pixels - * - delta_dummy Float: precision for bad pixel value - * - normalization_factor : divide the input by this value - * - * -**/ - -__kernel void -corrections3(const __global float *image, - const __global float *variance, - const char do_dark, - const __global float *dark, - const char do_dark_variance, - const __global float *dark_variance, - const char do_flat, - const __global float *flat, - const char do_solidangle, - const __global float *solidangle, - const char do_polarization, - const __global float *polarization, - const char do_absorption, - const __global float *absorption, - const char do_mask, - const __global char *mask, - const char do_dummy, - const float dummy, - const float delta_dummy, - const float normalization_factor, - __global float3 *output - ) -{ - size_t i= get_global_id(0); - float3 result = (float3)(0.0, 0.0, 0.0); - if (i < NIMAGE) - { - result = _preproc3( image, - variance, - do_dark, - dark, - do_dark_variance, - dark_variance, - do_flat, - flat, - do_solidangle, - solidangle, - do_polarization, - polarization, - do_absorption, - absorption, - do_mask, - mask, - do_dummy, - dummy, - delta_dummy, - normalization_factor); - output[i] = result; - };//end if NIMAGE -};//end kernel - - diff --git a/silx/resources/opencl/proj.cl b/silx/resources/opencl/proj.cl deleted file mode 100644 index 2a6d870..0000000 --- a/silx/resources/opencl/proj.cl +++ /dev/null @@ -1,345 +0,0 @@ -/* - * Copyright (C) 2017-2017 European Synchrotron Radiation Facility - * Grenoble, France - * - * Based on the projector of PyHST2 - https://forge.epn-campus.eu/projects/pyhst2 - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - */ - -/*******************************************************************************/ -/************************ GPU VERSION (with textures) **************************/ -/*******************************************************************************/ - -#ifndef DONT_USE_TEXTURES -kernel void forward_kernel( - global float *d_Sino, - read_only image2d_t d_slice, - int dimslice, - int num_bins, - global float* angles_per_project , - float axis_position, - global float *d_axis_corrections, - global int* d_beginPos , - global int* d_strideJoseph, - global int* d_strideLine , - int num_projections, - int dimrecx, - int dimrecy, - float cpu_offset_x, - float cpu_offset_y, - int josephnoclip, - int normalize) -{ - const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; - const int tidx = get_local_id(0); - const int bidx = get_group_id(0); - const int tidy = get_local_id(1); - const int bidy = get_group_id(1); - float angle; - float cos_angle,sin_angle ; - - local float corrections[16]; - local int beginPos[16*2]; - local int strideJoseph[16*2]; - local int strideLine[16*2]; - - // thread will use corrections[tidy] - // All are read by first warp - int offset, OFFSET; - switch(tidy) { - case 0: - corrections[ tidx ]= d_axis_corrections[ bidy*16+tidx]; - break; - case 1: - case 2: - offset = 16*(tidy-1); - OFFSET = dimrecy*(tidy-1); - beginPos [offset + tidx ]= d_beginPos[ OFFSET+ bidy*16+tidx] ; - break; - case 3: - case 4: - offset = 16*(tidy-3); - OFFSET = dimrecy*(tidy-3); - strideJoseph[offset + tidx ]= d_strideJoseph[OFFSET + bidy*16+tidx] ; - break; - case 5: - case 6: - offset = 16*(tidy-5); - OFFSET = dimrecy*(tidy-5); - strideLine[offset + tidx ]= d_strideLine[OFFSET + bidy*16+tidx] ; - break; - } - barrier(CLK_LOCAL_MEM_FENCE); - - angle = angles_per_project[ bidy*16+tidy ] ; - cos_angle = cos(angle); - sin_angle = sin(angle); - - if(fabs(cos_angle) > 0.70710678f ) { - if( cos_angle>0) { - cos_angle = cos(angle); - sin_angle = sin(angle); - } - else { - cos_angle = -cos(angle); - sin_angle = -sin(angle); - } - } - else { - if( sin_angle>0) { - cos_angle = sin(angle); - sin_angle = -cos(angle); - } - else { - cos_angle = -sin(angle); - sin_angle = cos(angle); - } - } - float res=0.0f; - float axis_corr = axis_position + corrections[ tidy ]; - float axis = axis_position ; - float xpix = ( bidx*16+tidx )-cpu_offset_x; - float posx = axis*(1.0f-sin_angle/cos_angle ) +(xpix-(axis_corr) )/cos_angle ; - - float shiftJ = sin_angle/cos_angle; - float x1 = min(-sin_angle/cos_angle ,0.f); - float x2 = max(-sin_angle/cos_angle ,0.f); - - float Area; - Area=1.0f/cos_angle; - int stlA, stlB , stlAJ, stlBJ ; - stlA=strideLine[16+tidy]; - stlB=strideLine[tidy]; - stlAJ=strideJoseph[16+tidy]; - stlBJ=strideJoseph[tidy]; - - int beginA = beginPos[16+tidy ]; - int beginB = beginPos[tidy ]; - float add; - int l; - - if(josephnoclip) { - for(int j=0; j<dimslice; j++) { // j: Joseph - x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f; - x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f; - add = read_imagef(d_slice, sampler, (float2) (x1, x2)).x; // add = tex2D(texSlice, x1,x2); - res += add; - posx += shiftJ; - } - } - else { - for(int j=0; j<dimslice; j++) { // j: Joseph - x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f; - x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f; - l = (x1>=0.0f )*(x1<(dimslice+2))*( x2>=0.0f)*( x2<(dimslice+2) ) ; - add = read_imagef(d_slice, sampler, (float2) (x1, x2)).x; // add = tex2D(texSlice, x1,x2); - res += add*l; - posx += shiftJ; - } - } - - if((bidy*16 + tidy) < num_projections && (bidx*16 + tidx) < num_bins) { - res *= Area; - if (normalize) - res *= M_PI_F * 0.5f / num_projections; - d_Sino[dimrecx*(bidy*16 + tidy) + (bidx*16 + tidx)] = res; - } -} -#endif - - -/*******************************************************************************/ -/********************* CPU VERSION (without textures) **************************/ -/*******************************************************************************/ - - -kernel void forward_kernel_cpu( - global float *d_Sino, - global float* d_slice, - int dimslice, - int num_bins, - global float* angles_per_project , - float axis_position, - global float *d_axis_corrections, - global int* d_beginPos , - global int* d_strideJoseph, - global int* d_strideLine , - int num_projections, - int dimrecx, - int dimrecy, - float cpu_offset_x, - float cpu_offset_y, - int josephnoclip, - int normalize) -{ - - const int tidx = get_local_id(0); - const int bidx = get_group_id(0); - const int tidy = get_local_id(1); - const int bidy = get_group_id(1); - float angle; - float cos_angle,sin_angle ; - - local float corrections[16]; - local int beginPos[16*2]; - local int strideJoseph[16*2]; - local int strideLine[16*2]; - - // thread will use corrections[tidy] - // All are read by first warp - int offset, OFFSET; - switch(tidy) { - case 0: - corrections[ tidx ]= d_axis_corrections[ bidy*16+tidx]; - break; - case 1: - case 2: - offset = 16*(tidy-1); - OFFSET = dimrecy*(tidy-1); - beginPos [offset + tidx ]= d_beginPos[ OFFSET+ bidy*16+tidx] ; - break; - case 3: - case 4: - offset = 16*(tidy-3); - OFFSET = dimrecy*(tidy-3); - strideJoseph[offset + tidx ]= d_strideJoseph[OFFSET + bidy*16+tidx] ; - break; - case 5: - case 6: - offset = 16*(tidy-5); - OFFSET = dimrecy*(tidy-5); - strideLine[offset + tidx ]= d_strideLine[OFFSET + bidy*16+tidx] ; - break; - } - barrier(CLK_LOCAL_MEM_FENCE); - - angle = angles_per_project[ bidy*16+tidy ] ; - cos_angle = cos(angle); - sin_angle = sin(angle); - - if(fabs(cos_angle) > 0.70710678f ) { - if( cos_angle>0) { - cos_angle = cos(angle); - sin_angle = sin(angle); - } - else { - cos_angle = -cos(angle); - sin_angle = -sin(angle); - } - } - else { - if( sin_angle>0) { - cos_angle = sin(angle); - sin_angle = -cos(angle); - } - else { - cos_angle = -sin(angle); - sin_angle = cos(angle); - } - } - float res=0.0f; - float axis_corr = axis_position + corrections[ tidy ]; - float axis = axis_position ; - float xpix = ( bidx*16+tidx )-cpu_offset_x; - float posx = axis*(1.0f-sin_angle/cos_angle ) +(xpix-(axis_corr) )/cos_angle ; - - float shiftJ = sin_angle/cos_angle; - float x1 = min(-sin_angle/cos_angle ,0.f); - float x2 = max(-sin_angle/cos_angle ,0.f); - - float Area; - Area=1.0f/cos_angle; - int stlA, stlB , stlAJ, stlBJ ; - stlA=strideLine[16+tidy]; - stlB=strideLine[tidy]; - stlAJ=strideJoseph[16+tidy]; - stlBJ=strideJoseph[tidy]; - - int beginA = beginPos[16+tidy ]; - int beginB = beginPos[tidy ]; - int l; - - int ym, yp, xm, xp; - float yc, xc; - float val; - if(josephnoclip) { - for(int j=0; j<dimslice; j++) { // j: Joseph - x1 = beginA +(posx)*stlA + (j)*stlAJ+1.0f; - x2 = beginB +(posx)*stlB + (j)*stlBJ+1.0f; - /* - Bilinear interpolation - */ - yc = fmin(fmax(x2, 0.0f), ((dimslice+2) - 1.0f)); // y_clipped - ym = (int) floor(yc); // y_minus - yp = (int) ceil(yc); // y_plus - - xc = fmin(fmax(x1, 0.0f), ((dimslice+2) - 1.0f)); // x_clipped - xm = (int) floor(xc); // x_minus - xp = (int) ceil(xc); // x_plus - - if ((ym == yp) && (xm == xp)) val = d_slice[ym*(dimslice+2) + xm]; - else if (ym == yp) val = (d_slice[ym*(dimslice+2) + xm] * (xp - xc)) + (d_slice[ym*(dimslice+2) + xp] * (xc - xm)); - else if (xm == xp) val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc)) + (d_slice[yp*(dimslice+2) + xm] * (yc - ym)); - else val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc) * (xp - xc)) - + (d_slice[yp*(dimslice+2) + xm] * (yc - ym) * (xp - xc)) - + (d_slice[ym*(dimslice+2) + xp] * (yp - yc) * (xc - xm)) - + (d_slice[yp*(dimslice+2) + xp] * (yc - ym) * (xc - xm)); - // ---------- - res += val; - posx += shiftJ; - } - } - else { - for(int j=0; j<dimslice; j++) { // j: Joseph - x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f; - x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f; - l = (x1>=0.0f )*(x1<(dimslice+2))*( x2>=0.0f)*( x2<(dimslice+2) ) ; - /* - Bilinear interpolation - */ - yc = fmin(fmax(x2, 0.0f), ((dimslice+2) - 1.0f)); // y_clipped - ym = (int) floor(yc); // y_minus - yp = (int) ceil(yc); // y_plus - - xc = fmin(fmax(x1, 0.0f), ((dimslice+2) - 1.0f)); // x_clipped - xm = (int) floor(xc); // x_minus - xp = (int) ceil(xc); // x_plus - - if ((ym == yp) && (xm == xp)) val = d_slice[ym*(dimslice+2) + xm]; - else if (ym == yp) val = (d_slice[ym*(dimslice+2) + xm] * (xp - xc)) + (d_slice[ym*(dimslice+2) + xp] * (xc - xm)); - else if (xm == xp) val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc)) + (d_slice[yp*(dimslice+2) + xm] * (yc - ym)); - else val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc) * (xp - xc)) - + (d_slice[yp*(dimslice+2) + xm] * (yc - ym) * (xp - xc)) - + (d_slice[ym*(dimslice+2) + xp] * (yp - yc) * (xc - xm)) - + (d_slice[yp*(dimslice+2) + xp] * (yc - ym) * (xc - xm)); - // ---------- - res += val*l; - posx += shiftJ; - } - } - - if((bidy*16 + tidy) < num_projections && (bidx*16 + tidx) < num_bins) { - res *= Area; - if (normalize) - res *= M_PI_F * 0.5f / num_projections; - d_Sino[dimrecx*(bidy*16 + tidy) + (bidx*16 + tidx)] = res; - } -} diff --git a/silx/resources/opencl/sparse.cl b/silx/resources/opencl/sparse.cl deleted file mode 100644 index c99a0e9..0000000 --- a/silx/resources/opencl/sparse.cl +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (C) 2016-2019 European Synchrotron Radiation Facility - * Grenoble, France - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#ifndef IMAGE_WIDTH - #error "Please define IMAGE_WIDTH parameter" -#endif - -#ifndef DTYPE - #error "Please define DTYPE parameter" -#endif - -#ifndef IDX_DTYPE - #error "Please define IDX_DTYPE parameter" -#endif - - - -/** - * Densify a matric from CSR format to "dense" 2D format. - * The input CSR data consists in 3 arrays: (data, ind, iptr). - * The output array is a 2D array of dimensions IMAGE_WIDTH * image_height. - * - * data: 1D array containing the nonzero data items. - * ind: 1D array containing the column indices of the nonzero data items. - * iptr: 1D array containing indirection indices, such that range - * [iptr[i], iptr[i+1]-1] of "data" and "ind" contain the relevant data - * of output row "i". - * output: 2D array containing the densified data. - * image_height: height (number of rows) of the output data. -**/ - -kernel void densify_csr( - const global DTYPE* data, - const global IDX_DTYPE* ind, - const global IDX_DTYPE* iptr, - global DTYPE* output, - int image_height -) -{ - uint tid = get_local_id(0); - uint row_idx = get_global_id(1); - if ((tid >= IMAGE_WIDTH) || (row_idx >= image_height)) return; - - local DTYPE line[IMAGE_WIDTH]; - - // Memset - //~ #pragma unroll - for (int k = 0; tid+k < IMAGE_WIDTH; k += get_local_size(0)) { - if (tid+k >= IMAGE_WIDTH) break; - line[tid+k] = 0.0f; - } - barrier(CLK_LOCAL_MEM_FENCE); - - - uint start = iptr[row_idx], end = iptr[row_idx+1]; - //~ #pragma unroll - for (int k = start; k < end; k += get_local_size(0)) { - // Current work group handles one line of the final array - // on the current line, write data[start+tid] at column index ind[start+tid] - if (k+tid < end) - line[ind[k+tid]] = data[k+tid]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // write the current line (shared mem) into the output array (global mem) - //~ #pragma unroll - for (int k = 0; tid+k < IMAGE_WIDTH; k += get_local_size(0)) { - output[row_idx*IMAGE_WIDTH + tid+k] = line[tid+k]; - if (k+tid >= IMAGE_WIDTH) return; - } -} diff --git a/silx/resources/opencl/statistics.cl b/silx/resources/opencl/statistics.cl deleted file mode 100644 index 47d925b..0000000 --- a/silx/resources/opencl/statistics.cl +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Project: Silx statics calculation - * - * - * - * Copyright (C) 2012-2021 European Synchrotron Radiation Facility - * Grenoble, France - * - * Principal authors: J. Kieffer (kieffer@esrf.fr) - * Last revision: 17/05/2021 - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -/** - * \file - * - * \brief OpenCL kernels for min, max, mean and std calculation - * - * This module provides two functions to perform the `map` and the `reduce` - * to be used with pyopencl reduction to calculate in a single pass the minimum, - * maximum, sum, count, mean and standart deviation for an array. - * - * So beside the reduction mechanisme from pyopencl, this algorithm implementes equations from - * https://dbs.ifi.uni-heidelberg.de/files/Team/eschubert/publications/SSDBM18-covariance-authorcopy.pdf - * - * let A and B be 2 disjoint partition of all elements - * - * Omega_A = sum_{i \in A}(omaga_i) The sum of all weights - * V_A is the weighted sum of the signal over the partition - * VV_A is the weighted sum of deviation squarred - * - * With this the mean is V / Omega and the variance equals VV / omega. - * - * Redction operator performs: - * Omega_{AB} = Omega_A + Omega_B - * V_{AB} = V_A + V_B - * VV{AB} = VV_A + VV_B + (Omega_A*V_B-Omega_B*V_A)² / (Omega_A * Omega_B * Omega_{AB}) - * - * To avoid any numerical degradation, the doubleword library is used to perform all floating point operations. - * - */ -#include "for_eclipse.h" - -/* \brief read a value at given position and initialize the float8 for the reduction - * - * The float8 returned contains: - * s0: minimum value - * s1: maximum value - * s2: Omega_h count number of valid pixels - * s3: Omega_l error associated to the count - * s4: V_h sum of signal - * s5: V_l error associated to the sum of signal - * s6: VVh variance*count - * s7: VVl error associated to variance*count - * - */ -static inline float8 map_statistics(global float* data, int position) -{ - float value = data[position]; - float8 result; - - if (isfinite(value)) - { - result = (float8)(value, value, 1.0f, 0.0f, value, 0.0f, 0.0f, 0.0f); - // min max cnt_h cnt_l sum_h sum_l M2_h M2_l - } - else - { - result = (float8)(FLT_MAX, -FLT_MAX, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f); - // min max cnt_h cnt_l sum_h sum_l M2_h M2_l - } - return result; -} - -/* \brief reduction function associated to the statistics. - * - * this is described in: - * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm - * - * The float8 used here contain contains: - * s0: minimum value - * s1: maximum value - * s2: count number of valid pixels - * s3: count (error associated to) - * s4: sum of valid pixels - * s5: sum (error associated to) - * s6: M=variance*(count-1) - * s7: M=variance*(count-1) (error associated to) - * - */ - -static inline float8 reduce_statistics(float8 a, float8 b) -{ - float2 sum_a, sum_b, M_a, M_b, count_a, count_b; - float2 count, sum; - float2 M, delta, delta2, omega3; - //test on count - if (a.s2 == 0.0f) - { - return b; - } - else - { - count_a = (float2)(a.s2, a.s3); - sum_a = (float2)(a.s4, a.s5); - M_a = (float2)(a.s6, a.s7); - } - //test on count - if (b.s2 == 0.0f) - { - return a; - } - else - { - count_b = (float2)(b.s2, b.s3); - sum_b = (float2)(b.s4, b.s5); - M_b = (float2)(b.s6, b.s7); - } - // count = count_a + count_b - count = dw_plus_dw(count_a, count_b); - // sum = sum_a + sum_b - sum = dw_plus_dw(sum_a, sum_b); - - // M2 = M_a + M_b + (Omega_A*V_B-Omega_B*V_A)² / (Omega_A * Omega_B * Omega_{AB}) - M = dw_plus_dw(M_a, M_b); - delta = dw_plus_dw(dw_times_dw(count_b, sum_a), - -dw_times_dw(count_a, sum_b)); - delta2 = dw_times_dw(delta, delta); - omega3 = dw_times_dw(count, dw_times_dw(count_a, count_b)); - M = dw_plus_dw(M, dw_div_dw(delta2, omega3)); - - float8 result = (float8)(min(a.s0, b.s0), max(a.s1, b.s1), - count.s0, count.s1, - sum.s0, sum.s1, - M.s0, M.s1); - return result; -} - -/* \brief reduction function associated to the statistics without compensated arithmetics. - * - * this is described in: - * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm - * - * The float8 used here contain contains: - * s0: minimum value - * s1: maximum value - * s2: count number of valid pixels (major) - * s3: count number of valid pixels (minor) - * s4: sum of valid pixels (major) - * s5: sum of valid pixels (minor) - * s6: variance*count (major) - * s7: variance*count (minor) - * - */ - -static inline float8 reduce_statistics_simple(float8 a, float8 b) -{ - float sum_a, sum_b, M_a, M_b, count_a, count_b; - - //test on count - if (a.s2 == 0.0f) - { - return b; - } - else - { - count_a = a.s2; - sum_a = a.s4; - M_a = a.s6; - } - //test on count - if (b.s2 == 0.0f) - { - return a; - } - else - { - count_b = b.s2; - sum_b = b.s4; - M_b = b.s6; - } - float count = count_a + count_b; - float sum = sum_a + sum_b; - float delta = sum_a*count_b - sum_b*count_a; - float delta2 = delta * delta; - float M2 = M_a + M_b + delta2/(count*count_a*count_b); - float8 result = (float8)(min(a.s0, b.s0), max(a.s1, b.s1), - count, 0.0f, - sum, 0.0f, - M2, 0.0f); - return result; -} - - -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - - -// unpack a double in two floats such as the sum of the two is the double number -static inline float2 unpack_double(double inp){ - float major = (float) inp; - float minor = (float) (inp - major); - return (float2)(major, minor); -} - -// pack two floats into a double -static inline double pack_double(float major, float minor){ - return (double)major + (double)minor; -} - -/* \brief reduction function associated to the statistics using double precision arithmetics - * - * this is described in: - * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm - * - * The float8 used here contain contains: - * s0: minimum value - * s1: maximum value - * s2: count number of valid pixels (major) - * s3: count number of valid pixels (minor) - * s4: sum of valid pixels (major) - * s5: sum of valid pixels (minor) - * s6: variance*count (major) - * s7: variance*count (minor) - * - */ - -static inline float8 reduce_statistics_double(float8 a, float8 b) -{ - double sum_a, sum_b, M_a, M_b, count_a, count_b; - - //test on count - if (a.s2 == 0.0) - { - return b; - } - else - { - count_a = pack_double(a.s2, a.s3); - sum_a = pack_double(a.s4,a.s5); - M_a = pack_double(a.s6, a.s7); - } - //test on count - if (b.s2 == 0.0) - { - return a; - } - else - { - count_b = pack_double(b.s2, b.s3); - sum_b = pack_double(b.s4, b.s5); - M_b = pack_double(b.s6, b.s7); - } - double count = count_a + count_b; - double sum = sum_a + sum_b; - double delta = sum_a*count_b - sum_b*count_a; - double delta2 = delta * delta; - double M2 = M_a + M_b + delta2/(count*count_a*count_b); - float8 result = (float8)((float2)(min(a.s0, b.s0), max(a.s1, b.s1)), - unpack_double(count), - unpack_double( sum), - unpack_double( M2)); - return result; -} - -#endif
\ No newline at end of file |