diff options
Diffstat (limited to 'src/silx/resources/opencl')
21 files changed, 4964 insertions, 0 deletions
diff --git a/src/silx/resources/opencl/addition.cl b/src/silx/resources/opencl/addition.cl new file mode 100644 index 0000000..35d7996 --- /dev/null +++ b/src/silx/resources/opencl/addition.cl @@ -0,0 +1,42 @@ +/* + * Project: SIFT: An algorithm for image alignement + * + * Copyright (C) 2013-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. Kieffer (kieffer@esrf.fr) + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + + +// "Hello_world" kernel to test if OpenCL is actually working +kernel void addition(global float* a, + global float* b, + global float* res, + int N) +{ + int i = get_global_id(0); + if( i<N ){ + res[i] = a[i] + b[i]; + } +} diff --git a/src/silx/resources/opencl/array_utils.cl b/src/silx/resources/opencl/array_utils.cl new file mode 100644 index 0000000..6f78921 --- /dev/null +++ b/src/silx/resources/opencl/array_utils.cl @@ -0,0 +1,73 @@ +/** + * 2D Memcpy for float* arrays, + * replacing pyopencl "enqueue_copy" which does not work for rectangular copies. + * ALL THE SIZES/OFFSETS ARE SPECIFIED IN PIXELS, NOT IN BYTES. + * In the (x, y) convention, x is the fast index (as in CUDA). + * + * :param dst: destination array + * :param src: source array + * :param dst_width: width of the dst array + * :param src_width: width of the src array + * :param dst_offset: tuple with the offset (x, y) in the dst array + * :param src_offset: tuple with the offset (x, y) in the src array + * :param transfer_shape: shape of the transfer array in the form (x, y) + * + */ +kernel void cpy2d( + global float* dst, + global float* src, + int dst_width, + int src_width, + int2 dst_offset, + int2 src_offset, + int2 transfer_shape) +{ + int gidx = get_global_id(0), gidy = get_global_id(1); + if (gidx < transfer_shape.x && gidy < transfer_shape.y) { + dst[(dst_offset.y + gidy)*dst_width + (dst_offset.x + gidx)] = src[(src_offset.y + gidy)*src_width + (src_offset.x + gidx)]; + } +} + + +// Looks like cfloat_t and cfloat_mul are not working, yet specified in +// pyopencl documentation. 
Here we are using float2 as in all available examples +// #include <pyopencl-complex.h> +// typedef cfloat_t complex; + +static inline float2 complex_mul(float2 a, float2 b) { + float2 res = (float2) (0, 0); + res.x = a.x * b.x - a.y * b.y; + res.y = a.y * b.x + a.x * b.y; + return res; +} + +// arr2D *= arr1D (line by line, i.e along fast dim) +kernel void inplace_complex_mul_2Dby1D( + global float2* arr2D, + global float2* arr1D, + int width, + int height) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if ((x >= width) || (y >= height)) return; + int i = y*width + x; + arr2D[i] = complex_mul(arr2D[i], arr1D[x]); +} + + +// arr3D *= arr1D (along fast dim) +kernel void inplace_complex_mul_3Dby1D( + global float2* arr3D, + global float2* arr1D, + int width, + int height, + int depth) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + if ((x >= width) || (y >= height) || (z >= depth)) return; + int i = (z*height + y)*width + x; + arr3D[i] = complex_mul(arr3D[i], arr1D[x]); +} diff --git a/src/silx/resources/opencl/backproj.cl b/src/silx/resources/opencl/backproj.cl new file mode 100644 index 0000000..da15131 --- /dev/null +++ b/src/silx/resources/opencl/backproj.cl @@ -0,0 +1,232 @@ +/* + * Project: silx: filtered backprojection + * + * Copyright (C) 2016-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: A. Mirone + * P. 
Paleo + * + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + + +/*******************************************************************************/ +/************************ GPU VERSION (with textures) **************************/ +/*******************************************************************************/ + +#ifndef DONT_USE_TEXTURES +kernel void backproj_kernel( + int num_proj, + int num_bins, + float axis_position, + global float *d_SLICE, + read_only image2d_t d_sino, + float gpu_offset_x, + float gpu_offset_y, + global float * d_cos_s, // precalculated cos(theta[i]) + global float* d_sin_s, // precalculated sin(theta[i]) + global float* d_axis_s, // array of axis positions (n_projs) + local float* shared2) // 768B of local mem +{ + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; + const int tidx = get_local_id(0); //threadIdx.x; + const int bidx = get_group_id(0); //blockIdx.x; + const int tidy = get_local_id(1); //threadIdx.y; + const int bidy = get_group_id(1); //blockIdx.y; + + local float sh_cos[256]; + local float sh_sin[256]; + local float sh_axis[256]; + + float pcos, psin; + float h0, h1, h2, h3; + const float apos_off_x= gpu_offset_x - axis_position ; + const float apos_off_y= gpu_offset_y - axis_position ; + float acorr05; + float res0 = 0, res1 = 0, res2 = 0, res3 = 0; + + const float bx00 = (32 * bidx + 2 * tidx + 0 + apos_off_x ) ; + const float by00 = (32 * bidy + 2 * tidy + 0 + apos_off_y ) ; + + int read=0; + for(int proj=0; proj<num_proj; proj++) { + if(proj>=read) { + barrier(CLK_LOCAL_MEM_FENCE); + int ip = tidy*16+tidx; + if( read+ip < num_proj) { + sh_cos [ip] = d_cos_s[read+ip] ; + sh_sin [ip] = d_sin_s[read+ip] ; + sh_axis[ip] = d_axis_s[read+ip] ; + } + read=read+256; // 256=16*16 block size + barrier(CLK_LOCAL_MEM_FENCE); + } + pcos = sh_cos[256-read + proj] ; + psin = sh_sin[256-read + proj] ; + + acorr05 = sh_axis[256 - read + proj] ; + + h0 = (acorr05 + bx00*pcos - by00*psin); + h1 = (acorr05 + (bx00+0)*pcos - (by00+1)*psin); + 
h2 = (acorr05 + (bx00+1)*pcos - (by00+0)*psin); + h3 = (acorr05 + (bx00+1)*pcos - (by00+1)*psin); + + if(h0>=0 && h0<num_bins) res0 += read_imagef(d_sino, sampler, (float2) (h0 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h0 +0.5f,proj +0.5f); + if(h1>=0 && h1<num_bins) res1 += read_imagef(d_sino, sampler, (float2) (h1 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h1 +0.5f,proj +0.5f); + if(h2>=0 && h2<num_bins) res2 += read_imagef(d_sino, sampler, (float2) (h2 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h2 +0.5f,proj +0.5f); + if(h3>=0 && h3<num_bins) res3 += read_imagef(d_sino, sampler, (float2) (h3 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h3 +0.5f,proj +0.5f); + } + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 0] = res0; + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 0] = res1; + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 1] = res2; + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 1] = res3; +} +#endif + + + + +/*******************************************************************************/ +/********************* CPU VERSION (without textures) **************************/ +/*******************************************************************************/ + + +#define CLIP_MAX(x, N) (fmin(fmax(x, 0.0f), (N - 1.0f))) + +#define FLOORCEIL_x(x) {\ + xm = (int) floor(x);\ + xp = (int) ceil(x);\ +} + +#define ADJACENT_PIXELS_VALS(arr, Nx, y, xm, xp) ((float2) (arr[y*Nx+xm], arr[y*Nx+xp])) + +//Simple linear interpolator for working on the GPU +static float linear_interpolation(float2 vals, + float x, + int xm, + int xp) +{ + if (xm == xp) + return vals.s0; + else + return (vals.s0 * (xp - x)) + (vals.s1 * (x - xm)); +} + +/** + * + * Same kernel as backproj_kernel, but targets the CPU (no texture) + * +**/ +kernel void backproj_cpu_kernel( + int num_proj, + int num_bins, + float axis_position, + global float *d_SLICE, + global float* d_sino, + float gpu_offset_x, + float gpu_offset_y, 
+ global float * d_cos_s, // precalculated cos(theta[i]) + global float * d_sin_s, // precalculated sin(theta[i]) + global float * d_axis_s, // array of axis positions (n_projs) + local float* shared2) // 768B of local mem +{ + const int tidx = get_local_id(0); //threadIdx.x; + const int bidx = get_group_id(0); //blockIdx.x; + const int tidy = get_local_id(1); //threadIdx.y; + const int bidy = get_group_id(1); //blockIdx.y; + + local float sh_cos[256]; + local float sh_sin[256]; + local float sh_axis[256]; + + float pcos, psin; + float h0, h1, h2, h3; + const float apos_off_x= gpu_offset_x - axis_position ; + const float apos_off_y= gpu_offset_y - axis_position ; + float acorr05; + float res0 = 0, res1 = 0, res2 = 0, res3 = 0; + + const float bx00 = (32 * bidx + 2 * tidx + 0 + apos_off_x ) ; + const float by00 = (32 * bidy + 2 * tidy + 0 + apos_off_y ) ; + + int read=0; + for(int proj=0; proj<num_proj; proj++) { + if(proj>=read) { + barrier(CLK_LOCAL_MEM_FENCE); + int ip = tidy*16+tidx; + if( read+ip < num_proj) { + sh_cos [ip] = d_cos_s[read+ip] ; + sh_sin [ip] = d_sin_s[read+ip] ; + sh_axis[ip] = d_axis_s[read+ip] ; + } + read=read+256; // 256=16*16 block size + barrier(CLK_LOCAL_MEM_FENCE); + } + pcos = sh_cos[256-read + proj] ; + psin = sh_sin[256-read + proj] ; + + acorr05 = sh_axis[256 - read + proj] ; + + h0 = (acorr05 + bx00*pcos - by00*psin); + h1 = (acorr05 + (bx00+0)*pcos - (by00+1)*psin); + h2 = (acorr05 + (bx00+1)*pcos - (by00+0)*psin); + h3 = (acorr05 + (bx00+1)*pcos - (by00+1)*psin); + + + float x; + int ym, xm, xp; + ym = proj; + float2 vals; + + if(h0>=0 && h0<num_bins) { + x = CLIP_MAX(h0, num_bins); + FLOORCEIL_x(x); + vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp); + res0 += linear_interpolation(vals, x, xm, xp); + } + if(h1>=0 && h1<num_bins) { + x = CLIP_MAX(h1, num_bins); + FLOORCEIL_x(x); + vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp); + res1 += linear_interpolation(vals, x, xm, xp); + } + if(h2>=0 && h2<num_bins) { + 
x = CLIP_MAX(h2, num_bins); + FLOORCEIL_x(x); + vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp); + res2 += linear_interpolation(vals, x, xm, xp); + } + if(h3>=0 && h3<num_bins) { + x = CLIP_MAX(h3, num_bins); + FLOORCEIL_x(x); + vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp); + res3 += linear_interpolation(vals, x, xm, xp); + } + } + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 0] = res0; + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 0] = res1; + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 1] = res2; + d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 1] = res3; +} + diff --git a/src/silx/resources/opencl/backproj_helper.cl b/src/silx/resources/opencl/backproj_helper.cl new file mode 100644 index 0000000..b1590f8 --- /dev/null +++ b/src/silx/resources/opencl/backproj_helper.cl @@ -0,0 +1,68 @@ +/* + * Project: silx: backprojection helper functions + * + * Copyright (C) 2016-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: P. Paleo + * J. Kieffer (kieffer@esrf.fr) + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + + +kernel void mult( global float2* d_sino, + global float2* d_filter, + int num_bins, + int num_projs) +{ + int gid0 = get_global_id(0); + int gid1 = get_global_id(1); + if (gid0 < num_bins && gid1 < num_projs) + { + // d_sino[gid1*num_bins+gid0] *= d_filter[gid0]; + d_sino[gid1*num_bins+gid0].x *= d_filter[gid0].x; + d_sino[gid1*num_bins+gid0].y *= d_filter[gid0].x; + } +} + +// copy only the real part of the valid data to the real array +kernel void cpy2d_c2r( + global float* d_sino, + global float2* d_sino_complex, + int num_bins, + int num_projs, + int fft_size) +{ + int gid0 = get_global_id(0); + int gid1 = get_global_id(1); + if (gid0 < num_bins && gid1 < num_projs) { + d_sino[gid1*num_bins+gid0] = d_sino_complex[gid1*fft_size+gid0].x; + } +} + + + + + + + diff --git a/src/silx/resources/opencl/bitonic.cl b/src/silx/resources/opencl/bitonic.cl new file mode 100644 index 0000000..4096ce8 --- /dev/null +++ b/src/silx/resources/opencl/bitonic.cl @@ -0,0 +1,569 @@ +/*############################################################################ +# Sort elements within a vector by Matthew Scarpino, +# Taken from his book "OpenCL in Action", +# November 2011 ISBN 9781617290176 +# Original license for the code: "public domain" +# +# Originally this code is public domain. The MIT license has been added +# by J. Kieffer (jerome.kieffer@esrf.eu) to provide a disclaimer. +# J. Kieffer does not claim authorship of this code developed by . 
+# +# Copyright (c) 2011 Matthew Scarpino +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+# +#############################################################################*/ + +#define VECTOR_SORT_BOOK(input, dir) \ + comp = abs(input > shuffle(input, mask2)) ^ dir; \ + input = shuffle(input, comp * 2 + add2); \ + comp = abs(input > shuffle(input, mask1)) ^ dir; \ + input = shuffle(input, comp + add1); \ + + +#define VECTOR_SWAP_BOOK(in1, in2, dir) \ + input1 = in1; input2 = in2; \ + comp = (abs(input1 > input2) ^ dir) * 4 + add3; \ + in1 = shuffle2(input1, input2, comp); \ + in2 = shuffle2(input2, input1, comp); \ + + +// The _FILE extension correspond to the formula found in the "OpenCL in Action" supplementary files +#define VECTOR_SORT_FILE(input, dir)\ + comp = (input < shuffle(input, mask2)) ^ dir;\ + input = shuffle(input, as_uint4(comp * 2 + add2));\ + comp = (input < shuffle(input, mask1)) ^ dir;\ + input = shuffle(input, as_uint4(comp + add1));\ + + +#define VECTOR_SWAP_FILE(input1, input2, dir)\ + temp = input1;\ + comp = ((input1 < input2) ^ dir) * 4 + add3;\ + input1 = shuffle2(input1, input2, as_uint4(comp));\ + input2 = shuffle2(input2, temp, as_uint4(comp));\ + + + +// Functions to be called from an actual kernel. 
+ +static float8 my_sort_file(uint local_id, uint group_id, uint local_size, + float8 input, local float4 *l_data) +{ + float4 input1, input2, temp; + float8 output; + + int dir; + uint id, size, stride; + int4 comp; + + uint4 mask1 = (uint4)(1, 0, 3, 2); + uint4 mask2 = (uint4)(2, 3, 0, 1); + uint4 mask3 = (uint4)(3, 2, 1, 0); + + int4 add1 = (int4)(1, 1, 3, 3); + int4 add2 = (int4)(2, 3, 2, 3); + int4 add3 = (int4)(1, 2, 2, 3); + + // retrieve input data + input1 = (float4)(input.s0, input.s1, input.s2, input.s3); + input2 = (float4)(input.s4, input.s5, input.s6, input.s7); + + // Find global address + id = local_id * 2; + + /* Sort input 1 - ascending */ + comp = input1 < shuffle(input1, mask1); + input1 = shuffle(input1, as_uint4(comp + add1)); + comp = input1 < shuffle(input1, mask2); + input1 = shuffle(input1, as_uint4(comp * 2 + add2)); + comp = input1 < shuffle(input1, mask3); + input1 = shuffle(input1, as_uint4(comp + add3)); + + /* Sort input 2 - descending */ + comp = input2 > shuffle(input2, mask1); + input2 = shuffle(input2, as_uint4(comp + add1)); + comp = input2 > shuffle(input2, mask2); + input2 = shuffle(input2, as_uint4(comp * 2 + add2)); + comp = input2 > shuffle(input2, mask3); + input2 = shuffle(input2, as_uint4(comp + add3)); + + /* Swap corresponding elements of input 1 and 2 */ + add3 = (int4)(4, 5, 6, 7); + dir = - (int) (local_id % 2); + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + + /* Sort data and store in local memory */ + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + barrier(CLK_LOCAL_MEM_FENCE); + + /* Create bitonic set */ + for(size = 2; size < local_size; size <<= 1) { + dir = - (int) (local_id/size & 1) ; + + for(stride = size; stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = local_id + (local_id/stride)*stride; + 
VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) + } + + barrier(CLK_LOCAL_MEM_FENCE); + id = local_id * 2; + input1 = l_data[id]; + input2 = l_data[id+1]; + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + barrier(CLK_LOCAL_MEM_FENCE); + } + + /* Perform bitonic merge */ + dir = - (int) (group_id % 2); + for(stride = local_size; stride > 1; stride >>= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + id = local_id + (local_id/stride)*stride; + VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + /* Perform final sort */ + id = local_id * 2; + input1 = l_data[id]; + input2 = l_data[id+1]; + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + + // setup output and return it + output = (float8)(input1, input2); + return output; +} + +static float8 my_sort_book(uint local_id, uint group_id, uint local_size, + float8 input, local float4 *l_data) +{ + float4 input1, input2, temp; + float8 output; + uint4 comp, swap, mask1, mask2, add1, add2, add3; + uint id, dir, size, stride; + mask1 = (uint4)(1, 0, 3, 2); + swap = (uint4)(0, 0, 1, 1); + add1 = (uint4)(0, 0, 2, 2); + mask2 = (uint4)(2, 3, 0, 1); + add2 = (uint4)(0, 1, 0, 1); + add3 = (uint4)(0, 1, 2, 3); + + // retrieve input data + input1 = (float4)(input.s0, input.s1, input.s2, input.s3); + input2 = (float4)(input.s4, input.s5, input.s6, input.s7); + + // Find global address + id = local_id * 2; + + //Sort first vector + + comp = abs(input1 > shuffle(input1, mask1)); + input1 = shuffle(input1, comp ^ swap + add1); + comp = abs(input1 > shuffle(input1, mask2)); + input1 = 
shuffle(input1, comp * 2 + add2); + comp = abs(input1 > shuffle(input1, mask1)); + input1 = shuffle(input1, comp + add1); + + //Sort second vector + comp = abs(input2 < shuffle(input2, mask1)); + input2 = shuffle(input2, comp ^ swap + add1); + comp = abs(input2 < shuffle(input2, mask2)); + input2 = shuffle(input2, comp * 2 + add2); + comp = abs(input2 < shuffle(input2, mask1)); + input2 = shuffle(input2, comp + add1); + + // Swap elements + dir = local_id % 2; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + + // Perform upper stages + for(size = 2; size < local_size; size <<= 1) + { + dir = local_id/size & 1; + + //Perform lower stages + for(stride = size; stride > 1; stride >>= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + id = local_id + (local_id/stride)*stride; + VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + id = local_id * 2; + input1 = l_data[id]; + input2 = l_data[id+1]; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + } + dir = group_id % 2; + + // Perform bitonic merge + for(stride = local_size; stride > 1; stride >>= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + id = local_id + (local_id/stride)*stride; + VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + id = local_id * 2; + input1 = l_data[id]; input2 = l_data[id+1]; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + + // setup 
output and return it + output = (float8)(input1, input2); + return output; +} + + + +////////////// +// Kernels +////////////// + +// Perform the sort on the whole array +// dim0: wg=number_of_element/8 + +kernel void bsort_all(global float4 *g_data, + local float4 *l_data) +{ + float4 input1, input2; + float8 input, output; + uint id, global_start; + // Find global address + id = get_local_id(0) * 2; + global_start = get_group_id(0) * get_local_size(0) * 2 + id; + + input1 = g_data[global_start]; + input2 = g_data[global_start+1]; + input = (float8)(input1, input2); + output = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0), + input, l_data); + input1 = (float4) (output.s0, output.s1, output.s2, output.s3); + input2 = (float4) (output.s4, output.s5, output.s6, output.s7); + g_data[global_start] = input1; + g_data[global_start+1] = input2; +} + + +// Perform the sort along the horizontal axis of a 2D image +// dim0 = y: wg=1 +// dim1 = x: wg=number_of_element/8 +kernel void bsort_horizontal(global float *g_data, + local float4 *l_data) +{ + float8 input, output; + uint id, global_start, offset; + + // Find global address + offset = get_global_size(1)*get_global_id(0)*8; + id = get_local_id(1) * 8; + global_start = offset + get_group_id(1) * get_local_size(1) * 8 + id; + + input = (float8)(g_data[global_start ], + g_data[global_start + 1], + g_data[global_start + 2], + g_data[global_start + 3], + g_data[global_start + 4], + g_data[global_start + 5], + g_data[global_start + 6], + g_data[global_start + 7]); + + output = my_sort_file(get_local_id(1), get_group_id(1), get_local_size(1), + input, l_data); + + g_data[global_start ] = output.s0; + g_data[global_start + 1] = output.s1; + g_data[global_start + 2] = output.s2; + g_data[global_start + 3] = output.s3; + g_data[global_start + 4] = output.s4; + g_data[global_start + 5] = output.s5; + g_data[global_start + 6] = output.s6; + g_data[global_start + 7] = output.s7; +} + + +// Perform the sort along 
the vertical axis +// dim0 = y: wg=number_of_element/8 +// dim1 = x: wg=1 +// check if transposing +bsort_horizontal is not more efficient ? + +kernel void bsort_vertical(global float *g_data, + local float4 *l_data) +{ + // we need to read 8 float position along the vertical axis + float8 input, output; + uint id, global_start, padding; + + // Find global address + padding = get_global_size(1); + id = get_local_id(0) * 8 * padding + get_global_id(1); + global_start = get_group_id(0) * get_local_size(0) * 8 * padding + id; + + input = (float8)(g_data[global_start ], + g_data[global_start + padding ], + g_data[global_start + 2*padding], + g_data[global_start + 3*padding], + g_data[global_start + 4*padding], + g_data[global_start + 5*padding], + g_data[global_start + 6*padding], + g_data[global_start + 7*padding]); + + output = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0), + input, l_data); + g_data[global_start ] = output.s0; + g_data[global_start + padding ] = output.s1; + g_data[global_start + 2*padding ] = output.s2; + g_data[global_start + 3*padding ] = output.s3; + g_data[global_start + 4*padding ] = output.s4; + g_data[global_start + 5*padding ] = output.s5; + g_data[global_start + 6*padding ] = output.s6; + g_data[global_start + 7*padding ] = output.s7; +} + + +//Tested working reference kernel frm the book. 
This only works under Linux +kernel void bsort_book(global float4 *g_data, + local float4 *l_data) { + float4 input1, input2, temp; + uint4 comp, swap, mask1, mask2, add1, add2, add3; + uint id, dir, global_start, size, stride; + mask1 = (uint4)(1, 0, 3, 2); + swap = (uint4)(0, 0, 1, 1); + add1 = (uint4)(0, 0, 2, 2); + mask2 = (uint4)(2, 3, 0, 1); + add2 = (uint4)(0, 1, 0, 1); + add3 = (uint4)(0, 1, 2, 3); + + // Find global address + id = get_local_id(0) * 2; + global_start = get_group_id(0) * get_local_size(0) * 2 + id; + + //Sort first vector + input1 = g_data[global_start]; + input2 = g_data[global_start+1]; + comp = abs(input1 > shuffle(input1, mask1)); + input1 = shuffle(input1, comp ^ swap + add1); + comp = abs(input1 > shuffle(input1, mask2)); + input1 = shuffle(input1, comp * 2 + add2); + comp = abs(input1 > shuffle(input1, mask1)); + input1 = shuffle(input1, comp + add1); + + //Sort second vector + comp = abs(input2 < shuffle(input2, mask1)); + input2 = shuffle(input2, comp ^ swap + add1); + comp = abs(input2 < shuffle(input2, mask2)); + input2 = shuffle(input2, comp * 2 + add2); + comp = abs(input2 < shuffle(input2, mask1)); + input2 = shuffle(input2, comp + add1); + + // Swap elements + dir = get_local_id(0) % 2; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + + // Perform upper stages + for(size = 2; size < get_local_size(0); size <<= 1) { + dir = get_local_id(0)/size & 1; + + //Perform lower stages + for(stride = size; stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = get_local_id(0) + + (get_local_id(0)/stride)*stride; + VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + id = get_local_id(0) * 2; + input1 = l_data[id]; + input2 = l_data[id+1]; + temp = input1; + comp = 
(abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + } + dir = get_group_id(0) % 2; + // Perform bitonic merge + for(stride = get_local_size(0); stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = get_local_id(0) + + (get_local_id(0)/stride)*stride; + VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + id = get_local_id(0) * 2; + input1 = l_data[id]; input2 = l_data[id+1]; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + g_data[global_start] = input1; + g_data[global_start+1] = input2; + } + +//Tested working reference kernel from the addition files. This only works under any operating system +/* Perform initial sort */ +kernel void bsort_file(global float4 *g_data, local float4 *l_data) { + + int dir; + uint id, global_start, size, stride; + float4 input1, input2, temp; + int4 comp; + + uint4 mask1 = (uint4)(1, 0, 3, 2); + uint4 mask2 = (uint4)(2, 3, 0, 1); + uint4 mask3 = (uint4)(3, 2, 1, 0); + + int4 add1 = (int4)(1, 1, 3, 3); + int4 add2 = (int4)(2, 3, 2, 3); + int4 add3 = (int4)(1, 2, 2, 3); + + id = get_local_id(0) * 2; + global_start = get_group_id(0) * get_local_size(0) * 2 + id; + + input1 = g_data[global_start]; + input2 = g_data[global_start+1]; + + /* Sort input 1 - ascending */ + comp = input1 < shuffle(input1, mask1); + input1 = shuffle(input1, as_uint4(comp + add1)); + comp = input1 < shuffle(input1, mask2); + input1 = shuffle(input1, as_uint4(comp * 2 + add2)); + comp = input1 < shuffle(input1, mask3); + input1 = shuffle(input1, as_uint4(comp + add3)); + + /* Sort input 2 - descending */ + comp = input2 > shuffle(input2, mask1); + input2 = 
shuffle(input2, as_uint4(comp + add1)); + comp = input2 > shuffle(input2, mask2); + input2 = shuffle(input2, as_uint4(comp * 2 + add2)); + comp = input2 > shuffle(input2, mask3); + input2 = shuffle(input2, as_uint4(comp + add3)); + + /* Swap corresponding elements of input 1 and 2 */ + add3 = (int4)(4, 5, 6, 7); + dir = - (int)(get_local_id(0) % 2); + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + + /* Sort data and store in local memory */ + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + + /* Create bitonic set */ + for(size = 2; size < get_local_size(0); size <<= 1) { + dir = - (int)(get_local_id(0)/size & 1); + + for(stride = size; stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = get_local_id(0) + (get_local_id(0)/stride)*stride; + VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) + } + + barrier(CLK_LOCAL_MEM_FENCE); + id = get_local_id(0) * 2; + input1 = l_data[id]; input2 = l_data[id+1]; + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + } + + /* Perform bitonic merge */ + dir = - (int)(get_group_id(0) % 2); + for(stride = get_local_size(0); stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = get_local_id(0) + (get_local_id(0)/stride)*stride; + VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + /* Perform final sort */ + id = get_local_id(0) * 2; + input1 = l_data[id]; input2 = l_data[id+1]; + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + 
VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + g_data[global_start] = input1; + g_data[global_start+1] = input2; +} + diff --git a/src/silx/resources/opencl/codec/bitshuffle_lz4.cl b/src/silx/resources/opencl/codec/bitshuffle_lz4.cl new file mode 100644 index 0000000..71f617a --- /dev/null +++ b/src/silx/resources/opencl/codec/bitshuffle_lz4.cl @@ -0,0 +1,625 @@ +/* + * Project: SILX: Bitshuffle LZ4 decompressor + * + * Copyright (C) 2022 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. Kieffer (kieffer@esrf.fr) + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* To decompress bitshuffle-LZ4 data in parallel on GPU one needs to: + * - Find all begining of blocks, this is performed by the ... kernel. + * - Decompress each block by one workgroup. 
+ * - bitshuffle the data from one workgroup + */ + +#ifndef LZ4_BLOCK_SIZE +# define LZ4_BLOCK_SIZE 8192 +#endif +#define LZ4_BLOCK_EXTRA 400 +#ifdef __ENDIAN_LITTLE__ +#define SWAP_BE 1 +#define SWAP_LE 0 +#else +#define SWAP_BE 0 +#define SWAP_LE 1 +#endif + + +#define int8_t char +#define uint8_t uchar +#define int16_t short +#define uint16_t ushort +#define int32_t int +#define uint32_t uint +#define int64_t long +#define uint64_t ulong + +#define position_t uint +#define token_t uchar2 + +//Some function used as part of bitshuffle: + +inline token_t decode_token(uint8_t value){ + return (token_t)(value >> 4, // literals + value & 0x0f); // matches +} + +inline bool has_liter_over(token_t token) +{ + return token.s0 >= 15; +} + +inline bool has_match_over(token_t token) +{ + return token.s1 >= 15; +} + +//parse overflow, return the number of overflow and the new position +inline uint2 read_overflow(local uint8_t* buffer, + position_t buffer_size, + position_t idx){ + position_t num = 0; + uint8_t next = 0xff; + while (next == 0xff && idx < buffer_size){ + next = buffer[idx]; + idx += 1; + num += next; + } + return (uint2)(num, idx); +} + +inline void copy_no_overlap(local uint8_t* dest, + const position_t dest_position, + local uint8_t* source, + const position_t src_position, + const position_t length){ + for (position_t i=get_local_id(0); i<length; i+=get_local_size(0)) { + dest[dest_position+i] = source[src_position+i]; + } +} + +inline void copy_repeat(local uint8_t* dest, + const position_t dest_position, + local uint8_t* source, + const position_t src_position, + const position_t dist, + const position_t length){ + + // if there is overlap, it means we repeat, so we just + // need to organize our copy around that + for (position_t i=get_local_id(0); i<length; i+=get_local_size(0)) { + dest[dest_position+i] = source[src_position + i%dist]; + } +} + +inline void copy_collab(local uint8_t* dest, + const position_t dest_position, + local uint8_t* source, + 
const position_t src_position, + const position_t dist, + const position_t length){ + //Generic copy function + if (dist < length) { + copy_repeat(dest, dest_position, source, src_position, dist, length); + } + else { + copy_no_overlap(dest, dest_position, source, src_position, length); + } +} + +// Function to read larger integers at various position. Endianness is addressed as well with the swap flag +uint64_t load64_at(global uint8_t *src, + const uint64_t position, + const bool swap){ + uchar8 vector; + if (swap){ + vector = (uchar8)(src[position+7],src[position+6], + src[position+5],src[position+4], + src[position+3],src[position+2], + src[position+1],src[position+0]); + } + else{ + vector = (uchar8)(src[position+0],src[position+1], + src[position+2],src[position+3], + src[position+4],src[position+5], + src[position+6],src[position+7]); + } + return as_ulong(vector); +} + +uint32_t load32_at(global uint8_t *src, + const uint64_t position, + const bool swap){ + uchar4 vector; + if (swap){ + vector = (uchar4)( + src[position+3],src[position+2], + src[position+1],src[position+0]); + } + else{ + vector = (uchar4)(src[position+0],src[position+1], + src[position+2],src[position+3]); + } + return as_uint(vector); +} + +uint16_t load16_at(local uint8_t *src, + const uint64_t position, + const bool swap){ + uchar2 vector; + if (swap){ + vector = (uchar2)(src[position+1],src[position+0]); + } + else{ + vector = (uchar2)(src[position+0],src[position+1]); + } + return as_ushort(vector); +} + +//Calculate the begining and the end of the block corresponding to the block=gid +inline void _lz4_unblock(global uint8_t *src, + const uint64_t size, + local uint64_t *block_position){ + uint32_t gid = get_group_id(0); + uint32_t lid = get_local_id(0); + if (lid == 0){ + uint64_t block_start=16; + uint32_t block_size = load32_at(src, 12, SWAP_BE); + uint64_t block_end = block_start + block_size; + + for (uint32_t block_idx=0; block_idx<gid; block_idx++){ + // printf("gid %u idx %u 
%lu-%lu\n",gid, block_idx,block_start,block_end); + block_start = block_end + 4; + if (block_start>=size){ + printf("Read beyond end of source buffer at gid %u %lu>%lu\n",gid, block_start, size); + block_start = 0; + block_end = 0; + break; + } + block_size = load32_at(src, block_end, SWAP_BE); + block_end = block_start + block_size; + } + block_position[0] = block_start; + block_position[1] = block_end; +// if (gid>get_num_groups(0)-10) printf("Success finish unblock gid %u block: %lu - %lu\n",gid,block_start,block_end); + } + barrier(CLK_LOCAL_MEM_FENCE); +} + + +//Decompress one block in shared memory +inline uint32_t lz4_decompress_local_block( local uint8_t* local_cmp, + local uint8_t* local_dec, + const uint32_t cmp_buffer_size, + const uint32_t dec_buffer_size){ + + uint32_t gid = get_group_id(0); // One block is decompressed by one workgroup + uint32_t lid = get_local_id(0); // This is the thread position in the group... + uint32_t wg = get_local_size(0); // workgroup size + + position_t dec_idx = 0; + position_t cmp_idx = 0; + while (cmp_idx < cmp_buffer_size) { + // read header byte + token_t tok = decode_token(local_cmp[cmp_idx]); + // if (lid==0) printf("gid %u at idx %u/%u. Token is litterials: %u; matches: %u\n", gid, cmp_idx, cmp_buffer_size,tok.s0, tok.s1); + + cmp_idx+=1; + + // read the length of the literals + position_t num_literals = tok.s0; + if (has_liter_over(tok)) { + uint2 tmp = read_overflow(local_cmp, + cmp_buffer_size, + cmp_idx); + num_literals += tmp.s0; + cmp_idx = tmp.s1; + } + const position_t start_literal = cmp_idx; + + // copy the literals to the dst stream in parallel + // if (lid==0) printf("gid %u: copy literals from %u to %u <%u (len %u)\n", gid, cmp_idx,num_literals+cmp_idx,cmp_buffer_size,num_literals); + copy_no_overlap(local_dec, dec_idx, local_cmp, cmp_idx, num_literals); + cmp_idx += num_literals; + dec_idx += num_literals; + + // Note that the last sequence stops right after literals field. 
+ // There are specific parsing rules to respect to be compatible with the + // reference decoder : 1) The last 5 bytes are always literals 2) The last + // match cannot start within the last 12 bytes Consequently, a file with + // less then 13 bytes can only be represented as literals These rules are in + // place to benefit speed and ensure buffer limits are never crossed. + if (cmp_idx < cmp_buffer_size) { + + // read the offset + uint16_t offset = load16_at(local_cmp, cmp_idx, SWAP_LE); + // if (lid==0) printf("gid %u: offset is %u at %u\n",gid, offset, cmp_idx); + if (offset == 0) { + //corruped block + if (lid == 0) + printf("Corrupted block #%u\n", gid); + return 0; + } + + cmp_idx += 2; + + // read the match length + position_t match = 4 + tok.s1; + if (has_match_over(tok)) { + uint2 tmp = read_overflow(local_cmp, + cmp_buffer_size, + cmp_idx); + match += tmp.s0; + cmp_idx = tmp.s1; + } + + //syncronize threads before reading shared memory + barrier(CLK_LOCAL_MEM_FENCE); + + // copy match + copy_collab(local_dec, dec_idx, local_dec, dec_idx - offset, offset, match); + dec_idx += match; + } + } + //syncronize threads before reading shared memory + barrier(CLK_LOCAL_MEM_FENCE); + return dec_idx; +} + +//Perform the bifshuffling on 8-bits objects +inline void bitunshuffle8( local uint8_t* inp, + local uint8_t* out, + const uint32_t buffer_size){ //8k ... or less. +// uint32_t gid = get_group_id(0); + uint32_t lid = get_local_id(0); + uint32_t wg = get_local_size(0); + uint32_t u8_buffer_size = buffer_size; // /1 -> 8k + + // One thread deals with one or several output data + for (uint32_t dpos=lid; dpos<u8_buffer_size; dpos+=wg){ + uint8_t res = 0; + // read bits at several places... 
+ for (uint32_t bit=0; bit<8; bit++){ + uint32_t read_bit = bit*u8_buffer_size + dpos; + uint32_t u8_word_pos = read_bit>>3; // /8 + uint32_t u8_bit_pos = read_bit&7; // %8 + // if (lid==0) printf("dpos %u bit %u read at %u,%u\n",dpos,bit,u8_word_pos,u8_bit_pos); + res |= ((inp[u8_word_pos]>>u8_bit_pos) & 1)<<bit; + } + // if (lid==0) printf("dpos %u res %u\n",dpos,res); + out[dpos] = res; + } +} + + +//Perform the bifshuffling on 16-bits objects +inline void bitunshuffle16( local uint8_t* inp, + local uint8_t* out, + const uint32_t buffer_size){ //8k ... or less. +// uint32_t gid = get_group_id(0); + uint32_t lid = get_local_id(0); + uint32_t wg = get_local_size(0); + uint32_t u16_buffer_size = buffer_size>>1; // /2 -> 4k + + // One thread deals with one or several output data + for (uint32_t dpos=lid; dpos<u16_buffer_size; dpos+=wg){ + uint16_t res = 0; + // read bits at several places... + for (uint32_t bit=0; bit<16; bit++){ + uint32_t read_bit = bit*u16_buffer_size + dpos; + uint32_t u8_word_pos = read_bit>>3; // /8 + uint32_t u8_bit_pos = read_bit&7; // %8 + // if (lid==0) printf("dpos %u bit %u read at %u,%u\n",dpos,bit,u8_word_pos,u8_bit_pos); + res |= ((inp[u8_word_pos]>>u8_bit_pos) & 1)<<bit; + } + // if (lid==0) printf("dpos %u res %u\n",dpos,res); + uchar2 tmp = as_uchar2(res); + out[2*dpos] = tmp.s0; + out[2*dpos+1] = tmp.s1; + } +} + + +//Perform the bifshuffling on 32-bits objects +inline void bitunshuffle32( local uint8_t* inp, + local uint8_t* out, + const uint32_t buffer_size){ //8k ... or less. +// uint32_t gid = get_group_id(0); + uint32_t lid = get_local_id(0); + uint32_t wg = get_local_size(0); + uint32_t u32_buffer_size = buffer_size>>2; // /4 -> 2k + + // One thread deals with one or several output data + for (uint32_t dpos=lid; dpos<u32_buffer_size; dpos+=wg){ + uint32_t res = 0; + // read bits at several places... 
+ for (uint32_t bit=0; bit<32; bit++){ + uint32_t read_bit = bit*u32_buffer_size + dpos; + uint32_t u8_word_pos = read_bit>>3; // /8 + uint32_t u8_bit_pos = read_bit&7; // %8 + // if (lid==0) printf("dpos %u bit %u read at %u,%u\n",dpos,bit,u8_word_pos,u8_bit_pos); + res |= ((inp[u8_word_pos]>>u8_bit_pos) & 1)<<bit; + } + // if (lid==0) printf("dpos %u res %u\n",dpos,res); + uchar4 tmp = as_uchar4(res); + out[4*dpos] = tmp.s0; + out[4*dpos+1] = tmp.s1; + out[4*dpos+2] = tmp.s2; + out[4*dpos+3] = tmp.s3; + } +} + +//Perform the bifshuffling on 32-bits objects +inline void bitunshuffle64( local uint8_t* inp, + local uint8_t* out, + const uint32_t buffer_size){ //8k ... or less. +// uint32_t gid = get_group_id(0); + uint32_t lid = get_local_id(0); + uint32_t wg = get_local_size(0); + uint32_t u64_buffer_size = buffer_size>>3; // /8 -> 1k + + // One thread deals with one or several output data + for (uint32_t dpos=lid; dpos<u64_buffer_size; dpos+=wg){ + uint64_t res = 0; + // read bits at several places... 
+ for (uint32_t bit=0; bit<64; bit++){ + uint32_t read_bit = bit*u64_buffer_size + dpos; + uint32_t u8_word_pos = read_bit>>3; // /8 + uint32_t u8_bit_pos = read_bit&7; // %8 + // if (lid==0) printf("dpos %u bit %u read at %u,%u\n",dpos,bit,u8_word_pos,u8_bit_pos); + res |= ((inp[u8_word_pos]>>u8_bit_pos) & 1)<<bit; + } + // if (lid==0) printf("dpos %u res %u\n",dpos,res); + uchar8 tmp = as_uchar8(res); + out[8*dpos] = tmp.s0; + out[8*dpos+1] = tmp.s1; + out[8*dpos+2] = tmp.s2; + out[8*dpos+3] = tmp.s3; + out[8*dpos+4] = tmp.s4; + out[8*dpos+5] = tmp.s5; + out[8*dpos+6] = tmp.s6; + out[8*dpos+7] = tmp.s7; + } +} + + +/* Preprocessing kernel which performs: +- Memset arrays +- read block position stored in block_position array + +Param: +- src: input buffer in global memory +- size: input buffer size +- block_position: output buffer in local memory containing the index of the begining of each block +- max_blocks: allocated memory for block_position array (output) +- nb_blocks: output buffer with the actual number of blocks in src (output). + +Return: Nothing, this is a kernel + +Hint on workgroup size: little kernel ... wg=1, 1 wg is enough. +*/ + +kernel void lz4_unblock(global uint8_t *src, + const uint64_t size, + global uint64_t *block_start, + const uint32_t max_blocks, + global uint32_t *nb_blocks){ + + uint64_t total_nbytes = load64_at(src,0,SWAP_BE); + uint32_t block_nbytes = load32_at(src,8,SWAP_BE); + + uint32_t block_idx = 0; + uint64_t pos = 12; + uint32_t block_size; + + while ((pos+4<size) && (block_idx<max_blocks)){ + block_size = load32_at(src, pos, SWAP_BE); + block_start[block_idx] = pos + 4; + block_idx +=1; + pos += 4 + block_size; + } + nb_blocks[0] = block_idx; +} + +// decompress a frame blockwise. +// Needs the block position to be known in advance (block_start) calculated from lz4_unblock. +// one workgroup treats on block. 
+ +kernel void bslz4_decompress_block( global uint8_t* comp_src, + global uint8_t* dec_dest, + global uint64_t* block_start, + global uint32_t *nb_blocks, + const uint8_t item_size){ + + uint32_t gid = get_group_id(0); // One block is decompressed by one workgroup + uint32_t lid = get_local_id(0); // This is the thread position in the group... + uint32_t wg = get_local_size(0); // workgroup size + + //guard if the number of wg scheduled is too large + if (gid >=nb_blocks[0]) return; + + // No need to guard, the number of blocks can be calculated in advance. + uint64_t start_read = block_start[gid]; + if (start_read<12) return; + + local uint8_t local_cmp[LZ4_BLOCK_SIZE+LZ4_BLOCK_EXTRA]; + local uint8_t local_dec[LZ4_BLOCK_SIZE]; + + uint32_t cmp_buffer_size = load32_at(comp_src, start_read-4, SWAP_BE); + uint64_t end_read = start_read + cmp_buffer_size; + // Copy locally the compressed buffer and memset the destination buffer + for (uint32_t i=lid; i<cmp_buffer_size; i+=wg){ + uint64_t read_pos = start_read + i; + if (read_pos<end_read) + local_cmp[i] = comp_src[read_pos]; + else + local_cmp[i] = 0; + } + for (uint32_t i=lid+cmp_buffer_size; i<LZ4_BLOCK_SIZE+LZ4_BLOCK_EXTRA; i+=wg){ + local_cmp[i] = 0; + } + for (uint32_t i=lid; i<LZ4_BLOCK_SIZE; i+=wg){ + local_dec[i] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + //All the work is performed here: + uint32_t dec_size = lz4_decompress_local_block( local_cmp, local_dec, cmp_buffer_size, LZ4_BLOCK_SIZE); + + barrier(CLK_LOCAL_MEM_FENCE); + local uint8_t* local_buffer; + + //Perform bit-unshuffle + if (item_size == 1){ +// if ((gid==0) && (lid==0)) printf("bitunshuffle8"); + bitunshuffle8(local_dec, local_cmp, dec_size); + local_buffer=local_cmp; + } + else if (item_size == 2){ +// if ((gid==0) && (lid==0)) printf("bitunshuffle16"); + bitunshuffle16(local_dec, local_cmp, dec_size); + local_buffer=local_cmp; + } + else if (item_size == 4){ +// if ((gid==0) && (lid==0)) printf("bitunshuffle32"); + 
bitunshuffle32(local_dec, local_cmp, dec_size); + local_buffer=local_cmp; + } + else if (item_size == 8){ +// if ((gid==0) && (lid==0)) printf("bitunshuffle64"); + bitunshuffle64(local_dec, local_cmp, dec_size); + local_buffer=local_cmp; + } + else { + local_buffer = local_dec; + } + + + //Finally copy the destination data from local to global memory: + uint64_t start_write = LZ4_BLOCK_SIZE*gid; + barrier(CLK_LOCAL_MEM_FENCE); + for (uint32_t i=lid; i<dec_size; i+=wg){ + dec_dest[start_write + i] = local_buffer[i]; + } + + if (gid+1==get_num_groups(0)){ + uint64_t total_nbytes = load64_at(comp_src,0,SWAP_BE); + uint64_t end_write = dec_size + start_write; + int32_t remaining = total_nbytes - end_write; +// if (lid==0) printf("gid %u is last block has %u elements. Writing ends at %u/%lu, copy remaining %d\n",gid, dec_size, end_write, total_nbytes, remaining); + if ((remaining>0) && (remaining<item_size*8)){ + for (uint32_t i=lid; i<remaining; i++){ + dec_dest[end_write + i] = comp_src[end_read+i]; + } + } + } + +} + +// decompress a frame blockwise. +// block-start are searched by one thread from each workgroup ... not very efficient +// one workgroup treats on block. + +kernel void bslz4_decompress_frame( + global uint8_t* comp_src, + const uint64_t src_size, + global uint8_t* dec_dest, + const uint8_t item_size){ + + uint32_t gid = get_group_id(0); // One block is decompressed by one workgroup + uint32_t lid = get_local_id(0); // This is the thread position in the group... 
+ uint32_t wg = get_local_size(0); // workgroup size + + local uint8_t local_cmp[LZ4_BLOCK_SIZE+LZ4_BLOCK_EXTRA]; + local uint8_t local_dec[LZ4_BLOCK_SIZE]; + local uint64_t block[2]; // will contain begining and end of the current block + + uint64_t start_read, end_read; + uint32_t cmp_buffer_size; + _lz4_unblock(comp_src, src_size, block); + start_read = block[0]; + end_read = block[1]; + cmp_buffer_size = end_read - start_read; + if (cmp_buffer_size == 0){ + if (lid == 0) printf("gid=%u: Empty buffer\n", gid); + return; + } + + // Copy locally the compressed buffer and memset the destination buffer + for (uint32_t i=lid; i<cmp_buffer_size; i+=wg){ + uint64_t read_pos = start_read + i; + if (read_pos<end_read) + local_cmp[i] = comp_src[read_pos]; + else + local_cmp[i] = 0; + } + for (uint32_t i=lid+cmp_buffer_size; i<LZ4_BLOCK_SIZE+LZ4_BLOCK_EXTRA; i+=wg){ + local_cmp[i] = 0; + } + for (uint32_t i=lid; i<LZ4_BLOCK_SIZE; i+=wg){ + local_dec[i] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + //All the work is performed here: + uint32_t dec_size; + dec_size = lz4_decompress_local_block( local_cmp, local_dec, cmp_buffer_size, LZ4_BLOCK_SIZE); + + barrier(CLK_LOCAL_MEM_FENCE); + local uint8_t* local_buffer; + + //Perform bit-unshuffle + if (item_size == 1){ +// if ((gid==0) && (lid==0)) printf("bitunshuffle8"); + bitunshuffle8(local_dec, local_cmp, dec_size); + local_buffer=local_cmp; + } + else if (item_size == 2){ +// if ((gid==0) && (lid==0)) printf("bitunshuffle16"); + bitunshuffle16(local_dec, local_cmp, dec_size); + local_buffer=local_cmp; + } + else if (item_size == 4){ +// if ((gid==0) && (lid==0)) printf("bitunshuffle32"); + bitunshuffle32(local_dec, local_cmp, dec_size); + local_buffer=local_cmp; + } + else if (item_size == 8){ +// if ((gid==0) && (lid==0)) printf("bitunshuffle64"); + bitunshuffle64(local_dec, local_cmp, dec_size); + local_buffer=local_cmp; + } + else { + local_buffer = local_dec; + } + + //Finally copy the destination data from local to global 
memory: + uint64_t start_write = LZ4_BLOCK_SIZE*gid; + barrier(CLK_LOCAL_MEM_FENCE); + for (uint32_t i=lid; i<dec_size; i+=wg){ + dec_dest[start_write + i] = local_buffer[i]; + } + +} diff --git a/src/silx/resources/opencl/codec/byte_offset.cl b/src/silx/resources/opencl/codec/byte_offset.cl new file mode 100644 index 0000000..56a24c4 --- /dev/null +++ b/src/silx/resources/opencl/codec/byte_offset.cl @@ -0,0 +1,235 @@ +/* + * Project: SILX: A data analysis tool-kit + * + * Copyright (C) 2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. Kieffer (kieffer@esrf.fr) + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* To decompress CBF byte-offset compressed in parallel on GPU one needs to: + * - Set all values in mask and exception counter to zero. + * - Mark regions with exceptions and set values without exception. 
+ * This generates the values (zeros for exceptions), the exception mask, + * counts the number of exception region and provides a start position for + * each exception. + * - Treat exceptions. For this, one thread in a workgoup treats a complete + * masked region in a serial fashion. All regions are treated in parallel. + * Values written at this stage are marked in the mask with -1. + * - Double scan: inclusive cum sum for values, exclusive cum sum to generate + * indices in output array. Values with mask = 1 are considered as 0. + * - Compact and copy output by removing duplicated values in exceptions. + */ + +kernel void mark_exceptions(global char* raw, + int size, + int full_size, + global int* mask, + global int* values, + global int* cnt, + global int* exc) +{ + int gid; + gid = get_global_id(0); + if (gid<size) + { + int value, position; + value = raw[gid]; + if (value == -128) + { + int maxi; + values[gid] = 0; + position = atomic_inc(cnt); + exc[position] = gid; + maxi = size - 1; + mask[gid] = 1; + mask[min(maxi, gid+1)] = 1; + mask[min(maxi, gid+2)] = 1; + + if (((int) raw[min(gid+1, maxi)] == 0) && + ((int) raw[min(gid+2, maxi)] == -128)) + { + mask[min(maxi, gid+3)] = 1; + mask[min(maxi, gid+4)] = 1; + mask[min(maxi, gid+5)] = 1; + mask[min(maxi, gid+6)] = 1; + } + } + else + { // treat simple data + + values[gid] = value; + } + } + else if (gid<full_size) + { + mask[gid]=1; + values[gid] = 0; + } +} + +//run with WG=1, as may as exceptions +kernel void treat_exceptions(global char* raw, //raw compressed stream + int size, //size of the raw compressed stream + global int* mask, //tells if the value is masked + global int* exc, //array storing the position of the start of exception zones + global int* values)// stores decompressed values. 
+{ + int gid = get_global_id(0); + int inp_pos = exc[gid]; + if ((inp_pos<=0) || ((int)mask[inp_pos - 1] == 0)) + { + int value, is_masked, next_value, inc; + is_masked = (mask[inp_pos] != 0); + while ((is_masked) && (inp_pos<size)) + { + value = (int) raw[inp_pos]; + if (value == -128) + { // this correspond to 16 bits exception + uchar low_byte = raw[inp_pos+1]; + char high_byte = raw[inp_pos+2] ; + next_value = high_byte<<8 | low_byte; + if (next_value == -32768) + { // this correspond to 32 bits exception + uchar low_byte1 = raw[inp_pos+3], + low_byte2 = raw[inp_pos+4], + low_byte3 = raw[inp_pos+5]; + char high_byte4 = raw[inp_pos+6] ; + value = high_byte4<<24 | low_byte3<<16 | low_byte2<<8 | low_byte1; + inc = 7; + } + else + { + value = next_value; + inc = 3; + } + } + else + { + inc = 1; + } + values[inp_pos] = value; + mask[inp_pos] = -1; // mark the processed data as valid in the mask + inp_pos += inc; + is_masked = (mask[inp_pos] != 0); + } + } +} + +// copy the values of the elements to definitive position +kernel void copy_result_int(global int* values, + global int* indexes, + int in_size, + int out_size, + global int* output + ) +{ + int gid = get_global_id(0); + if (gid < in_size) + { + int current = max(indexes[gid], 0), + next = (gid >= (in_size - 1)) ? in_size + 1 : indexes[gid + 1]; + //we keep always the last element + if ((current <= out_size) && (current < next)) + { + output[current] = values[gid]; + } + } +} + +// copy the values of the elements to definitive position +kernel void copy_result_float(global int* values, + global int* indexes, + int in_size, + int out_size, + global float* output + ) +{ + int gid = get_global_id(0); + if (gid<in_size) + { + int current = max(indexes[gid], 0), + next = (gid >= (in_size - 1)) ? 
in_size + 1 : indexes[gid + 1]; + if ((current < out_size) && (current < next)) + { + output[current] = (float) values[gid]; + } + } +} + + +// combined memset for all arrays used for Byte Offset decompression +kernel void byte_offset_memset(global char* raw, + global int* mask, + global int* index, + global int* result, + int full_size, + int actual_size + ) +{ + int gid = get_global_id(0); + if (gid < full_size) + { + raw[gid] = 0; + index[gid] = 0; + result[gid] = 0; + if (gid<actual_size) + { + mask[gid] = 0; + } + else + { + mask[gid] = 1; + } + + } +} + + +//Simple memset kernel for char arrays +kernel void fill_char_mem(global char* ary, + int size, + char pattern, + int start_at) +{ + int gid = get_global_id(0); + if ((gid >= start_at) && (gid < size)) + { + ary[gid] = pattern; + } +} + +//Simple memset kernel for int arrays +kernel void fill_int_mem(global int* ary, + int size, + int pattern, + int start_at) +{ + int gid = get_global_id(0); + if ((gid >= start_at) && (gid < size)) + { + ary[gid] = pattern; + } +} + diff --git a/src/silx/resources/opencl/convolution.cl b/src/silx/resources/opencl/convolution.cl new file mode 100644 index 0000000..629b8fc --- /dev/null +++ b/src/silx/resources/opencl/convolution.cl @@ -0,0 +1,312 @@ +#define MAX_CONST_SIZE 16384 + +/******************************************************************************/ +/**************************** Macros ******************************************/ +/******************************************************************************/ + +// Get the center index of the filter, +// and the "half-Left" and "half-Right" lengths. +// In the case of an even-sized filter, the center is shifted to the left. 
// NOTE(review): expands at the call site; int variables c, hL and hR must be
// declared by the caller. For an even-sized filter the center is biased left.
#define GET_CENTER_HL(hlen){\
    if (hlen & 1) {\
        c = hlen/2;\
        hL = c;\
        hR = c;\
    }\
    else {\
        c = hlen/2 - 1;\
        hL = c;\
        hR = c+1;\
    }\
}\

// Boundary handling modes
#define CONV_MODE_REFLECT 0 // cba|abcd|dcb
#define CONV_MODE_NEAREST 1 // aaa|abcd|ddd
#define CONV_MODE_WRAP 2 // bcd|abcd|abc
#define CONV_MODE_CONSTANT 3 // 000|abcd|000
#ifndef USED_CONV_MODE
    #define USED_CONV_MODE CONV_MODE_NEAREST
#endif

// Per-axis index computation for each boundary mode. Each macro declares
// idx_{x,y,z} and reads gid{x,y,z}, c, j{x,y,z} and N{x,y,z} from the
// expansion site.
#define CONV_PERIODIC_IDX_X int idx_x = gidx - c + jx; if (idx_x < 0) idx_x += Nx; if (idx_x >= Nx) idx_x -= Nx;
#define CONV_PERIODIC_IDX_Y int idx_y = gidy - c + jy; if (idx_y < 0) idx_y += Ny; if (idx_y >= Ny) idx_y -= Ny;
#define CONV_PERIODIC_IDX_Z int idx_z = gidz - c + jz; if (idx_z < 0) idx_z += Nz; if (idx_z >= Nz) idx_z -= Nz;

#define CONV_NEAREST_IDX_X int idx_x = clamp((int) (gidx - c + jx), 0, Nx-1);
#define CONV_NEAREST_IDX_Y int idx_y = clamp((int) (gidy - c + jy), 0, Ny-1);
#define CONV_NEAREST_IDX_Z int idx_z = clamp((int) (gidz - c + jz), 0, Nz-1);

#define CONV_REFLECT_IDX_X int idx_x = gidx - c + jx; if (idx_x < 0) idx_x = -idx_x-1; if (idx_x >= Nx) idx_x = Nx-(idx_x-(Nx-1));
#define CONV_REFLECT_IDX_Y int idx_y = gidy - c + jy; if (idx_y < 0) idx_y = -idx_y-1; if (idx_y >= Ny) idx_y = Ny-(idx_y-(Ny-1));
#define CONV_REFLECT_IDX_Z int idx_z = gidz - c + jz; if (idx_z < 0) idx_z = -idx_z-1; if (idx_z >= Nz) idx_z = Nz-(idx_z-(Nz-1));

// Select the boundary handling once, at compile time
#if USED_CONV_MODE == CONV_MODE_REFLECT
    #define CONV_IDX_X CONV_REFLECT_IDX_X
    #define CONV_IDX_Y CONV_REFLECT_IDX_Y
    #define CONV_IDX_Z CONV_REFLECT_IDX_Z
#elif USED_CONV_MODE == CONV_MODE_NEAREST
    #define CONV_IDX_X CONV_NEAREST_IDX_X
    #define CONV_IDX_Y CONV_NEAREST_IDX_Y
    #define CONV_IDX_Z CONV_NEAREST_IDX_Z
#elif USED_CONV_MODE == CONV_MODE_WRAP
    #define CONV_IDX_X CONV_PERIODIC_IDX_X
    #define CONV_IDX_Y CONV_PERIODIC_IDX_Y
    #define CONV_IDX_Z CONV_PERIODIC_IDX_Z
#elif USED_CONV_MODE == CONV_MODE_CONSTANT
    #error "constant not implemented yet"
#else
    #error "Unknown convolution mode"
#endif


// Image access patterns: arrays are flat C-order (z, y, x), x the fast axis
#define READ_IMAGE_1D_X input[(gidz*Ny + gidy)*Nx + idx_x]
#define READ_IMAGE_1D_Y input[(gidz*Ny + idx_y)*Nx + gidx]
#define READ_IMAGE_1D_Z input[(idx_z*Ny + gidy)*Nx + gidx]

#define READ_IMAGE_2D_XY input[(gidz*Ny + idx_y)*Nx + idx_x]
#define READ_IMAGE_2D_XZ input[(idx_z*Ny + gidy)*Nx + idx_x]
#define READ_IMAGE_2D_YZ input[(idx_z*Ny + idx_y)*Nx + gidx]

#define READ_IMAGE_3D_XYZ input[(idx_z*Ny + idx_y)*Nx + idx_x]


/******************************************************************************/
/**************************** 1D Convolution **********************************/
/******************************************************************************/


// Convolution with 1D kernel along axis "X" (fast dimension)
// Works for batched 1D on 2D and batched 2D on 3D, along axis "X".
// One work-item computes one output element.
__kernel void convol_1D_X(
    const __global float * input,
    __global float * output,
    __global float * filter,
    int L, // filter size
    int Nx, // input/output number of columns
    int Ny, // input/output number of rows
    int Nz // input/output depth
)
{
    uint gidx = get_global_id(0);
    uint gidy = get_global_id(1);
    uint gidz = get_global_id(2);
    // work-items launched beyond the image (grid rounding) do nothing
    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;

    int c, hL, hR;
    GET_CENTER_HL(L);
    float sum = 0.0f;

    for (int jx = 0; jx <= hR+hL; jx++) {
        CONV_IDX_X; // Get index "x"
        // filter traversed in reverse order: convolution, not correlation
        sum += READ_IMAGE_1D_X * filter[L-1 - jx];
    }
    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
}


// Convolution with 1D kernel along axis "Y"
// Works for batched 1D on 2D and batched 2D on 3D, along axis "Y".
// One work-item computes one output element.
__kernel void convol_1D_Y(
    const __global float * input,
    __global float * output,
    __global float * filter,
    int L, // filter size
    int Nx, // input/output number of columns
    int Ny, // input/output number of rows
    int Nz // input/output depth
)
{
    uint gidx = get_global_id(0);
    uint gidy = get_global_id(1);
    uint gidz = get_global_id(2);
    // work-items launched beyond the image (grid rounding) do nothing
    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;

    int c, hL, hR;
    GET_CENTER_HL(L);
    float sum = 0.0f;

    for (int jy = 0; jy <= hR+hL; jy++) {
        CONV_IDX_Y; // Get index "y" (boundary handling per USED_CONV_MODE)
        // filter traversed in reverse order: convolution, not correlation
        sum += READ_IMAGE_1D_Y * filter[L-1 - jy];
    }
    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
}


// Convolution with 1D kernel along axis "Z"
// Works for batched 1D on 2D and batched 2D on 3D, along axis "Z".
__kernel void convol_1D_Z(
    const __global float * input,
    __global float * output,
    __global float * filter,
    int L, // filter size
    int Nx, // input/output number of columns
    int Ny, // input/output number of rows
    int Nz // input/output depth
)
{
    uint gidx = get_global_id(0);
    uint gidy = get_global_id(1);
    uint gidz = get_global_id(2);
    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;

    int c, hL, hR;
    GET_CENTER_HL(L);
    float sum = 0.0f;

    for (int jz = 0; jz <= hR+hL; jz++) {
        CONV_IDX_Z; // Get index "z" (boundary handling per USED_CONV_MODE)
        sum += READ_IMAGE_1D_Z * filter[L-1 - jz];
    }
    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
}


/******************************************************************************/
/**************************** 2D Convolution **********************************/
/******************************************************************************/

// Convolution with 2D kernel
// Works for batched 2D on 3D.
+__kernel void convol_2D_XY(
+    const __global float * input,
+    __global float * output,
+    __global float * filter,
+    int Lx, // filter number of columns,
+    int Ly, // filter number of rows,
+    int Nx, // input/output number of columns
+    int Ny, // input/output number of rows
+    int Nz  // input/output depth
+)
+{
+    uint gidx = get_global_id(0);
+    uint gidy = get_global_id(1);
+    uint gidz = get_global_id(2);
+    // Guard: grid may be padded beyond the volume extent.
+    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+    int c, hL, hR;
+    // NOTE(review): centering (c, hL, hR) is derived from Lx only but drives BOTH
+    // the jx and jy loops below — this is only correct for square filters (Ly == Lx).
+    // Confirm that callers enforce square filters.
+    GET_CENTER_HL(Lx);
+    float sum = 0.0f;
+
+    for (int jy = 0; jy <= hR+hL; jy++) {
+        CONV_IDX_Y; // Get index "y"
+        for (int jx = 0; jx <= hR+hL; jx++) {
+            CONV_IDX_X; // Get index "x"
+            // Flipped filter indices in both dimensions: true 2D convolution.
+            sum += READ_IMAGE_2D_XY * filter[(Ly-1-jy)*Lx + (Lx-1 - jx)];
+        }
+    }
+    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+
+
+// Convolution with 2D kernel
+// Works for batched 2D on 3D.
+__kernel void convol_2D_XZ(
+    const __global float * input,
+    __global float * output,
+    __global float * filter,
+    int Lx, // filter number of columns,
+    int Lz, // filter number of rows,
+    int Nx, // input/output number of columns
+    int Ny, // input/output number of rows
+    int Nz  // input/output depth
+)
+{
+    uint gidx = get_global_id(0);
+    uint gidy = get_global_id(1);
+    uint gidz = get_global_id(2);
+    // Guard: grid may be padded beyond the volume extent.
+    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+    int c, hL, hR;
+    // NOTE(review): centering from Lx drives both loops; only correct when Lz == Lx
+    // (square filter) — confirm callers enforce this.
+    GET_CENTER_HL(Lx);
+    float sum = 0.0f;
+
+    for (int jz = 0; jz <= hR+hL; jz++) {
+        CONV_IDX_Z; // Get index "z"
+        for (int jx = 0; jx <= hR+hL; jx++) {
+            CONV_IDX_X; // Get index "x"
+            // Flipped filter indices: true 2D convolution in the (x, z) plane.
+            sum += READ_IMAGE_2D_XZ * filter[(Lz-1-jz)*Lx + (Lx-1 - jx)];
+        }
+    }
+    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+
+
+// Convolution with 2D kernel
+// Works for batched 2D on 3D.
+__kernel void convol_2D_YZ(
+    const __global float * input,
+    __global float * output,
+    __global float * filter,
+    int Ly, // filter number of columns (fast filter axis here is Y),
+    int Lz, // filter number of rows,
+    int Nx, // input/output number of columns
+    int Ny, // input/output number of rows
+    int Nz  // input/output depth
+)
+{
+    uint gidx = get_global_id(0);
+    uint gidy = get_global_id(1);
+    uint gidz = get_global_id(2);
+    // Guard: grid may be padded beyond the volume extent.
+    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+    int c, hL, hR;
+    // NOTE(review): centering from Ly drives both loops; only correct when Lz == Ly
+    // (square filter) — confirm callers enforce this.
+    GET_CENTER_HL(Ly);
+    float sum = 0.0f;
+
+    for (int jz = 0; jz <= hR+hL; jz++) {
+        CONV_IDX_Z; // Get index "z"
+        for (int jy = 0; jy <= hR+hL; jy++) {
+            CONV_IDX_Y; // Get index "y"
+            // Flipped filter indices: true 2D convolution in the (y, z) plane.
+            sum += READ_IMAGE_2D_YZ * filter[(Lz-1-jz)*Ly + (Ly-1 - jy)];
+        }
+    }
+    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+
+
+
+/******************************************************************************/
+/**************************** 3D Convolution **********************************/
+/******************************************************************************/
+
+// Convolution with 3D kernel
+__kernel void convol_3D_XYZ(
+    const __global float * input,
+    __global float * output,
+    __global float * filter,
+    int Lx, // filter number of columns,
+    int Ly, // filter number of rows,
+    int Lz, // filter number of planes (depth)
+    int Nx, // input/output number of columns
+    int Ny, // input/output number of rows
+    int Nz  // input/output depth
+)
+{
+    uint gidx = get_global_id(0);
+    uint gidy = get_global_id(1);
+    uint gidz = get_global_id(2);
+    // Guard: grid may be padded beyond the volume extent.
+    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+    int c, hL, hR;
+    // NOTE(review): centering from Lx drives all three loops; only correct for
+    // cubic filters (Lx == Ly == Lz) — confirm callers enforce this.
+    GET_CENTER_HL(Lx);
+    float sum = 0.0f;
+
+    for (int jz = 0; jz <= hR+hL; jz++) {
+        CONV_IDX_Z; // Get index "z"
+        for (int jy = 0; jy <= hR+hL; jy++) {
+            CONV_IDX_Y; // Get index "y"
+            for (int jx = 0; jx <= hR+hL; jx++) {
+                CONV_IDX_X; // Get index "x"
+                // Flipped filter indices in all dimensions: true 3D convolution.
+                sum += READ_IMAGE_3D_XYZ * filter[((Lz-1-jz)*Ly + (Ly-1-jy))*Lx + (Lx-1 - jx)];
+            }
+        }
+    }
+    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+
diff --git a/src/silx/resources/opencl/convolution_textures.cl b/src/silx/resources/opencl/convolution_textures.cl new file mode 100644 index 0000000..517a67c --- /dev/null +++ b/src/silx/resources/opencl/convolution_textures.cl @@ -0,0 +1,374 @@ +/******************************************************************************/ +/**************************** Macros ******************************************/ +/******************************************************************************/ + +// Error handling +#ifndef IMAGE_DIMS + #error "IMAGE_DIMS must be defined" +#endif +#ifndef FILTER_DIMS + #error "FILTER_DIMS must be defined" +#endif +#if FILTER_DIMS > IMAGE_DIMS + #error "Filter cannot have more dimensions than image" +#endif + +// Boundary handling modes +#define CONV_MODE_REFLECT 0 // CLK_ADDRESS_MIRRORED_REPEAT : cba|abcd|dcb +#define CONV_MODE_NEAREST 1 // CLK_ADDRESS_CLAMP_TO_EDGE : aaa|abcd|ddd +#define CONV_MODE_WRAP 2 // CLK_ADDRESS_REPEAT : bcd|abcd|abc +#define CONV_MODE_CONSTANT 3 // CLK_ADDRESS_CLAMP : 000|abcd|000 +#ifndef USED_CONV_MODE + #define USED_CONV_MODE CONV_MODE_NEAREST +#endif +#if USED_CONV_MODE == CONV_MODE_REFLECT + #define CLK_BOUNDARY CLK_ADDRESS_MIRRORED_REPEAT + #define CLK_COORDS CLK_NORMALIZED_COORDS_TRUE + #define USE_NORM_COORDS +#elif USED_CONV_MODE == CONV_MODE_NEAREST + #define CLK_BOUNDARY CLK_ADDRESS_CLAMP_TO_EDGE + #define CLK_COORDS CLK_NORMALIZED_COORDS_FALSE +#elif USED_CONV_MODE == CONV_MODE_WRAP + #define CLK_BOUNDARY CLK_ADDRESS_REPEAT + #define CLK_COORDS CLK_NORMALIZED_COORDS_TRUE + #define USE_NORM_COORDS +#elif USED_CONV_MODE == CONV_MODE_CONSTANT + #define CLK_BOUNDARY CLK_ADDRESS_CLAMP + #define CLK_COORDS CLK_NORMALIZED_COORDS_FALSE +#else + #error "Unknown convolution mode" +#endif + + + +// Convolution index for filter +#define FILTER_INDEX(j) (Lx - 1 - j) + +// Filter access patterns +#define READ_FILTER_1D(j) read_imagef(filter, (int2) (FILTER_INDEX(j), 0)).x; +#define READ_FILTER_2D(jx, jy) 
read_imagef(filter, (int2) (FILTER_INDEX(jx), FILTER_INDEX(jy))).x; +#define READ_FILTER_3D(jx, jy, jz) read_imagef(filter, (int4) (FILTER_INDEX(jx), FILTER_INDEX(jy), FILTER_INDEX(jz), 0)).x; + + +// Convolution index for image +#ifdef USE_NORM_COORDS + #define IMAGE_INDEX_X (gidx*1.0f +0.5f - c + jx)/Nx + #define IMAGE_INDEX_Y (gidy*1.0f +0.5f - c + jy)/Ny + #define IMAGE_INDEX_Z (gidz*1.0f +0.5f - c + jz)/Nz + #define RET_TYPE_1 float + #define RET_TYPE_2 float2 + #define RET_TYPE_4 float4 + #define C_ZERO 0.5f + #define GIDX (gidx*1.0f + 0.5f)/Nx + #define GIDY (gidy*1.0f + 0.5f)/Ny + #define GIDZ (gidz*1.0f + 0.5f)/Nz +#else + #define IMAGE_INDEX_X (gidx - c + jx) + #define IMAGE_INDEX_Y (gidy - c + jy) + #define IMAGE_INDEX_Z (gidz - c + jz) + #define RET_TYPE_1 int + #define RET_TYPE_2 int2 + #define RET_TYPE_4 int4 + #define C_ZERO 0 + #define GIDX gidx + #define GIDY gidy + #define GIDZ gidz +#endif + +static const sampler_t sampler = CLK_COORDS | CLK_BOUNDARY | CLK_FILTER_NEAREST; + +// Image access patterns +#define READ_IMAGE_1D read_imagef(input, sampler, (RET_TYPE_2) (IMAGE_INDEX_X, C_ZERO)).x + +#define READ_IMAGE_2D_X read_imagef(input, sampler, (RET_TYPE_2) (IMAGE_INDEX_X , GIDY)).x +#define READ_IMAGE_2D_Y read_imagef(input, sampler, (RET_TYPE_2) (GIDX, IMAGE_INDEX_Y)).x +#define READ_IMAGE_2D_XY read_imagef(input, sampler, (RET_TYPE_2) (IMAGE_INDEX_X, IMAGE_INDEX_Y)).x + +#define READ_IMAGE_3D_X read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, GIDY, GIDZ, C_ZERO)).x +#define READ_IMAGE_3D_Y read_imagef(input, sampler, (RET_TYPE_4) (GIDX, IMAGE_INDEX_Y, GIDZ, C_ZERO)).x +#define READ_IMAGE_3D_Z read_imagef(input, sampler, (RET_TYPE_4) (GIDX, GIDY, IMAGE_INDEX_Z, C_ZERO)).x +#define READ_IMAGE_3D_XY read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, IMAGE_INDEX_Y, GIDZ, C_ZERO)).x +#define READ_IMAGE_3D_XZ read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, GIDY, IMAGE_INDEX_Z, C_ZERO)).x +#define READ_IMAGE_3D_YZ 
read_imagef(input, sampler, (RET_TYPE_4) (GIDX, IMAGE_INDEX_Y, IMAGE_INDEX_Z, C_ZERO)).x +#define READ_IMAGE_3D_XYZ read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, IMAGE_INDEX_Y, IMAGE_INDEX_Z, C_ZERO)).x + + +// NOTE: pyopencl and OpenCL < 1.2 do not support image1d_t +#if FILTER_DIMS == 1 + #define FILTER_TYPE image2d_t + #define READ_FILTER_VAL(j) READ_FILTER_1D(j) +#elif FILTER_DIMS == 2 + #define FILTER_TYPE image2d_t + #define READ_FILTER_VAL(jx, jy) READ_FILTER_2D(jx, jy) +#elif FILTER_DIMS == 3 + #define FILTER_TYPE image3d_t + #define READ_FILTER_VAL(jx, jy, jz) READ_FILTER_3D(jx, jy, jz) +#endif + +#if IMAGE_DIMS == 1 + #define IMAGE_TYPE image2d_t + #define READ_IMAGE_X READ_IMAGE_1D +#elif IMAGE_DIMS == 2 + #define IMAGE_TYPE image2d_t + #define READ_IMAGE_X READ_IMAGE_2D_X + #define READ_IMAGE_Y READ_IMAGE_2D_Y + #define READ_IMAGE_XY READ_IMAGE_2D_XY +#elif IMAGE_DIMS == 3 + #define IMAGE_TYPE image3d_t + #define READ_IMAGE_X READ_IMAGE_3D_X + #define READ_IMAGE_Y READ_IMAGE_3D_Y + #define READ_IMAGE_Z READ_IMAGE_3D_Z + #define READ_IMAGE_XY READ_IMAGE_3D_XY + #define READ_IMAGE_XZ READ_IMAGE_3D_XZ + #define READ_IMAGE_YZ READ_IMAGE_3D_YZ + #define READ_IMAGE_XYZ READ_IMAGE_3D_XYZ +#endif + + +// Get the center index of the filter, +// and the "half-Left" and "half-Right" lengths. +// In the case of an even-sized filter, the center is shifted to the left. +#define GET_CENTER_HL(hlen){\ + if (hlen & 1) {\ + c = hlen/2;\ + hL = c;\ + hR = c;\ + }\ + else {\ + c = hlen/2 - 1;\ + hL = c;\ + hR = c+1;\ + }\ +}\ + + + +/******************************************************************************/ +/**************************** 1D Convolution **********************************/ +/******************************************************************************/ + +#if FILTER_DIMS == 1 +// Convolution with 1D kernel along axis "X" (fast dimension) +// Works for batched 1D on 2D and batched 2D on 3D, along axis "X". 
+__kernel void convol_1D_X_tex( + read_only IMAGE_TYPE input, + __global float * output, + read_only FILTER_TYPE filter, + int Lx, // filter size + int Nx, // input/output number of columns + int Ny, // input/output number of rows + int Nz // input/output depth +) +{ + uint gidx = get_global_id(0); + uint gidy = get_global_id(1); + uint gidz = get_global_id(2); + if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; + + int c, hL, hR; + GET_CENTER_HL(Lx); + float sum = 0.0f; + + for (int jx = 0; jx <= hR+hL; jx++) { + sum += READ_IMAGE_X * READ_FILTER_VAL(jx); + } + output[(gidz*Ny + gidy)*Nx + gidx] = sum; +} + + +#if IMAGE_DIMS >= 2 +// Convolution with 1D kernel along axis "Y" +// Works for batched 1D on 2D and batched 2D on 3D, along axis "Y". +__kernel void convol_1D_Y_tex( + read_only IMAGE_TYPE input, + __global float * output, + read_only FILTER_TYPE filter, + int Lx, // filter size + int Nx, // input/output number of columns + int Ny, // input/output number of rows + int Nz // input/output depth +) +{ + uint gidx = get_global_id(0); + uint gidy = get_global_id(1); + uint gidz = get_global_id(2); + if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; + + int c, hL, hR; + GET_CENTER_HL(Lx); + float sum = 0.0f; + + for (int jy = 0; jy <= hR+hL; jy++) { + sum += READ_IMAGE_Y * READ_FILTER_VAL(jy); + } + output[(gidz*Ny + gidy)*Nx + gidx] = sum; +} +#endif + +#if IMAGE_DIMS == 3 +// Convolution with 1D kernel along axis "Z" +// Works for batched 1D on 2D and batched 2D on 3D, along axis "Z". 
+__kernel void convol_1D_Z_tex(
+    read_only IMAGE_TYPE input,
+    __global float * output,
+    read_only FILTER_TYPE filter,
+    int Lx, // filter size
+    int Nx, // input/output number of columns
+    int Ny, // input/output number of rows
+    int Nz  // input/output depth
+)
+{
+    uint gidx = get_global_id(0);
+    uint gidy = get_global_id(1);
+    uint gidz = get_global_id(2);
+    // Guard: grid may be padded beyond the volume extent.
+    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+    int c, hL, hR;
+    // Sets c (filter center index) and hL/hR (taps left/right of center).
+    GET_CENTER_HL(Lx);
+    float sum = 0.0f;
+
+    // Boundary handling is delegated to the sampler (see CLK_BOUNDARY above).
+    for (int jz = 0; jz <= hR+hL; jz++) {
+        sum += READ_IMAGE_Z * READ_FILTER_VAL(jz);
+    }
+    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+#endif
+#endif
+
+/******************************************************************************/
+/**************************** 2D Convolution **********************************/
+/******************************************************************************/
+
+#if IMAGE_DIMS >= 2 && FILTER_DIMS == 2
+// Convolution with 2D kernel
+// Works for batched 2D on 3D.
+__kernel void convol_2D_XY_tex(
+    read_only IMAGE_TYPE input,
+    __global float * output,
+    read_only FILTER_TYPE filter,
+    int Lx, // filter number of columns,
+    int Ly, // filter number of rows,
+    int Nx, // input/output number of columns
+    int Ny, // input/output number of rows
+    int Nz  // input/output depth
+)
+{
+    uint gidx = get_global_id(0);
+    uint gidy = get_global_id(1);
+    uint gidz = get_global_id(2);
+    // Guard: grid may be padded beyond the volume extent.
+    if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+    int c, hL, hR;
+    // NOTE(review): centering (c, hL, hR) is derived from Lx only but drives BOTH
+    // loops below — this is only correct for square filters (Ly == Lx). Confirm
+    // that callers enforce square filters.
+    GET_CENTER_HL(Lx);
+    float sum = 0.0f;
+
+    // Boundary handling is delegated to the sampler (see CLK_BOUNDARY above).
+    for (int jy = 0; jy <= hR+hL; jy++) {
+        for (int jx = 0; jx <= hR+hL; jx++) {
+            sum += READ_IMAGE_XY * READ_FILTER_VAL(jx, jy);
+        }
+    }
+    output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+#endif
+
+#if IMAGE_DIMS == 3 && FILTER_DIMS == 2
+// Convolution with 2D kernel
+// Works for batched 2D on 3D.
+__kernel void convol_2D_XZ_tex( + read_only IMAGE_TYPE input, + __global float * output, + read_only FILTER_TYPE filter, + int Lx, // filter number of columns, + int Lz, // filter number of rows, + int Nx, // input/output number of columns + int Ny, // input/output number of rows + int Nz // input/output depth +) +{ + uint gidx = get_global_id(0); + uint gidy = get_global_id(1); + uint gidz = get_global_id(2); + if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; + + int c, hL, hR; + GET_CENTER_HL(Lx); + float sum = 0.0f; + + for (int jz = 0; jz <= hR+hL; jz++) { + for (int jx = 0; jx <= hR+hL; jx++) { + sum += READ_IMAGE_XZ * READ_FILTER_VAL(jx, jz); + } + } + output[(gidz*Ny + gidy)*Nx + gidx] = sum; +} + + +// Convolution with 2D kernel +// Works for batched 2D on 3D. +__kernel void convol_2D_YZ_tex( + read_only IMAGE_TYPE input, + __global float * output, + read_only FILTER_TYPE filter, + int Lx, // filter number of columns, + int Lz, // filter number of rows, + int Nx, // input/output number of columns + int Ny, // input/output number of rows + int Nz // input/output depth +) +{ + uint gidx = get_global_id(0); + uint gidy = get_global_id(1); + uint gidz = get_global_id(2); + if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; + + int c, hL, hR; + GET_CENTER_HL(Lx); + float sum = 0.0f; + + for (int jz = 0; jz <= hR+hL; jz++) { + for (int jy = 0; jy <= hR+hL; jy++) { + sum += READ_IMAGE_YZ * READ_FILTER_VAL(jy, jz); + } + } + output[(gidz*Ny + gidy)*Nx + gidx] = sum; +} +#endif + + +/******************************************************************************/ +/**************************** 3D Convolution **********************************/ +/******************************************************************************/ + +#if IMAGE_DIMS == 3 && FILTER_DIMS == 3 +// Convolution with 3D kernel +__kernel void convol_3D_XYZ_tex( + read_only IMAGE_TYPE input, + __global float * output, + read_only FILTER_TYPE filter, + int Lx, // filter number of 
columns, + int Ly, // filter number of rows, + int Lz, // filter number of rows, + int Nx, // input/output number of columns + int Ny, // input/output number of rows + int Nz // input/output depth +) +{ + uint gidx = get_global_id(0); + uint gidy = get_global_id(1); + uint gidz = get_global_id(2); + if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return; + + int c, hL, hR; + GET_CENTER_HL(Lx); + float sum = 0.0f; + + for (int jz = 0; jz <= hR+hL; jz++) { + for (int jy = 0; jy <= hR+hL; jy++) { + for (int jx = 0; jx <= hR+hL; jx++) { + sum += READ_IMAGE_XYZ * READ_FILTER_VAL(jx, jy, jz); + } + } + } + output[(gidz*Ny + gidy)*Nx + gidx] = sum; +} +#endif diff --git a/src/silx/resources/opencl/doubleword.cl b/src/silx/resources/opencl/doubleword.cl new file mode 100644 index 0000000..02a8aba --- /dev/null +++ b/src/silx/resources/opencl/doubleword.cl @@ -0,0 +1,122 @@ +/* + * OpenCL library for double word floating point calculation using compensated arithmetics + * + * The theoritical basis can be found in Valentina Popescu's PhD thesis: + * Towards fast and certified multi-precision libraries + * Reference LYSEN036 + * http://www.theses.fr/2017LYSEN036 + * All page number and equation number are refering to this document. + * + * The precision of the calculation (bounds) is provided in ULP (smallest possible mantissa) + * and come from the table 2.2 (page 68 of the thesis). + * The number of equivalent FLOP is taken from the table 2.3 (page 69 the thesis). + * Note that FLOP are not all equal: a division is much more expensive than an addition. + */ + +//This library can be expanded to double-double by redefining fp, fp2 and one to double, double2 and 1.0. +#ifdef DOUBLEDOUBLE +#define fp double +#define fp2 double2 +#define one 1.0 +#else +#define fp float +#define fp2 float2 +#define one 1.0f +#endif + +/* Nota: i386 computer use x87 registers which are larger than the 32bits precision + * which can invalidate the error compensation mechanism. 
+ * + * We use the trick to declare some variable "volatile" to enforce the actual + * precision reduction of those variables. + * This has to be used in combination with #pragma clang fp contract(on) +*/ + +#ifndef X87_VOLATILE +# define X87_VOLATILE +#endif + +//Algorithm 1, p23, theorem 1.1.12. Requires e_x > e_y, valid if |x| > |y| +inline fp2 fast_fp_plus_fp(fp x, fp y){ + #pragma clang fp contract(on) + X87_VOLATILE fp s = x + y; + X87_VOLATILE fp z = s - x; + fp e = y - z; + return (fp2)(s, e); +} + +//Algorithm 2, p24, same as fast_fp_plus_fp without the condition on e_x and e_y +inline fp2 fp_plus_fp(fp x, fp y){ + #pragma clang fp contract(on) + X87_VOLATILE fp s = x + y; + X87_VOLATILE fp xp = s - y; + X87_VOLATILE fp yp = s - xp; + X87_VOLATILE fp dx = x - xp; + X87_VOLATILE fp dy = y - yp; + return (fp2)(s, dx+dy); +} + +//Algorithm 3, p24: multiplication with a FMA +inline fp2 fp_times_fp(fp x, fp y){ + fp p = x * y; + fp e = fma(x, y, -p); + return (fp2)(p, e); +} + +//Algorithm 7, p38: Addition of a FP to a DW. 
10flop bounds:2u²+5u³ +inline fp2 dw_plus_fp(fp2 x, fp y){ + #pragma clang fp contract(on) + fp2 s = fp_plus_fp(x.s0, y); + X87_VOLATILE fp v = x.s1 + s.s1; + return fast_fp_plus_fp(s.s0, v); +} + +//Algorithm 9, p40: addition of two DW: 20flop bounds:3u²+13u³ +inline fp2 dw_plus_dw(fp2 x, fp2 y){ + fp2 s = fp_plus_fp(x.s0, y.s0); + fp2 t = fp_plus_fp(x.s1, y.s1); + fp2 v = fast_fp_plus_fp(s.s0, s.s1 + t.s0); + return fast_fp_plus_fp(v.s0, t.s1 + v.s1); +} + +//Algorithm 12, p49: Multiplication FP*DW: 6flops bounds:2u² +inline fp2 dw_times_fp(fp2 x, fp y){ + fp2 c = fp_times_fp(x.s0, y); + return fast_fp_plus_fp(c.s0, fma(x.s1, y, c.s1)); +} + +//Algorithm 14, p52: Multiplication DW*DW, 8 flops bounds:6u² +inline fp2 dw_times_dw(fp2 x, fp2 y){ + #pragma clang fp contract(on) + fp2 c = fp_times_fp(x.s0, y.s0); + X87_VOLATILE fp l = fma(x.s1, y.s0, x.s0 * y.s1); + return fast_fp_plus_fp(c.s0, c.s1 + l); +} + +//Algorithm 17, p55: Division DW / FP, 10flops bounds: 3.5u² +inline fp2 dw_div_fp(fp2 x, fp y){ + #pragma clang fp contract(on) + X87_VOLATILE fp th = x.s0 / y; + fp2 pi = fp_times_fp(th, y); + fp2 d = x - pi; + X87_VOLATILE fp delta = d.s0 + d.s1; + X87_VOLATILE fp tl = delta/y; + return fast_fp_plus_fp(th, tl); +} + +//Derived from algorithm 20, p64: Inversion 1/ DW, 22 flops +inline fp2 inv_dw(fp2 y){ + #pragma clang fp contract(on) + X87_VOLATILE fp th = one/y.s0; + X87_VOLATILE fp rh = fma(-y.s0, th, one); + X87_VOLATILE fp rl = -y.s1 * th; + fp2 e = fast_fp_plus_fp(rh, rl); + fp2 delta = dw_times_fp(e, th); + return dw_plus_fp(delta, th); +} + +//Algorithm 20, p64: Division DW / DW, 30 flops: bounds:9.8u² +inline fp2 dw_div_dw(fp2 x, fp2 y){ + return dw_times_dw(x, inv_dw(y)); +} + diff --git a/src/silx/resources/opencl/image/cast.cl b/src/silx/resources/opencl/image/cast.cl new file mode 100644 index 0000000..9e23a82 --- /dev/null +++ b/src/silx/resources/opencl/image/cast.cl @@ -0,0 +1,181 @@ +/* + * Project: SILX: Alogorithms for image processing + * + 
* Copyright (C) 2013-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. Kieffer (kieffer@esrf.fr) + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef NB_COLOR + #define NB_COLOR 1 +#endif + + +/** + * \brief Cast values of an array of uint8 into a float output array. 
+ * + * :param array_input: Pointer to global memory with the input data as unsigned8 array + * :param array_float: Pointer to global memory with the output data as float array + * :param width: Width of the image + * :param height: Height of the image + */ +kernel void u8_to_float( global unsigned char *array_input, + global float *array_float, + const int width, + const int height) +{ + //Global memory guard for padding + if ((get_global_id(0) < width) && (get_global_id(1)<height)) + { + int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1)); + for (int c=0; c<NB_COLOR; c++) + { + array_float[i + c] = (float) array_input[i + c]; + } //end loop over colors + } //end test in image +} //end kernel + +/** + * \brief Cast values of an array of uint8 into a float output array. + * + * :param array_input: Pointer to global memory with the input data as signed8 array + * :param array_float: Pointer to global memory with the output data as float array + * :param width: Width of the image + * :param height: Height of the image + */ +kernel void s8_to_float( global char *array_input, + global float *array_float, + const int width, + const int height) +{ + //Global memory guard for padding + if ((get_global_id(0) < width) && (get_global_id(1)<height)) + { + int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1)); + for (int c=0; c<NB_COLOR; c++) + { + array_float[i + c] = (float) array_input[i + c]; + } //end loop over colors + } //end test in image +} //end kernel + + +/** + * \brief cast values of an array of uint16 into a float output array. 
+ *
+ * :param array_input: global memory pointer to the source data (unsigned 16-bit)
+ * :param array_float: global memory pointer to the destination data (float)
+ * :param width: image width in pixels
+ * :param height: image height in pixels
+ */
+kernel void u16_to_float(global unsigned short *array_input,
+                         global float *array_float,
+                         const int width,
+                         const int height)
+{
+    int col = get_global_id(0);
+    int row = get_global_id(1);
+    // Threads scheduled beyond the image (grid padding) have nothing to do.
+    if ((col >= width) || (row >= height))
+        return;
+    int base = NB_COLOR * (col + width * row);
+    // Promote every color component of this pixel to float.
+    for (int channel = 0; channel < NB_COLOR; channel++)
+    {
+        array_float[base + channel] = (float) array_input[base + channel];
+    }
+}
+
+/**
+ * \brief cast values of an array of int16 into a float output array.
+ *
+ * :param array_input: global memory pointer to the source data (signed 16-bit)
+ * :param array_float: global memory pointer to the destination data (float)
+ * :param width: image width in pixels
+ * :param height: image height in pixels
+ */
+kernel void s16_to_float(global short *array_input,
+                         global float *array_float,
+                         const int width,
+                         const int height)
+{
+    int col = get_global_id(0);
+    int row = get_global_id(1);
+    if ((col >= width) || (row >= height))
+        return; // out-of-image guard (grid padding)
+    int base = NB_COLOR * (col + width * row);
+    // Promote every color component of this pixel to float.
+    for (int channel = 0; channel < NB_COLOR; channel++)
+    {
+        array_float[base + channel] = (float) array_input[base + channel];
+    }
+}
+
+/**
+ * \brief cast values of an array of uint32 into a float output array.
+ * + * :param array_input: Pointer to global memory with the input data as unsigned32 array + * :param array_float: Pointer to global memory with the output data as float array + * :param width: Width of the image + * :param height: Height of the image + */ +kernel void u32_to_float(global unsigned int *array_input, + global float *array_float, + const int width, + const int height) +{ + //Global memory guard for padding + if ((get_global_id(0) < width) && (get_global_id(1)<height)) + { + int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1)); + for (int c=0; c<NB_COLOR; c++) + { + array_float[i + c] = (float) array_input[i + c]; + } //end loop over colors + } //end test in image +}//end kernel + + +/** + * \brief convert values of an array of int32 into a float output array. + * + * :param array_input: Pointer to global memory with the data in int + * :param array_float: Pointer to global memory with the data in float + * :param width: Width of the image + * :param height: Height of the image + */ +kernel void s32_to_float(global int *array_input, + global float *array_float, + const int width, + const int height) +{ + //Global memory guard for padding + if ((get_global_id(0) < width) && (get_global_id(1)<height)) + { + int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1)); + for (int c=0; c<NB_COLOR; c++) + { + array_float[i + c] = (float) array_input[i + c]; + } //end loop over colors + } //end test in image +}//end kernel + diff --git a/src/silx/resources/opencl/image/histogram.cl b/src/silx/resources/opencl/image/histogram.cl new file mode 100644 index 0000000..7fb1485 --- /dev/null +++ b/src/silx/resources/opencl/image/histogram.cl @@ -0,0 +1,178 @@ +//CL// + +/* + * Project: SILX: Alogorithms for image processing + * + * Copyright (C) 2013-2018 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. 
Kieffer (kieffer@esrf.fr) + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* Single kernel histogram for float array. + * + * Can perform histograming in log-scale (using the arcsinh) + +Parameters: + - data: buffer with the image content in float (input) + - data_size: input + - mini: Lower bound of the first bin + - maxi: upper bouns of the last bin + - map_operation: Type of pre-processor to use. if if set to !=0, use log scale + - hist: resulting histogram (ouptut) + - hist_size: number of bins + - tmp_hist: temporary storage of size hist_size*num_groups + - processed: temporary storage of size 1 initialially set to 0 + - local_hist: local storage of size hist_size + + +Grid information: + * use the largest WG size. If it is larger than the number of bins (hist_size), + take the power of 2 immediately above + *Schedule as many WG as the device has compute engines. 
No need to schuedule more, + it is just a waist of memory + * The histogram should fit in shared (local) memory and tmp_hist can be large! + +Assumes: + hist and local_hist have size hist_size + edges has size hist_size+2 + tmp_hist has size hist_size*num_groups + processed is of size one and initially set to 0 + +*/ + + +static float preprocess(float value, int code) +{ + //This function can be modified to have more scales + if (code!=0) + { + return asinh(value); + } + else + { + return value; + } +} + +kernel void histogram(global float *data, + int data_size, + float mini, + float maxi, + int map_operation, + global int *hist, + global float *edges, + int hist_size, + global int *tmp_hist, + global int *processed, + local int *local_hist) +{ + // each thread + int lid = get_local_id(0); + int ws = get_local_size(0); + int nb_engine = get_num_groups(0); + int engine_id = get_group_id(0); + + // memset the local_hist array + for (int i=0; i<hist_size; i+=ws) + { + int j = i + lid; + if (j<hist_size) + { + local_hist[j] = 0; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Process the local data and accumulate in shared memory + int bloc_size = (int) ceil((float)data_size/(float)nb_engine); + int start = bloc_size * engine_id; + int stop = min(start + bloc_size, data_size); + float vmini = preprocess(mini, map_operation); + float vscale = (float)hist_size/(preprocess(maxi, map_operation) -vmini); + for (int i = start; i<stop; i+=ws) + { + int address = i + lid; + if (address < stop) + { + float value = data[address]; + if ((!isnan(value)) && (value>=mini) && (value<=maxi)) + { + float vvalue = (preprocess(value, map_operation)-vmini)*vscale; + int idx = clamp((int) vvalue, 0, hist_size - 1); + atomic_inc(&local_hist[idx]); + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + //Now copy result into the right place and reset the first value of the shared array + for (int i=0; i<hist_size; i+=ws) + { + int j = i + lid; + if (j<hist_size) + { + tmp_hist[hist_size * engine_id 
+ j] = local_hist[j]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + local_hist[0] = 0; + + barrier(CLK_LOCAL_MEM_FENCE); + + //Increment the system wide shared variable processed and share the result with the workgroup + if (lid == 0) + { + local_hist[0] = atomic_inc(processed); + } + barrier(CLK_LOCAL_MEM_FENCE); + + // If we are the engine last to work, perform the concatenation of all results + + if ((local_hist[0] + 1) == nb_engine) + { + for (int i=0; i<hist_size; i+=ws) + { + int j = i + lid; + int lsum = 0; + if (j<hist_size) + { + for (int k=0; k<nb_engine; k++) + { + lsum += tmp_hist[hist_size * k + j]; + } + hist[j] = lsum; + edges[j] = vmini + j/vscale; + } + } + // Finally reset the counter + if (lid == 0) + { + processed[0] = 0; + edges[hist_size] = vmini + hist_size/vscale;; + } + + } +} diff --git a/src/silx/resources/opencl/image/map.cl b/src/silx/resources/opencl/image/map.cl new file mode 100644 index 0000000..804a5a1 --- /dev/null +++ b/src/silx/resources/opencl/image/map.cl @@ -0,0 +1,85 @@ +/* + * Project: SILX: Data analysis library + * kernel for maximum and minimum calculation + * + * + * Copyright (C) 2013-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. Kieffer (kieffer@esrf.fr) + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * + */ + +#ifndef NB_COLOR + #define NB_COLOR 1 +#endif + + +/** + * \brief Linear scale of signal in an image (maybe multi-color) + * + * :param image: contains the input image + * :param output: contains the output image after scaling + * :param max_min_input: 2-floats containing the maximum and the minimum value of image + * :param minimum: float scalar with the minimum of the output + * :param maximum: float scalar with the maximum of the output + * :param width: integer, number of columns the matrices + * :param height: integer, number of lines of the matrices + * + * + */ + + +kernel void normalize_image(global float* image, + global float* output, + int width, + int height, + global float* max_min_input, + float minimum, + float maximum + ) +{ + //Global memory guard for padding + if((get_global_id(0) < width) && (get_global_id(1)<height)) + { + int idx = NB_COLOR* (get_global_id(0) + width * get_global_id(1)); + float mini_in, maxi_in, scale; + maxi_in = max_min_input[0]; + mini_in = max_min_input[1]; + if (maxi_in == mini_in) + { + scale = NAN; + } + else + { + scale = (maximum - minimum) / (maxi_in - mini_in); + } + + for (int c=0; c<NB_COLOR; c++) + { + output[idx+c] = fma(scale, image[idx+c]-mini_in, minimum); + } // end color loop + }//end if in IMAGE +}//end kernel diff --git a/src/silx/resources/opencl/image/max_min.cl b/src/silx/resources/opencl/image/max_min.cl new file mode 100644 index 0000000..246cd48 --- /dev/null +++ 
b/src/silx/resources/opencl/image/max_min.cl @@ -0,0 +1,207 @@ +/* + * Project: SILX: Data analysis library + * kernel for maximum and minimum calculation + * + * + * Copyright (C) 2013-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. Kieffer (kieffer@esrf.fr) + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * + */ + +#ifndef NB_COLOR + #define NB_COLOR 1 +#endif + + + +#define REDUCE(a, b) ((float2) (fmax(a.x, b.x), fmin(a.y, b.y))) + +static float2 read_and_map(int idx, + global float* data) +{ + idx *= NB_COLOR; + float tmp = data[idx]; + float2 res = (float2) (tmp, tmp); + if (NB_COLOR > 1) + { + for (int c=1; c<NB_COLOR; c++) + { + tmp = data[idx + c]; + res = (float2) (fmax(res.x, tmp), fmin(res.y, tmp)); + } + } + return res; +} + + +/** + * \brief max_min_global_stage1: Look for the maximum an the minimum of an array. 
stage1 + * + * optimal workgroup size: 2^n greater than sqrt(size), limited to 512 + * optimal total item size: (workgroup size)^2 + * if size >total item size: adjust seq_count. + * + * :param data: Float pointer to global memory storing the vector of data. + * :param out: Float2 pointer to global memory storing the temporary results (workgroup size) + * :param seq_count: how many blocksize each thread should read + * :param size: size of the problem (number of element in the image + * :param l_data Shared memory: 2 float per thread in workgroup + * +**/ + + +kernel void max_min_reduction_stage1( global const float *data, + global float2 *out, + int size, + local float2 *l_data)// local storage 2 float per thread +{ + int group_size = get_local_size(0); + int lid = get_local_id(0); + float2 acc; + int big_block = group_size * get_num_groups(0); + int i = lid + group_size * get_group_id(0); + + if (lid<size) + acc = read_and_map(lid, data); + else + acc = read_and_map(0, data); + + // Linear pre-reduction stage 0 + + while (i<size){ + acc = REDUCE(acc, read_and_map(i, data)); + i += big_block; + } + + // parallel reduction stage 1 + + l_data[lid] = acc; + barrier(CLK_LOCAL_MEM_FENCE); + for (int block=group_size/2; block>1; block/=2) + { + if ((lid < block) && ((lid + block)<group_size)){ + l_data[lid] = REDUCE(l_data[lid], l_data[lid + block]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (lid == 0) + { + if (group_size > 1) + { + acc = REDUCE(l_data[0], l_data[1]); + } + else + { + acc = l_data[0]; + } + out[get_group_id(0)] = acc; + } +} + + +/** + * \brief global_max_min: Look for the maximum an the minimum of an array. + * + * + * + * :param data2: Float2 pointer to global memory storing the vector of pre-reduced data (workgroup size). 
+ * :param maximum: Float pointer to global memory storing the maximum value + * :param minumum: Float pointer to global memory storing the minimum value + * :param l_data Shared memory: 2 float per thread in workgroup + * +**/ + +kernel void max_min_reduction_stage2( + global const float2 *data2, + global float2 *maxmin, + local float2 *l_data)// local storage 2 float per thread +{ + int lid = get_local_id(0); + int group_size = get_local_size(0); + float2 acc = (float2)(-1.0f, -1.0f); + if (lid<=group_size) + { + l_data[lid] = data2[lid]; + } + else + { + l_data[lid] = acc; + } + + // parallel reduction stage 2 + + + barrier(CLK_LOCAL_MEM_FENCE); + for (int block=group_size/2; block>1; block/=2) + { + if ((lid < block) && ((lid + block)<group_size)) + { + l_data[lid] = REDUCE(l_data[lid], l_data[lid + block]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + } + + if (lid == 0 ) + { + if ( group_size > 1) + { + acc = REDUCE(l_data[0], l_data[1]); + } + else + { + acc = l_data[0]; + } + maxmin[0] = acc; + } +} + +/*This is the serial version of the min_max kernel. + * + * It has to be launched with WG=1 and only 1 WG has to be launched ! + * + * :param data: Float pointer to global memory storing the vector of data. 
:param size: size of the data array
+*/ + +#ifndef X87_VOLATILE +# define X87_VOLATILE +#endif + +// calculate acc.s0+value with error compensation +// see https://en.wikipedia.org/wiki/Kahan_summation_algorithm +// unlike wikipedia, here the exact value = acc.s0 + acc.s1 (performed in higher precision) + +static inline float2 kahan_sum(float2 acc, float value) +{ + if (value) + { + float sum = acc.s0; + float err = acc.s1; + if (fabs(value) > fabs(sum)) + { + float tmp = sum; + sum = value; + value = tmp; + } + + float cor = value + err; + X87_VOLATILE float target = sum + cor; + err = cor - (target - sum); + return (float2)(target, err); + } + else + { + return acc; + } +} + +// calculate a*b with error compensation +// see https://hal.archives-ouvertes.fr/hal-01367769/document +static inline float2 comp_prod(float a, float b) +{ + X87_VOLATILE float x = a * b; + float y = fma(a, b, -x); + return (float2)(x, y); +} + +// calculate a + b with error compensation +static inline float2 compensated_sum(float2 a, float2 b) +{ + float err = a.s1 + b.s1; + float first = a.s0; + float second = b.s0; + if (fabs(second) > fabs(first)) + {//swap first and second + float tmp = first; + first = second; + second = tmp; + } + float cor = second + err; + X87_VOLATILE float target = first + cor; + err = cor - (target - first); + return (float2)(target, err); +} + +// calculate a * b with error compensation +static inline float2 compensated_mul(float2 a, float2 b) +{ + float2 tmp; + tmp = kahan_sum((float2)(a.s0*b.s0, a.s0*b.s1), a.s1*b.s0); + tmp = kahan_sum(tmp, a.s1*b.s1); + return tmp; +} + +// calculate 1/a with error compensation (Needs validation) +static inline float2 compensated_inv(float2 a) +{ + float main = a.s0; + float err = a.s1; + return (float2)(1.0f/main, -err/(main*main)); +} + +// calculate a/b with error compensation (Needs validation) +static inline float2 compensated_div(float2 a, float2 b) +{ + float ah = a.s0; + float al = a.s1; + float bh = b.s0; + float bl = b.s1; + float bl_over_bh = 
bl/bh; + return kahan_sum(kahan_sum(a, -a.s1 * bl_over_bh), -a.s0 * bl_over_bh) / bh; +} + + +// calculate a.b where a and b are float2 +static inline float2 comp_dot2(float2 a, float2 b) +{ + return compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)); +} + +// calculate a.b where a and b are float3 +static inline float2 comp_dot3(float3 a, float3 b) +{ + return compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)), comp_prod(a.s2, b.s2)); +} + +// calculate a.b where a and b are float4 +static inline float2 comp_dot4(float4 a, float4 b) +{ + return compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)), + compensated_sum(comp_prod(a.s2, b.s2), comp_prod(a.s3, b.s3))); +} + +// calculate a.b where a and b are float8 +static inline float2 comp_dot8(float8 a, float8 b) +{ + return compensated_sum( + compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)), + compensated_sum(comp_prod(a.s2, b.s2), comp_prod(a.s3, b.s3))), + compensated_sum(compensated_sum(comp_prod(a.s4, b.s4), comp_prod(a.s5, b.s5)), + compensated_sum(comp_prod(a.s6, b.s6), comp_prod(a.s7, b.s7)))); +} + +// calculate a.b where a and b are float8 +static inline float2 comp_dot16(float16 a, float16 b) +{ + return compensated_sum( + compensated_sum( + compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)), + compensated_sum(comp_prod(a.s2, b.s2), comp_prod(a.s3, b.s3))), + compensated_sum(compensated_sum(comp_prod(a.s4, b.s4), comp_prod(a.s5, b.s5)), + compensated_sum(comp_prod(a.s6, b.s6), comp_prod(a.s7, b.s7)))), + compensated_sum( + compensated_sum(compensated_sum(comp_prod(a.s8, b.s8), comp_prod(a.s9, b.s9)), + compensated_sum(comp_prod(a.sa, b.sa), comp_prod(a.sb, b.sb))), + compensated_sum(compensated_sum(comp_prod(a.sc, b.sc), comp_prod(a.sd, b.sd)), + compensated_sum(comp_prod(a.se, b.se), comp_prod(a.sf, b.sf))))); +} diff --git a/src/silx/resources/opencl/linalg.cl 
b/src/silx/resources/opencl/linalg.cl new file mode 100644 index 0000000..8710528 --- /dev/null +++ b/src/silx/resources/opencl/linalg.cl @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +/** + * + * Compute the spatial gradient of an image. + * + * slice: input image + * slice_grad: output gradient + * sizeX: number of columns of the image + * sizeY: number of rows of the image + * + **/ +kernel void kern_gradient2D( + global float* slice, + global float2* slice_grad, + int sizeX, + int sizeY) +{ + + int gidx = (int) get_global_id(0); + int gidy = (int) get_global_id(1); + + if ((gidx < sizeX) && (gidy < sizeY)) + { + // Note the direction inconstancy ! (JK 07/2018) + + float val_y = (gidx == (sizeX-1))? 0: slice[gidy*sizeX+gidx+1] - slice[gidy*sizeX+gidx]; + float val_x = (gidy == (sizeY-1))? 
0: slice[(gidy+1)*sizeX+gidx] - slice[(gidy)*sizeX+gidx]; + + slice_grad[gidy*sizeX+gidx].x = val_x; + slice_grad[gidy*sizeX+gidx].y = val_y; + } +} + +/** + * + * Compute the spatial divergence of an image gradient. + * + * slice_grad: input gradient-like image + * slice: output image + * sizeX: number of columns of the input + * sizeY: number of rows of the input + * + **/ +kernel void kern_divergence2D( + global float2* slice_grad, + global float* slice, + int sizeX, + int sizeY) +{ + int gidx = (int) get_global_id(0); + int gidy = (int) get_global_id(1); + + if (gidx < sizeX && gidy < sizeY) + { + float val_x, val_y; + val_y = (gidx == 0)? + slice_grad[(gidy)*sizeX+gidx].y : + slice_grad[(gidy)*sizeX+gidx].y - slice_grad[(gidy)*sizeX+gidx-1].y; + val_x = (gidy == 0)? + slice_grad[(gidy)*sizeX+gidx].x: + slice_grad[(gidy)*sizeX+gidx].x - slice_grad[(gidy-1)*sizeX+gidx].x; + slice[gidy*sizeX+gidx] = val_x + val_y; + } +} diff --git a/src/silx/resources/opencl/medfilt.cl b/src/silx/resources/opencl/medfilt.cl new file mode 100644 index 0000000..0680230 --- /dev/null +++ b/src/silx/resources/opencl/medfilt.cl @@ -0,0 +1,141 @@ +/* + * Project: Azimuthal regroupping OpenCL kernel for PyFAI. + * Median filter for 1D, 2D and 3D datasets, only 2D for now + * + * + * Copyright (C) 2017-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. 
Kieffer (kieffer@esrf.fr) + * Last revision: 07/02/2017 + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +/* + * Needs to be concatenated with bitonic.cl prior to compilation +*/ + +/* + * Perform the 2D median filtering of an image 2D image + * + * dim0 => wg=number_of_element in the tile /8 + * dim1 = x: wg=1 + + * + * Actually the workgoup size is a bit more complicated: + * if window size = 1,3,5,7: WG1=8 + * if window size = 9,11,13,15: WG1=32 + * if window size = 17, ...,21: WG1=64 + * + * More Generally the workgroup size must be: at least: kfs2*(kfs1+7)/8 + * Each thread treats 8 values aligned vertically, this allows (almost) + * coalesced reading between threads in one line of the tile. + * + * Later on it will be more efficient to re-use the same tile + * and slide it vertically by one line. + * The additionnal need for shared memory will be kfs2 floats and a float8 as register. 
Theoretically, it should be possible to handle up to window-size 83x83
+ l_data[threadid].s0 = input.vec.s0; + + //Offset to the bottom + input.vec = (float8)(input.vec.s1, + input.vec.s2, + input.vec.s3, + input.vec.s4, + input.vec.s5, + input.vec.s6, + input.vec.s7, + MAXFLOAT); + + barrier(CLK_LOCAL_MEM_FENCE); + + int read_from = threadid + kfs2; + if (read_from < nb_threads) + input.vec.s7 = l_data[read_from].s0; + else if (threadid < nb_threads) //we are on the last band + { + int pos_y = clamp((int)(y + 8 * band_nr + max_vec - 1 - khs1), (int) 0, (int) height-1); + input.ary[max_vec - 1] = image[pos_x + width * pos_y]; + } + + } + + //This function is defined in bitonic.cl + output.vec = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0), + input.vec, l_data); + + size_t target = (kfs1 * kfs2) / 2; + if (threadid == (target / 8)) + { + result[y * width + x] = output.ary[target % 8]; + } + + } + } +} + diff --git a/src/silx/resources/opencl/preprocess.cl b/src/silx/resources/opencl/preprocess.cl new file mode 100644 index 0000000..de35c48 --- /dev/null +++ b/src/silx/resources/opencl/preprocess.cl @@ -0,0 +1,567 @@ +/* + * Project: Azimuthal regroupping OpenCL kernel for PyFAI. + * Preprocessing program + * + * + * Copyright (C) 2012-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. Kieffer (kieffer@esrf.fr) + * Last revision: 19/01/2017 + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +/** + * \file + * + * \brief OpenCL kernels for image array casting, array mem-setting and normalizing + * + * Constant to be provided at build time: + * NIMAGE: size of the image + * + */ + +#include "for_eclipse.h" + +/** + * \brief cast values of an array of int8 into a float output array. + * + * - array_s8: Pointer to global memory with the input data as signed8 array + * - array_float: Pointer to global memory with the output data as float array + */ +__kernel void +s8_to_float(__global char *array_s8, + __global float *array_float +) +{ + int i = get_global_id(0); + //Global memory guard for padding + if (i < NIMAGE) + array_float[i] = (float)array_s8[i]; +} + + +/** + * \brief cast values of an array of uint8 into a float output array. + * + * - array_u8: Pointer to global memory with the input data as unsigned8 array + * - array_float: Pointer to global memory with the output data as float array + */ +__kernel void +u8_to_float(__global unsigned char *array_u8, + __global float *array_float +) +{ + int i = get_global_id(0); + //Global memory guard for padding + if (i < NIMAGE) + array_float[i] = (float)array_u8[i]; +} + + +/** + * \brief cast values of an array of int16 into a float output array. 
+ * + * - array_s16: Pointer to global memory with the input data as signed16 array + * - array_float: Pointer to global memory with the output data as float array + */ +__kernel void +s16_to_float(__global short *array_s16, + __global float *array_float +) +{ + int i = get_global_id(0); + //Global memory guard for padding + if (i < NIMAGE) + array_float[i] = (float)array_s16[i]; +} + + +/** + * \brief cast values of an array of uint16 into a float output array. + * + * - array_u16: Pointer to global memory with the input data as unsigned16 array + * - array_float: Pointer to global memory with the output data as float array + */ +__kernel void +u16_to_float(__global unsigned short *array_u16, + __global float *array_float +) +{ + int i = get_global_id(0); + //Global memory guard for padding + if (i < NIMAGE) + array_float[i] = (float)array_u16[i]; +} + +/** + * \brief cast values of an array of uint32 into a float output array. + * + * - array_u32: Pointer to global memory with the input data as unsigned32 array + * - array_float: Pointer to global memory with the output data as float array + */ +__kernel void +u32_to_float(__global unsigned int *array_u32, + __global float *array_float +) +{ + int i = get_global_id(0); + //Global memory guard for padding + if (i < NIMAGE) + array_float[i] = (float)array_u32[i]; +} + +/** + * \brief convert values of an array of int32 into a float output array. + * + * - array_int: Pointer to global memory with the data as unsigned32 array + * - array_float: Pointer to global memory with the data float array + */ +__kernel void +s32_to_float(__global int *array_int, + __global float *array_float +) +{ + int i = get_global_id(0); + //Global memory guard for padding + if (i < NIMAGE) + array_float[i] = (float)(array_int[i]); +} + + +/** + * Functions to be called from an actual kernel. 
+ * \brief Performs the normalization of input image by dark subtraction, + * variance is propagated to second member of the float3 + * flatfield, solid angle, polarization and absorption are stored in + * third member of the float3 returned. + * + * Invalid/Dummy pixels will all have the third-member set to 0, i.e. no weight + * + * - image Float pointer to global memory storing the input image. + * - do_dark Bool/int: shall dark-current correction be applied ? + * - dark Float pointer to global memory storing the dark image. + * - do_flat Bool/int: shall flat-field correction be applied ? + * - flat Float pointer to global memory storing the flat image. + * - do_solidangle Bool/int: shall flat-field correction be applied ? + * - solidangle Float pointer to global memory storing the solid angle of each pixel. + * - do_polarization Bool/int: shall polarization correction be applied ? + * - polarization Float pointer to global memory storing the polarization of each pixel. + * - do_absorption Bool/int: shall absorption correction be applied ? + * - absorption Float pointer to global memory storing the effective absoption of each pixel. + * - do_mask perform mask correction ? + * - mask Bool/char pointer to mask array + * - do_dummy Bool/int: shall the dummy pixel be checked. 
Dummy pixel are pixels marked as bad and ignored + * - dummy Float: value for bad pixels + * - delta_dummy Float: precision for bad pixel value + * - normalization_factor : divide the input by this value + * +**/ + +static float3 _preproc3(const __global float *image, + const __global float *variance, + const char do_dark, + const __global float *dark, + const char do_dark_variance, + const __global float *dark_variance, + const char do_flat, + const __global float *flat, + const char do_solidangle, + const __global float *solidangle, + const char do_polarization, + const __global float *polarization, + const char do_absorption, + const __global float *absorption, + const char do_mask, + const __global char *mask, + const char do_dummy, + const float dummy, + const float delta_dummy, + const float normalization_factor) +{ + size_t i= get_global_id(0); + float3 result = (float3)(0.0, 0.0, 0.0); + if (i < NIMAGE) + { + if ((!do_mask) || (!mask[i])) + { + result.s0 = image[i]; + if (variance != 0) + result.s1 = variance[i]; + result.s2 = normalization_factor; + if ( (!do_dummy) + ||((delta_dummy != 0.0f) && (fabs(result.s0-dummy) > delta_dummy)) + ||((delta_dummy == 0.0f) && (result.s0 != dummy))) + { + if (do_dark) + result.s0 -= dark[i]; + if (do_dark_variance) + result.s1 += dark_variance[i]; + if (do_flat) + { + float one_flat = flat[i]; + if ( (!do_dummy) + ||((delta_dummy != 0.0f) && (fabs(one_flat-dummy) > delta_dummy)) + ||((delta_dummy == 0.0f) && (one_flat != dummy))) + result.s2 *= one_flat; + else + result.s2 = 0.0f; + } + if (do_solidangle) + result.s2 *= solidangle[i]; + if (do_polarization) + result.s2 *= polarization[i]; + if (do_absorption) + result.s2 *= absorption[i]; + if (isnan(result.s0) || isnan(result.s1) || isnan(result.s2) || (result.s2 == 0.0f)) + result = (float3)(0.0, 0.0, 0.0); + } + else + { + result = (float3)(0.0, 0.0, 0.0); + }//end if do_dummy + } // end if mask + };//end if NIMAGE + return result; +};//end function + + +/** + * 
\brief Performs the normalization of input image by dark subtraction, + * flatfield, solid angle, polarization and absorption division. + * + * Intensities of images are corrected by: + * - dark (read-out) noise subtraction + * - Solid angle correction (division) + * - polarization correction (division) + * - flat fiels correction (division) + * Corrections are made in place unless the pixel is dummy. + * Dummy pixels are left untouched so that they remain dummy + * + * - image Float pointer to global memory storing the input image. + * - do_dark Bool/int: shall dark-current correction be applied ? + * - dark Float pointer to global memory storing the dark image. + * - do_flat Bool/int: shall flat-field correction be applied ? + * - flat Float pointer to global memory storing the flat image. + * - do_solidangle Bool/int: shall flat-field correction be applied ? + * - solidangle Float pointer to global memory storing the solid angle of each pixel. + * - do_polarization Bool/int: shall polarization correction be applied ? + * - polarization Float pointer to global memory storing the polarization of each pixel. + * - do_absorption Bool/int: shall absorption correction be applied ? + * - absorption Float pointer to global memory storing the effective absoption of each pixel. + * - do_mask perform mask correction ? + * - mask Bool/char pointer to mask array + * - do_dummy Bool/int: shall the dummy pixel be checked. 
Dummy pixel are pixels marked as bad and ignored + * - dummy Float: value for bad pixels + * - delta_dummy Float: precision for bad pixel value + * - normalization_factor : divide the input by this value + * +**/ + +__kernel void +corrections(const __global float *image, + const char do_dark, + const __global float *dark, + const char do_flat, + const __global float *flat, + const char do_solidangle, + const __global float *solidangle, + const char do_polarization, + const __global float *polarization, + const char do_absorption, + const __global float *absorption, + const char do_mask, + const __global char *mask, + const char do_dummy, + const float dummy, + const float delta_dummy, + const float normalization_factor, + __global float *output + ) +{ + size_t i= get_global_id(0); + float3 result = (float3)(0.0, 0.0, 0.0); + if (i < NIMAGE) + { + result = _preproc3(image, + 0, + do_dark, + dark, + 0, + 0, + do_flat, + flat, + do_solidangle, + solidangle, + do_polarization, + polarization, + do_absorption, + absorption, + do_mask, + mask, + do_dummy, + dummy, + delta_dummy, + normalization_factor); + if (result.s2 != 0.0f) + output[i] = result.s0 / result.s2; + else + output[i] = dummy; + };//end if NIMAGE + +};//end kernel + + +/** + * \brief Performs Normalization of input image with float2 output (num,denom) + * + * Intensities of images are corrected by: + * - dark (read-out) noise subtraction for the data + * - Solid angle correction (denominator) + * - polarization correction (denominator) + * - flat fiels correction (denominator) + * + * Corrections are made out of place. + * Dummy pixels set both the numerator and denominator to 0 + * + * - image Float pointer to global memory storing the input image. + * - do_dark Bool/int: shall dark-current correction be applied ? + * - dark Float pointer to global memory storing the dark image. + * - do_flat Bool/int: shall flat-field correction be applied ? 
+ * - flat Float pointer to global memory storing the flat image. + * - do_solidangle Bool/int: shall flat-field correction be applied ? + * - solidangle Float pointer to global memory storing the solid angle of each pixel. + * - do_polarization Bool/int: shall flat-field correction be applied ? + * - polarization Float pointer to global memory storing the polarization of each pixel. + * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored + * - dummy Float: value for bad pixels + * - delta_dummy Float: precision for bad pixel value + * - normalization_factor : divide the input by this value + * + * +**/ +__kernel void +corrections2(const __global float *image, + const char do_dark, + const __global float *dark, + const char do_flat, + const __global float *flat, + const char do_solidangle, + const __global float *solidangle, + const char do_polarization, + const __global float *polarization, + const char do_absorption, + const __global float *absorption, + const char do_mask, + const __global char *mask, + const char do_dummy, + const float dummy, + const float delta_dummy, + const float normalization_factor, + __global float2 *output + ) +{ + size_t i= get_global_id(0); + float3 result = (float3)(0.0, 0.0, 0.0); + if (i < NIMAGE) + { + result = _preproc3(image, + 0, + do_dark, + dark, + 0, + 0, + do_flat, + flat, + do_solidangle, + solidangle, + do_polarization, + polarization, + do_absorption, + absorption, + do_mask, + mask, + do_dummy, + dummy, + delta_dummy, + normalization_factor); + output[i] = (float2)(result.s0, result.s2); + };//end if NIMAGE +};//end kernel + +/** + * \brief Performs Normalization of input image with float3 output (signal, variance, normalization) assuming poissonian signal + * + * Intensities of images are corrected by: + * - dark (read-out) noise subtraction for the data + * - Solid angle correction (denominator) + * - polarization correction (denominator) + * - flat fiels correction 
(denominator) + * + * Corrections are made out of place. + * Dummy pixels set both the numerator and denominator to 0 + * + * - image Float pointer to global memory storing the input image. + * - do_dark Bool/int: shall dark-current correction be applied ? + * - dark Float pointer to global memory storing the dark image. + * - do_flat Bool/int: shall flat-field correction be applied ? + * - flat Float pointer to global memory storing the flat image. + * - do_solidangle Bool/int: shall flat-field correction be applied ? + * - solidangle Float pointer to global memory storing the solid angle of each pixel. + * - do_polarization Bool/int: shall flat-field correction be applied ? + * - polarization Float pointer to global memory storing the polarization of each pixel. + * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored + * - dummy Float: value for bad pixels + * - delta_dummy Float: precision for bad pixel value + * - normalization_factor : divide the input by this value + * + * +**/ +__kernel void +corrections3Poisson( const __global float *image, + const char do_dark, + const __global float *dark, + const char do_flat, + const __global float *flat, + const char do_solidangle, + const __global float *solidangle, + const char do_polarization, + const __global float *polarization, + const char do_absorption, + const __global float *absorption, + const char do_mask, + const __global char *mask, + const char do_dummy, + const float dummy, + const float delta_dummy, + const float normalization_factor, + __global float3 *output + ) +{ + size_t i= get_global_id(0); + float3 result = (float3)(0.0, 0.0, 0.0); + if (i < NIMAGE) + { + result = _preproc3(image, + image, + do_dark, + dark, + do_dark, + dark, + do_flat, + flat, + do_solidangle, + solidangle, + do_polarization, + polarization, + do_absorption, + absorption, + do_mask, + mask, + do_dummy, + dummy, + delta_dummy, + normalization_factor); + output[i] = result; + 
};//end if NIMAGE +};//end kernel + + +/** + * \brief Performs Normalization of input image with float3 output (signal, variance, normalization) + * + * Intensities of images are corrected by: + * - dark (read-out) noise subtraction for the data + * - Solid angle correction (division) + * - polarization correction (division) + * - flat fiels correction (division) + * Corrections are made in place unless the pixel is dummy. + * Dummy pixels are left untouched so that they remain dummy + * + * - image Float pointer to global memory storing the input image. + * - do_dark Bool/int: shall dark-current correction be applied ? + * - dark Float pointer to global memory storing the dark image. + * - do_flat Bool/int: shall flat-field correction be applied ? + * - flat Float pointer to global memory storing the flat image. + * - do_solidangle Bool/int: shall flat-field correction be applied ? + * - solidangle Float pointer to global memory storing the solid angle of each pixel. + * - do_polarization Bool/int: shall flat-field correction be applied ? + * - polarization Float pointer to global memory storing the polarization of each pixel. + * - do_dummy Bool/int: shall the dummy pixel be checked. 
Dummy pixel are pixels marked as bad and ignored + * - dummy Float: value for bad pixels + * - delta_dummy Float: precision for bad pixel value + * - normalization_factor : divide the input by this value + * + * +**/ + +__kernel void +corrections3(const __global float *image, + const __global float *variance, + const char do_dark, + const __global float *dark, + const char do_dark_variance, + const __global float *dark_variance, + const char do_flat, + const __global float *flat, + const char do_solidangle, + const __global float *solidangle, + const char do_polarization, + const __global float *polarization, + const char do_absorption, + const __global float *absorption, + const char do_mask, + const __global char *mask, + const char do_dummy, + const float dummy, + const float delta_dummy, + const float normalization_factor, + __global float3 *output + ) +{ + size_t i= get_global_id(0); + float3 result = (float3)(0.0, 0.0, 0.0); + if (i < NIMAGE) + { + result = _preproc3( image, + variance, + do_dark, + dark, + do_dark_variance, + dark_variance, + do_flat, + flat, + do_solidangle, + solidangle, + do_polarization, + polarization, + do_absorption, + absorption, + do_mask, + mask, + do_dummy, + dummy, + delta_dummy, + normalization_factor); + output[i] = result; + };//end if NIMAGE +};//end kernel + + diff --git a/src/silx/resources/opencl/proj.cl b/src/silx/resources/opencl/proj.cl new file mode 100644 index 0000000..2a6d870 --- /dev/null +++ b/src/silx/resources/opencl/proj.cl @@ -0,0 +1,345 @@ +/* + * Copyright (C) 2017-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Based on the projector of PyHST2 - https://forge.epn-campus.eu/projects/pyhst2 + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, 
 * distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */

/*******************************************************************************/
/************************ GPU VERSION (with textures) **************************/
/*******************************************************************************/

#ifndef DONT_USE_TEXTURES
// Joseph-method forward projector: computes one sinogram value per work-item
// by summing bilinearly-interpolated slice samples along a ray.
// Work-groups are 16x16: x indexes the detector bin, y the projection angle.
kernel void forward_kernel(
    global float *d_Sino,              // output sinogram, dimrecx * dimrecy
    read_only image2d_t d_slice,       // input slice as a 2D image (texture)
    int dimslice,                      // slice side length (without the 2-pixel border)
    int num_bins,                      // number of detector bins per projection
    global float* angles_per_project , // projection angle for each row of the sinogram
    float axis_position,               // rotation-axis position (in pixels)
    global float *d_axis_corrections,  // per-projection axis correction
    global int* d_beginPos ,           // per-projection ray start positions (2 x dimrecy)
    global int* d_strideJoseph,        // per-projection strides along the Joseph direction (2 x dimrecy)
    global int* d_strideLine ,         // per-projection strides along the line direction (2 x dimrecy)
    int num_projections,
    int dimrecx,                       // sinogram row pitch
    int dimrecy,
    float cpu_offset_x,
    float cpu_offset_y,                // NOTE(review): unused in this kernel; kept for interface symmetry
    int josephnoclip,                  // non-zero: rays never leave the slice, skip the clipping test
    int normalize)                     // non-zero: scale by pi/(2*num_projections)
{
    // CLAMP_TO_EDGE + LINEAR: out-of-range reads clamp, in-range reads interpolate.
    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
    const int tidx = get_local_id(0);
    const int bidx = get_group_id(0);
    const int tidy = get_local_id(1);
    const int bidy = get_group_id(1);
    float angle;
    float cos_angle,sin_angle ;

    local float corrections[16];
    local int beginPos[16*2];
    local int strideJoseph[16*2];
    local int strideLine[16*2];

    // Cooperative load of the per-projection tables into local memory:
    // each tidy "row" of the first rows loads one table half.
    int offset, OFFSET;
    switch(tidy) {
        case 0:
            corrections[ tidx ]= d_axis_corrections[ bidy*16+tidx];
            break;
        case 1:
        case 2:
            offset = 16*(tidy-1);
            OFFSET = dimrecy*(tidy-1);
            beginPos [offset + tidx ]= d_beginPos[ OFFSET+ bidy*16+tidx] ;
            break;
        case 3:
        case 4:
            offset = 16*(tidy-3);
            OFFSET = dimrecy*(tidy-3);
            strideJoseph[offset + tidx ]= d_strideJoseph[OFFSET + bidy*16+tidx] ;
            break;
        case 5:
        case 6:
            offset = 16*(tidy-5);
            OFFSET = dimrecy*(tidy-5);
            strideLine[offset + tidx ]= d_strideLine[OFFSET + bidy*16+tidx] ;
            break;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    angle = angles_per_project[ bidy*16+tidy ] ;
    cos_angle = cos(angle);
    sin_angle = sin(angle);

    // Fold the angle into the octant where |cos| >= 1/sqrt(2) and cos > 0,
    // so that the division by cos_angle below is well-conditioned.
    if(fabs(cos_angle) > 0.70710678f ) {
        if( cos_angle>0) {
            cos_angle = cos(angle);
            sin_angle = sin(angle);
        }
        else {
            cos_angle = -cos(angle);
            sin_angle = -sin(angle);
        }
    }
    else {
        if( sin_angle>0) {
            cos_angle = sin(angle);
            sin_angle = -cos(angle);
        }
        else {
            cos_angle = -sin(angle);
            sin_angle = cos(angle);
        }
    }
    float res=0.0f;
    float axis_corr = axis_position + corrections[ tidy ];
    float axis = axis_position ;
    float xpix = ( bidx*16+tidx )-cpu_offset_x;
    // Starting position of the ray along the line direction.
    float posx = axis*(1.0f-sin_angle/cos_angle ) +(xpix-(axis_corr) )/cos_angle ;

    float shiftJ = sin_angle/cos_angle; // per-step drift along the line direction
    float x1 = min(-sin_angle/cos_angle ,0.f);
    float x2 = max(-sin_angle/cos_angle ,0.f);

    // Ray-length weighting factor (1/cos of the folded angle).
    float Area;
    Area=1.0f/cos_angle;
    int stlA, stlB , stlAJ, stlBJ ;
    stlA=strideLine[16+tidy];
    stlB=strideLine[tidy];
    stlAJ=strideJoseph[16+tidy];
    stlBJ=strideJoseph[tidy];

    int beginA = beginPos[16+tidy ];
    int beginB = beginPos[tidy ];
    float add;
    int l;

    if(josephnoclip) {
        for(int j=0; j<dimslice; j++) { // j: Joseph
            x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f;
            x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f;
            // Hardware bilinear interpolation via the LINEAR sampler.
            add = read_imagef(d_slice, sampler, (float2) (x1, x2)).x;
            res += add;
            posx += shiftJ;
        }
    }
    else {
        for(int j=0; j<dimslice; j++) { // j: Joseph
            x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f;
            x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f;
            // l is 1 iff (x1, x2) falls inside the padded slice, 0 otherwise;
            // used to zero out samples that fall outside the slice.
            l = (x1>=0.0f )*(x1<(dimslice+2))*( x2>=0.0f)*( x2<(dimslice+2) ) ;
            add = read_imagef(d_slice, sampler, (float2) (x1, x2)).x;
            res += add*l;
            posx += shiftJ;
        }
    }

    // Write only sinogram entries that exist (grid may be oversized).
    if((bidy*16 + tidy) < num_projections && (bidx*16 + tidx) < num_bins) {
        res *= Area;
        if (normalize)
            res *= M_PI_F * 0.5f / num_projections;
        d_Sino[dimrecx*(bidy*16 + tidy) + (bidx*16 + tidx)] = res;
    }
}
#endif


/*******************************************************************************/
/********************* CPU VERSION (without textures) **************************/
/*******************************************************************************/

// Same Joseph forward projector, but the slice is a plain buffer and the
// bilinear interpolation is done by hand (for devices without image support).
kernel void forward_kernel_cpu(
    global float *d_Sino,
    global float* d_slice,             // input slice as a flat buffer, pitch dimslice+2
    int dimslice,
    int num_bins,
    global float* angles_per_project ,
    float axis_position,
    global float *d_axis_corrections,
    global int* d_beginPos ,
    global int* d_strideJoseph,
    global int* d_strideLine ,
    int num_projections,
    int dimrecx,
    int dimrecy,
    float cpu_offset_x,
    float cpu_offset_y,                // NOTE(review): unused in this kernel as well
    int josephnoclip,
    int normalize)
{

    const int tidx = get_local_id(0);
    const int bidx = get_group_id(0);
    const int tidy = get_local_id(1);
    const int bidy = get_group_id(1);
    float angle;
    float cos_angle,sin_angle ;

    local float corrections[16];
    local int beginPos[16*2];
    local int strideJoseph[16*2];
    local int strideLine[16*2];

    // Cooperative load of the per-projection tables into local memory
    // (same layout as in forward_kernel).
    int offset, OFFSET;
    switch(tidy) {
        case 0:
            corrections[ tidx ]= d_axis_corrections[ bidy*16+tidx];
            break;
        case 1:
        case 2:
            offset = 16*(tidy-1);
            OFFSET = dimrecy*(tidy-1);
            beginPos [offset + tidx ]= d_beginPos[ OFFSET+ bidy*16+tidx] ;
            break;
        case 3:
        case 4:
            offset = 16*(tidy-3);
            OFFSET = dimrecy*(tidy-3);
            strideJoseph[offset + tidx ]= d_strideJoseph[OFFSET + bidy*16+tidx] ;
            break;
        case 5:
        case 6:
            offset = 16*(tidy-5);
            OFFSET = dimrecy*(tidy-5);
            strideLine[offset + tidx ]= d_strideLine[OFFSET + bidy*16+tidx] ;
            break;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    angle = angles_per_project[ bidy*16+tidy ] ;
    cos_angle = cos(angle);
    sin_angle = sin(angle);

    // Octant folding, identical to the texture version.
    if(fabs(cos_angle) > 0.70710678f ) {
        if( cos_angle>0) {
            cos_angle = cos(angle);
            sin_angle = sin(angle);
        }
        else {
            cos_angle = -cos(angle);
            sin_angle = -sin(angle);
        }
    }
    else {
        if( sin_angle>0) {
            cos_angle = sin(angle);
            sin_angle = -cos(angle);
        }
        else {
            cos_angle = -sin(angle);
            sin_angle = cos(angle);
        }
    }
    float res=0.0f;
    float axis_corr = axis_position + corrections[ tidy ];
    float axis = axis_position ;
    float xpix = ( bidx*16+tidx )-cpu_offset_x;
    float posx = axis*(1.0f-sin_angle/cos_angle ) +(xpix-(axis_corr) )/cos_angle ;

    float shiftJ = sin_angle/cos_angle;
    float x1 = min(-sin_angle/cos_angle ,0.f);
    float x2 = max(-sin_angle/cos_angle ,0.f);

    float Area;
    Area=1.0f/cos_angle;
    int stlA, stlB , stlAJ, stlBJ ;
    stlA=strideLine[16+tidy];
    stlB=strideLine[tidy];
    stlAJ=strideJoseph[16+tidy];
    stlBJ=strideJoseph[tidy];

    int beginA = beginPos[16+tidy ];
    int beginB = beginPos[tidy ];
    int l;

    int ym, yp, xm, xp;
    float yc, xc;
    float val;
    if(josephnoclip) {
        for(int j=0; j<dimslice; j++) { // j: Joseph
            // NOTE(review): the offset here is +1.0f while the texture version
            // uses +1.5f (the sampler adds an implicit half-pixel) — confirm.
            x1 = beginA +(posx)*stlA + (j)*stlAJ+1.0f;
            x2 = beginB +(posx)*stlB + (j)*stlBJ+1.0f;
            /*
                Bilinear interpolation, clamped to the padded slice bounds
                (emulates CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR).
            */
            yc = fmin(fmax(x2, 0.0f), ((dimslice+2) - 1.0f)); // y_clipped
            ym = (int) floor(yc); // y_minus
            yp = (int) ceil(yc);  // y_plus

            xc = fmin(fmax(x1, 0.0f), ((dimslice+2) - 1.0f)); // x_clipped
            xm = (int) floor(xc); // x_minus
            xp = (int) ceil(xc);  // x_plus

            // Degenerate cases first (exact grid hit along one or both axes),
            // then the general 4-point bilinear blend.
            if ((ym == yp) && (xm == xp)) val = d_slice[ym*(dimslice+2) + xm];
            else if (ym == yp) val = (d_slice[ym*(dimslice+2) + xm] * (xp - xc)) + (d_slice[ym*(dimslice+2) + xp] * (xc - xm));
            else if (xm == xp) val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc)) + (d_slice[yp*(dimslice+2) + xm] * (yc - ym));
            else val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc) * (xp - xc))
                     + (d_slice[yp*(dimslice+2) + xm] * (yc - ym) * (xp - xc))
                     + (d_slice[ym*(dimslice+2) + xp] * (yp - yc) * (xc - xm))
                     + (d_slice[yp*(dimslice+2) + xp] * (yc - ym) * (xc - xm));
            // ----------
            res += val;
            posx += shiftJ;
        }
    }
    else {
        for(int j=0; j<dimslice; j++) { // j: Joseph
            x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f;
            x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f;
            // Inside-slice mask, as in the texture version.
            l = (x1>=0.0f )*(x1<(dimslice+2))*( x2>=0.0f)*( x2<(dimslice+2) ) ;
            /*
                Bilinear interpolation (same as above).
            */
            yc = fmin(fmax(x2, 0.0f), ((dimslice+2) - 1.0f)); // y_clipped
            ym = (int) floor(yc); // y_minus
            yp = (int) ceil(yc);  // y_plus

            xc = fmin(fmax(x1, 0.0f), ((dimslice+2) - 1.0f)); // x_clipped
            xm = (int) floor(xc); // x_minus
            xp = (int) ceil(xc);  // x_plus

            if ((ym == yp) && (xm == xp)) val = d_slice[ym*(dimslice+2) + xm];
            else if (ym == yp) val = (d_slice[ym*(dimslice+2) + xm] * (xp - xc)) + (d_slice[ym*(dimslice+2) + xp] * (xc - xm));
            else if (xm == xp) val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc)) + (d_slice[yp*(dimslice+2) + xm] * (yc - ym));
            else val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc) * (xp - xc))
                     + (d_slice[yp*(dimslice+2) + xm] * (yc - ym) * (xp - xc))
                     + (d_slice[ym*(dimslice+2) + xp] * (yp - yc) * (xc - xm))
                     + (d_slice[yp*(dimslice+2) + xp] * (yc - ym) * (xc - xm));
            // ----------
            res += val*l;
            posx += shiftJ;
        }
    }

    if((bidy*16 + tidy) < num_projections && (bidx*16 + tidx) < num_bins) {
        res *= Area;
        if (normalize)
            res *= M_PI_F * 0.5f / num_projections;
        d_Sino[dimrecx*(bidy*16 + tidy) + (bidx*16 + tidx)] = res;
    }
}

/* ========================= sparse.cl ========================= */
/*
 * Copyright (C) 2016-2019 European Synchrotron Radiation Facility
 * Grenoble, France
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#ifndef IMAGE_WIDTH
    #error "Please define IMAGE_WIDTH parameter"
#endif

#ifndef DTYPE
    #error "Please define DTYPE parameter"
#endif

#ifndef IDX_DTYPE
    #error "Please define IDX_DTYPE parameter"
#endif



/**
 * Densify a matrix from CSR format to "dense" 2D format.
 * The input CSR data consists in 3 arrays: (data, ind, iptr).
 * The output array is a 2D array of dimensions IMAGE_WIDTH * image_height.
 *
 * data: 1D array containing the nonzero data items.
 * ind: 1D array containing the column indices of the nonzero data items.
 * iptr: 1D array containing indirection indices, such that range
 *       [iptr[i], iptr[i+1]-1] of "data" and "ind" contain the relevant data
 *       of output row "i".
 * output: 2D array containing the densified data.
 * image_height: height (number of rows) of the output data.
 *
 * One work-group handles one output row; work-items stride over the columns.
**/

kernel void densify_csr(
    const global DTYPE* data,
    const global IDX_DTYPE* ind,
    const global IDX_DTYPE* iptr,
    global DTYPE* output,
    int image_height
)
{
    uint tid = get_local_id(0);
    uint row_idx = get_global_id(1);

    // FIX: do not return early here. barrier() inside divergent control flow
    // is undefined behavior in OpenCL — every work-item of the work-group must
    // reach both barriers below. Guard the memory work instead.
    bool active = (tid < IMAGE_WIDTH) && (row_idx < (uint) image_height);

    local DTYPE line[IMAGE_WIDTH];

    // Memset: zero the shared line cooperatively.
    if (active) {
        for (uint k = tid; k < IMAGE_WIDTH; k += get_local_size(0)) {
            line[k] = (DTYPE) 0;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Scatter the nonzero entries of this row into the shared line:
    // write data[k] at column index ind[k] for k in [iptr[row], iptr[row+1]).
    if (active) {
        uint start = iptr[row_idx], end = iptr[row_idx + 1];
        for (uint k = start + tid; k < end; k += get_local_size(0)) {
            line[ind[k]] = data[k];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Write the completed line (local mem) into the output array (global mem).
    if (active) {
        for (uint k = tid; k < IMAGE_WIDTH; k += get_local_size(0)) {
            output[row_idx * IMAGE_WIDTH + k] = line[k];
        }
    }
}

/* ========================= statistics.cl =========================
 * Project: Silx statistics calculation
 *
 * Copyright (C) 2012-2021 European Synchrotron Radiation Facility
 * Grenoble, France
 *
 * Principal authors: J.
 * Kieffer (kieffer@esrf.fr)
 * Last revision: 17/05/2021
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/**
 * \file
 *
 * \brief OpenCL kernels for min, max, mean and std calculation
 *
 * This module provides two functions to perform the `map` and the `reduce`
 * steps used with the pyopencl reduction, computing in a single pass the
 * minimum, maximum, sum, count, mean and standard deviation of an array.
 *
 * Besides the reduction mechanism from pyopencl, this algorithm implements
 * equations from
 * https://dbs.ifi.uni-heidelberg.de/files/Team/eschubert/publications/SSDBM18-covariance-authorcopy.pdf
 *
 * Let A and B be two disjoint partitions of all elements:
 *
 *   Omega_A = sum_{i in A}(omega_i)  the sum of all weights
 *   V_A  is the weighted sum of the signal over the partition
 *   VV_A is the weighted sum of squared deviations
 *
 * With this, the mean is V / Omega and the variance equals VV / Omega.
 *
 * The reduction operator performs:
 *   Omega_AB = Omega_A + Omega_B
 *   V_AB  = V_A + V_B
 *   VV_AB = VV_A + VV_B + (Omega_A*V_B - Omega_B*V_A)^2 / (Omega_A * Omega_B * Omega_AB)
 *
 * To avoid numerical degradation, the double-word (compensated) arithmetic
 * library is used to perform all floating point operations.
 *
 */
#include "for_eclipse.h"

/* \brief read a value at the given position and initialize the float8 used by the reduction
 *
 * The returned float8 contains:
 *  s0: minimum value
 *  s1: maximum value
 *  s2: Omega_h  count of valid pixels (high word)
 *  s3: Omega_l  error associated to the count (low word)
 *  s4: V_h      sum of signal (high word)
 *  s5: V_l      error associated to the sum (low word)
 *  s6: VV_h     variance*count (high word)
 *  s7: VV_l     error associated to variance*count (low word)
 *
 * Non-finite values (NaN, +/-Inf) are skipped: they contribute a neutral
 * element (count 0, min FLT_MAX, max -FLT_MAX) to the reduction.
 */
static inline float8 map_statistics(global float* data, int position)
{
    float value = data[position];
    float8 result;

    if (isfinite(value))
    {
        result = (float8)(value, value, 1.0f, 0.0f, value, 0.0f, 0.0f, 0.0f);
        //                min    max   cnt_h  cnt_l  sum_h  sum_l  M2_h  M2_l
    }
    else
    {
        result = (float8)(FLT_MAX, -FLT_MAX, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        //                min      max       cnt_h cnt_l sum_h sum_l M2_h  M2_l
    }
    return result;
}

/* \brief reduction function associated to the statistics.
 *
 * This is described in:
 * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
 *
 * The float8 operands contain:
 *  s0: minimum value
 *  s1: maximum value
 *  s2: count of valid pixels (high word)
 *  s3: count (error associated to it, low word)
 *  s4: sum of valid pixels (high word)
 *  s5: sum (error associated to it, low word)
 *  s6: M2 = variance*count (high word)
 *  s7: M2 (error associated to it, low word)
 *
 * All arithmetic uses double-word (compensated) operations dw_plus_dw,
 * dw_times_dw and dw_div_dw to limit round-off accumulation.
 */

static inline float8 reduce_statistics(float8 a, float8 b)
{
    float2 sum_a, sum_b, M_a, M_b, count_a, count_b;
    float2 count, sum;
    float2 M, delta, delta2, omega3;
    // Empty partition a: the result is b unchanged.
    if (a.s2 == 0.0f)
    {
        return b;
    }
    else
    {
        count_a = (float2)(a.s2, a.s3);
        sum_a = (float2)(a.s4, a.s5);
        M_a = (float2)(a.s6, a.s7);
    }
    // Empty partition b: the result is a unchanged.
    if (b.s2 == 0.0f)
    {
        return a;
    }
    else
    {
        count_b = (float2)(b.s2, b.s3);
        sum_b = (float2)(b.s4, b.s5);
        M_b = (float2)(b.s6, b.s7);
    }
    // count = count_a + count_b
    count = dw_plus_dw(count_a, count_b);
    // sum = sum_a + sum_b
    sum = dw_plus_dw(sum_a, sum_b);

    // M2 = M_a + M_b + (Omega_A*V_B - Omega_B*V_A)^2 / (Omega_A * Omega_B * Omega_AB)
    // Note: delta below is the NEGATIVE of the documented numerator
    // (count_b*sum_a - count_a*sum_b); this is harmless since it is squared.
    M = dw_plus_dw(M_a, M_b);
    delta = dw_plus_dw(dw_times_dw(count_b, sum_a),
                      -dw_times_dw(count_a, sum_b));
    delta2 = dw_times_dw(delta, delta);
    omega3 = dw_times_dw(count, dw_times_dw(count_a, count_b));
    M = dw_plus_dw(M, dw_div_dw(delta2, omega3));

    float8 result = (float8)(min(a.s0, b.s0), max(a.s1, b.s1),
                             count.s0, count.s1,
                             sum.s0, sum.s1,
                             M.s0, M.s1);
    return result;
}

/* \brief reduction function associated to the statistics without compensated arithmetic.
+ * + * this is described in: + * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm + * + * The float8 used here contain contains: + * s0: minimum value + * s1: maximum value + * s2: count number of valid pixels (major) + * s3: count number of valid pixels (minor) + * s4: sum of valid pixels (major) + * s5: sum of valid pixels (minor) + * s6: variance*count (major) + * s7: variance*count (minor) + * + */ + +static inline float8 reduce_statistics_simple(float8 a, float8 b) +{ + float sum_a, sum_b, M_a, M_b, count_a, count_b; + + //test on count + if (a.s2 == 0.0f) + { + return b; + } + else + { + count_a = a.s2; + sum_a = a.s4; + M_a = a.s6; + } + //test on count + if (b.s2 == 0.0f) + { + return a; + } + else + { + count_b = b.s2; + sum_b = b.s4; + M_b = b.s6; + } + float count = count_a + count_b; + float sum = sum_a + sum_b; + float delta = sum_a*count_b - sum_b*count_a; + float delta2 = delta * delta; + float M2 = M_a + M_b + delta2/(count*count_a*count_b); + float8 result = (float8)(min(a.s0, b.s0), max(a.s1, b.s1), + count, 0.0f, + sum, 0.0f, + M2, 0.0f); + return result; +} + + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + + +// unpack a double in two floats such as the sum of the two is the double number +static inline float2 unpack_double(double inp){ + float major = (float) inp; + float minor = (float) (inp - major); + return (float2)(major, minor); +} + +// pack two floats into a double +static inline double pack_double(float major, float minor){ + return (double)major + (double)minor; +} + +/* \brief reduction function associated to the statistics using double precision arithmetics + * + * this is described in: + * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm + * + * The float8 used here contain contains: + * s0: minimum value + * s1: maximum value + * s2: count number of valid pixels (major) + * s3: count number of valid pixels (minor) + * s4: sum of valid 
pixels (major) + * s5: sum of valid pixels (minor) + * s6: variance*count (major) + * s7: variance*count (minor) + * + */ + +static inline float8 reduce_statistics_double(float8 a, float8 b) +{ + double sum_a, sum_b, M_a, M_b, count_a, count_b; + + //test on count + if (a.s2 == 0.0) + { + return b; + } + else + { + count_a = pack_double(a.s2, a.s3); + sum_a = pack_double(a.s4,a.s5); + M_a = pack_double(a.s6, a.s7); + } + //test on count + if (b.s2 == 0.0) + { + return a; + } + else + { + count_b = pack_double(b.s2, b.s3); + sum_b = pack_double(b.s4, b.s5); + M_b = pack_double(b.s6, b.s7); + } + double count = count_a + count_b; + double sum = sum_a + sum_b; + double delta = sum_a*count_b - sum_b*count_a; + double delta2 = delta * delta; + double M2 = M_a + M_b + delta2/(count*count_a*count_b); + float8 result = (float8)((float2)(min(a.s0, b.s0), max(a.s1, b.s1)), + unpack_double(count), + unpack_double( sum), + unpack_double( M2)); + return result; +} + +#endif
\ No newline at end of file |