summaryrefslogtreecommitdiff
path: root/src/silx/resources/opencl
diff options
context:
space:
mode:
Diffstat (limited to 'src/silx/resources/opencl')
-rw-r--r--src/silx/resources/opencl/addition.cl42
-rw-r--r--src/silx/resources/opencl/array_utils.cl73
-rw-r--r--src/silx/resources/opencl/backproj.cl232
-rw-r--r--src/silx/resources/opencl/backproj_helper.cl68
-rw-r--r--src/silx/resources/opencl/bitonic.cl569
-rw-r--r--src/silx/resources/opencl/codec/byte_offset.cl235
-rw-r--r--src/silx/resources/opencl/convolution.cl312
-rw-r--r--src/silx/resources/opencl/convolution_textures.cl374
-rw-r--r--src/silx/resources/opencl/doubleword.cl115
-rw-r--r--src/silx/resources/opencl/image/cast.cl181
-rw-r--r--src/silx/resources/opencl/image/histogram.cl178
-rw-r--r--src/silx/resources/opencl/image/map.cl85
-rw-r--r--src/silx/resources/opencl/image/max_min.cl207
-rw-r--r--src/silx/resources/opencl/kahan.cl143
-rw-r--r--src/silx/resources/opencl/linalg.cl88
-rw-r--r--src/silx/resources/opencl/medfilt.cl141
-rw-r--r--src/silx/resources/opencl/preprocess.cl567
-rw-r--r--src/silx/resources/opencl/proj.cl345
-rw-r--r--src/silx/resources/opencl/sparse.cl94
-rw-r--r--src/silx/resources/opencl/statistics.cl283
20 files changed, 4332 insertions, 0 deletions
diff --git a/src/silx/resources/opencl/addition.cl b/src/silx/resources/opencl/addition.cl
new file mode 100644
index 0000000..35d7996
--- /dev/null
+++ b/src/silx/resources/opencl/addition.cl
@@ -0,0 +1,42 @@
+/*
+ * Project: SIFT: An algorithm for image alignment
+ *
+ * Copyright (C) 2013-2017 European Synchrotron Radiation Facility
+ * Grenoble, France
+ *
+ * Principal authors: J. Kieffer (kieffer@esrf.fr)
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+// Sanity-check ("hello world") kernel: element-wise vector addition.
+// Computes res[i] = a[i] + b[i] for every index strictly below N.
+kernel void addition(global float* a,
+                     global float* b,
+                     global float* res,
+                     int N)
+{
+    int idx = get_global_id(0);
+    if (idx >= N)
+        return;
+    res[idx] = a[idx] + b[idx];
+}
diff --git a/src/silx/resources/opencl/array_utils.cl b/src/silx/resources/opencl/array_utils.cl
new file mode 100644
index 0000000..6f78921
--- /dev/null
+++ b/src/silx/resources/opencl/array_utils.cl
@@ -0,0 +1,73 @@
+/**
+ * 2D Memcpy for float* arrays,
+ * replacing pyopencl "enqueue_copy" which does not work for rectangular copies.
+ * ALL THE SIZES/OFFSETS ARE SPECIFIED IN PIXELS, NOT IN BYTES.
+ * In the (x, y) convention, x is the fast index (as in CUDA).
+ *
+ * :param dst: destination array
+ * :param src: source array
+ * :param dst_width: width of the dst array
+ * :param src_width: width of the src array
+ * :param dst_offset: tuple with the offset (x, y) in the dst array
+ * :param src_offset: tuple with the offset (x, y) in the src array
+ * :param transfer_shape: shape of the transfer array in the form (x, y)
+ *
+ */
+kernel void cpy2d(
+    global float* dst,
+    global float* src,
+    int dst_width,
+    int src_width,
+    int2 dst_offset,
+    int2 src_offset,
+    int2 transfer_shape)
+{
+    // One work-item per copied pixel; items outside the transfer window
+    // simply return. x is the fast index (see doc above).
+    int col = get_global_id(0);
+    int row = get_global_id(1);
+    if ((col >= transfer_shape.x) || (row >= transfer_shape.y))
+        return;
+    int dst_idx = (dst_offset.y + row) * dst_width + (dst_offset.x + col);
+    int src_idx = (src_offset.y + row) * src_width + (src_offset.x + col);
+    dst[dst_idx] = src[src_idx];
+}
+
+
+// Looks like cfloat_t and cfloat_mul are not working, yet specified in
+// pyopencl documentation. Here we are using float2 as in all available examples
+// #include <pyopencl-complex.h>
+// typedef cfloat_t complex;
+
+// Product of two complex numbers packed as float2 (x = real, y = imaginary):
+// (a.x + i*a.y) * (b.x + i*b.y).
+static inline float2 complex_mul(float2 a, float2 b) {
+    return (float2) (a.x * b.x - a.y * b.y,
+                     a.x * b.y + a.y * b.x);
+}
+
+// arr2D *= arr1D, broadcast along the fast (horizontal) dimension:
+// every row of arr2D is multiplied element-wise by arr1D.
+kernel void inplace_complex_mul_2Dby1D(
+    global float2* arr2D,
+    global float2* arr1D,
+    int width,
+    int height)
+{
+    int col = get_global_id(0);
+    int row = get_global_id(1);
+    if ((col < width) && (row < height)) {
+        int idx = row * width + col;
+        arr2D[idx] = complex_mul(arr2D[idx], arr1D[col]);
+    }
+}
+
+
+// arr3D *= arr1D, broadcast along the fast dimension of a 3D stack:
+// every line of every slice is multiplied element-wise by arr1D.
+kernel void inplace_complex_mul_3Dby1D(
+    global float2* arr3D,
+    global float2* arr1D,
+    int width,
+    int height,
+    int depth)
+{
+    int col = get_global_id(0);
+    int row = get_global_id(1);
+    int slc = get_global_id(2);
+    if ((col < width) && (row < height) && (slc < depth)) {
+        int idx = (slc * height + row) * width + col;
+        arr3D[idx] = complex_mul(arr3D[idx], arr1D[col]);
+    }
+}
diff --git a/src/silx/resources/opencl/backproj.cl b/src/silx/resources/opencl/backproj.cl
new file mode 100644
index 0000000..da15131
--- /dev/null
+++ b/src/silx/resources/opencl/backproj.cl
@@ -0,0 +1,232 @@
+/*
+ * Project: silx: filtered backprojection
+ *
+ * Copyright (C) 2016-2017 European Synchrotron Radiation Facility
+ * Grenoble, France
+ *
+ * Principal authors: A. Mirone
+ * P. Paleo
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+/*******************************************************************************/
+/************************ GPU VERSION (with textures) **************************/
+/*******************************************************************************/
+
+#ifndef DONT_USE_TEXTURES
+/**
+ * Filtered-backprojection accumulation kernel (GPU version, texture-based).
+ *
+ * Each work-item computes 4 output pixels (a 2x2 tile) of the reconstructed
+ * slice: for every projection angle it projects the 4 pixel centres onto the
+ * detector and accumulates the hardware-interpolated sinogram values.
+ *
+ * NOTE(review): the constants used below (256-entry local caches,
+ * ip = tidy*16+tidx, 32-pixel tiles) assume a 16x16 work-group — confirm
+ * against the host code that enqueues this kernel.
+ * NOTE(review): the "shared2" argument is not used here; the kernel
+ * declares its own local arrays instead.
+ */
+kernel void backproj_kernel(
+    int num_proj,
+    int num_bins,
+    float axis_position,
+    global float *d_SLICE,
+    read_only image2d_t d_sino,
+    float gpu_offset_x,
+    float gpu_offset_y,
+    global float * d_cos_s, // precalculated cos(theta[i])
+    global float* d_sin_s, // precalculated sin(theta[i])
+    global float* d_axis_s, // array of axis positions (n_projs)
+    local float* shared2) // 768B of local mem
+{
+    // Linear filtering + clamp-to-edge: the sampler performs the
+    // interpolation that the CPU variant below does in software.
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
+    const int tidx = get_local_id(0); //threadIdx.x;
+    const int bidx = get_group_id(0); //blockIdx.x;
+    const int tidy = get_local_id(1); //threadIdx.y;
+    const int bidy = get_group_id(1); //blockIdx.y;
+
+    // Work-group-shared caches holding 256 projections' cos/sin/axis values.
+    local float sh_cos[256];
+    local float sh_sin[256];
+    local float sh_axis[256];
+
+    float pcos, psin;
+    float h0, h1, h2, h3;
+    const float apos_off_x= gpu_offset_x - axis_position ;
+    const float apos_off_y= gpu_offset_y - axis_position ;
+    float acorr05;
+    // Accumulators for the 4 output pixels handled by this work-item.
+    float res0 = 0, res1 = 0, res2 = 0, res3 = 0;
+
+    // Coordinates (relative to the rotation axis) of this item's first pixel.
+    const float bx00 = (32 * bidx + 2 * tidx + 0 + apos_off_x ) ;
+    const float by00 = (32 * bidy + 2 * tidy + 0 + apos_off_y ) ;
+
+    int read=0;
+    for(int proj=0; proj<num_proj; proj++) {
+        // Cooperatively refill the local caches every 256 projections.
+        if(proj>=read) {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            int ip = tidy*16+tidx;
+            if( read+ip < num_proj) {
+                sh_cos [ip] = d_cos_s[read+ip] ;
+                sh_sin [ip] = d_sin_s[read+ip] ;
+                sh_axis[ip] = d_axis_s[read+ip] ;
+            }
+            read=read+256; // 256=16*16 block size
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+        // "read" was already advanced past "proj", so 256-read+proj is the
+        // index of the current projection inside the local caches.
+        pcos = sh_cos[256-read + proj] ;
+        psin = sh_sin[256-read + proj] ;
+
+        acorr05 = sh_axis[256 - read + proj] ;
+
+        // Detector-bin abscissa of each of the 4 pixels for this angle.
+        h0 = (acorr05 + bx00*pcos - by00*psin);
+        h1 = (acorr05 + (bx00+0)*pcos - (by00+1)*psin);
+        h2 = (acorr05 + (bx00+1)*pcos - (by00+0)*psin);
+        h3 = (acorr05 + (bx00+1)*pcos - (by00+1)*psin);
+
+        // Accumulate only when the projected position lies on the detector.
+        if(h0>=0 && h0<num_bins) res0 += read_imagef(d_sino, sampler, (float2) (h0 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h0 +0.5f,proj +0.5f);
+        if(h1>=0 && h1<num_bins) res1 += read_imagef(d_sino, sampler, (float2) (h1 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h1 +0.5f,proj +0.5f);
+        if(h2>=0 && h2<num_bins) res2 += read_imagef(d_sino, sampler, (float2) (h2 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h2 +0.5f,proj +0.5f);
+        if(h3>=0 && h3<num_bins) res3 += read_imagef(d_sino, sampler, (float2) (h3 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h3 +0.5f,proj +0.5f);
+    }
+    // Scatter the 2x2 tile into the output slice (row pitch = 32 * #groups).
+    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 0] = res0;
+    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 0] = res1;
+    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 1] = res2;
+    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 1] = res3;
+}
+#endif
+
+
+
+
+/*******************************************************************************/
+/********************* CPU VERSION (without textures) **************************/
+/*******************************************************************************/
+
+
+// Clamp x into [0, N-1] using float arithmetic.
+#define CLIP_MAX(x, N) (fmin(fmax(x, 0.0f), (N - 1.0f)))
+
+// Set xm = floor(x) and xp = ceil(x); expects ints xm, xp in the caller's scope.
+#define FLOORCEIL_x(x) {\
+    xm = (int) floor(x);\
+    xp = (int) ceil(x);\
+}
+
+// Fetch the two horizontally adjacent samples (y, xm) and (y, xp) as a float2.
+#define ADJACENT_PIXELS_VALS(arr, Nx, y, xm, xp) ((float2) (arr[y*Nx+xm], arr[y*Nx+xp]))
+
+// Linear interpolation between two samples taken at the integer abscissae
+// xm = floor(x) and xp = ceil(x); vals packs (sample[xm], sample[xp]).
+// When x lands exactly on a sample (xm == xp) that sample is returned as-is.
+static float linear_interpolation(float2 vals,
+                                  float x,
+                                  int xm,
+                                  int xp)
+{
+    return (xm == xp) ? vals.s0
+                      : vals.s0 * (xp - x) + vals.s1 * (x - xm);
+}
+
+/**
+ * Same backprojection as backproj_kernel, but targeting devices without
+ * image/texture support: the bilinear fetch of the sinogram is done in
+ * software via CLIP_MAX / FLOORCEIL_x / linear_interpolation.
+ *
+ * NOTE(review): as in the GPU version, a 16x16 work-group is assumed
+ * (ip = tidy*16+tidx, 256-entry caches) — confirm against the host code.
+ * The "shared2" argument is unused; local arrays are declared inline.
+**/
+kernel void backproj_cpu_kernel(
+    int num_proj,
+    int num_bins,
+    float axis_position,
+    global float *d_SLICE,
+    global float* d_sino,
+    float gpu_offset_x,
+    float gpu_offset_y,
+    global float * d_cos_s, // precalculated cos(theta[i])
+    global float * d_sin_s, // precalculated sin(theta[i])
+    global float * d_axis_s, // array of axis positions (n_projs)
+    local float* shared2) // 768B of local mem
+{
+    const int tidx = get_local_id(0); //threadIdx.x;
+    const int bidx = get_group_id(0); //blockIdx.x;
+    const int tidy = get_local_id(1); //threadIdx.y;
+    const int bidy = get_group_id(1); //blockIdx.y;
+
+    // Work-group-shared caches holding 256 projections' cos/sin/axis values.
+    local float sh_cos[256];
+    local float sh_sin[256];
+    local float sh_axis[256];
+
+    float pcos, psin;
+    float h0, h1, h2, h3;
+    const float apos_off_x= gpu_offset_x - axis_position ;
+    const float apos_off_y= gpu_offset_y - axis_position ;
+    float acorr05;
+    // Accumulators for the 4 output pixels handled by this work-item.
+    float res0 = 0, res1 = 0, res2 = 0, res3 = 0;
+
+    // Coordinates (relative to the rotation axis) of this item's first pixel.
+    const float bx00 = (32 * bidx + 2 * tidx + 0 + apos_off_x ) ;
+    const float by00 = (32 * bidy + 2 * tidy + 0 + apos_off_y ) ;
+
+    int read=0;
+    for(int proj=0; proj<num_proj; proj++) {
+        // Cooperatively refill the local caches every 256 projections.
+        if(proj>=read) {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            int ip = tidy*16+tidx;
+            if( read+ip < num_proj) {
+                sh_cos [ip] = d_cos_s[read+ip] ;
+                sh_sin [ip] = d_sin_s[read+ip] ;
+                sh_axis[ip] = d_axis_s[read+ip] ;
+            }
+            read=read+256; // 256=16*16 block size
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+        // "read" was already advanced past "proj", so 256-read+proj is the
+        // index of the current projection inside the local caches.
+        pcos = sh_cos[256-read + proj] ;
+        psin = sh_sin[256-read + proj] ;
+
+        acorr05 = sh_axis[256 - read + proj] ;
+
+        // Detector-bin abscissa of each of the 4 pixels for this angle.
+        h0 = (acorr05 + bx00*pcos - by00*psin);
+        h1 = (acorr05 + (bx00+0)*pcos - (by00+1)*psin);
+        h2 = (acorr05 + (bx00+1)*pcos - (by00+0)*psin);
+        h3 = (acorr05 + (bx00+1)*pcos - (by00+1)*psin);
+
+
+        float x;
+        int ym, xm, xp;
+        ym = proj;
+        float2 vals;
+
+        // Software bilinear fetch: clamp, split into floor/ceil neighbours,
+        // then interpolate between the two adjacent sinogram bins.
+        if(h0>=0 && h0<num_bins) {
+            x = CLIP_MAX(h0, num_bins);
+            FLOORCEIL_x(x);
+            vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp);
+            res0 += linear_interpolation(vals, x, xm, xp);
+        }
+        if(h1>=0 && h1<num_bins) {
+            x = CLIP_MAX(h1, num_bins);
+            FLOORCEIL_x(x);
+            vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp);
+            res1 += linear_interpolation(vals, x, xm, xp);
+        }
+        if(h2>=0 && h2<num_bins) {
+            x = CLIP_MAX(h2, num_bins);
+            FLOORCEIL_x(x);
+            vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp);
+            res2 += linear_interpolation(vals, x, xm, xp);
+        }
+        if(h3>=0 && h3<num_bins) {
+            x = CLIP_MAX(h3, num_bins);
+            FLOORCEIL_x(x);
+            vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp);
+            res3 += linear_interpolation(vals, x, xm, xp);
+        }
+    }
+    // Scatter the 2x2 tile into the output slice (row pitch = 32 * #groups).
+    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 0] = res0;
+    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 0] = res1;
+    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 1] = res2;
+    d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 1] = res3;
+}
+
diff --git a/src/silx/resources/opencl/backproj_helper.cl b/src/silx/resources/opencl/backproj_helper.cl
new file mode 100644
index 0000000..b1590f8
--- /dev/null
+++ b/src/silx/resources/opencl/backproj_helper.cl
@@ -0,0 +1,68 @@
+/*
+ * Project: silx: backprojection helper functions
+ *
+ * Copyright (C) 2016-2017 European Synchrotron Radiation Facility
+ * Grenoble, France
+ *
+ * Principal authors: P. Paleo
+ * J. Kieffer (kieffer@esrf.fr)
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+// Multiply each line of the complex sinogram by the filter: both the real
+// and imaginary parts of d_sino are scaled by the real part (.x) of
+// d_filter[bin].
+kernel void mult( global float2* d_sino,
+                  global float2* d_filter,
+                  int num_bins,
+                  int num_projs)
+{
+    int bin = get_global_id(0);
+    int proj = get_global_id(1);
+    if (bin < num_bins && proj < num_projs)
+    {
+        int idx = proj * num_bins + bin;
+        float coef = d_filter[bin].x;
+        d_sino[idx].x *= coef;
+        d_sino[idx].y *= coef;
+    }
+}
+
+// Extract the real part of the (possibly padded) complex FFT output into a
+// compact real-valued sinogram: source rows are fft_size wide, destination
+// rows num_bins wide.
+kernel void cpy2d_c2r(
+    global float* d_sino,
+    global float2* d_sino_complex,
+    int num_bins,
+    int num_projs,
+    int fft_size)
+{
+    int bin = get_global_id(0);
+    int proj = get_global_id(1);
+    if ((bin >= num_bins) || (proj >= num_projs))
+        return;
+    d_sino[proj * num_bins + bin] = d_sino_complex[proj * fft_size + bin].x;
+}
+
+
+
+
+
+
+
diff --git a/src/silx/resources/opencl/bitonic.cl b/src/silx/resources/opencl/bitonic.cl
new file mode 100644
index 0000000..4096ce8
--- /dev/null
+++ b/src/silx/resources/opencl/bitonic.cl
@@ -0,0 +1,569 @@
+/*############################################################################
+# Sort elements within a vector by Matthew Scarpino,
+# Taken from his book "OpenCL in Action",
+# November 2011 ISBN 9781617290176
+# Original license for the code: "public domain"
+#
+# Originally this code is public domain. The MIT license has been added
+# by J. Kieffer (jerome.kieffer@esrf.eu) to provide a disclaimer.
+# J. Kieffer does not claim authorship of this code, which was developed by Matthew Scarpino.
+#
+# Copyright (c) 2011 Matthew Scarpino
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+#############################################################################*/
+
+// NOTE(review): these macros expect "comp", "temp", "input1"/"input2" and
+// the mask*/add* vectors to be declared in the enclosing scope; each
+// multi-line definition is terminated by the blank line that follows it.
+// In-register bitonic sort step ("OpenCL in Action" book formulation).
+#define VECTOR_SORT_BOOK(input, dir) \
+    comp = abs(input > shuffle(input, mask2)) ^ dir; \
+    input = shuffle(input, comp * 2 + add2); \
+    comp = abs(input > shuffle(input, mask1)) ^ dir; \
+    input = shuffle(input, comp + add1); \
+
+
+// Compare-and-swap of two float4 vectors ("book" formulation).
+#define VECTOR_SWAP_BOOK(in1, in2, dir) \
+    input1 = in1; input2 = in2; \
+    comp = (abs(input1 > input2) ^ dir) * 4 + add3; \
+    in1 = shuffle2(input1, input2, comp); \
+    in2 = shuffle2(input2, input1, comp); \
+
+
+// The _FILE variants correspond to the formula found in the "OpenCL in Action" supplementary files
+#define VECTOR_SORT_FILE(input, dir)\
+    comp = (input < shuffle(input, mask2)) ^ dir;\
+    input = shuffle(input, as_uint4(comp * 2 + add2));\
+    comp = (input < shuffle(input, mask1)) ^ dir;\
+    input = shuffle(input, as_uint4(comp + add1));\
+
+
+// Compare-and-swap of two float4 vectors ("file" formulation, int4 compares).
+#define VECTOR_SWAP_FILE(input1, input2, dir)\
+    temp = input1;\
+    comp = ((input1 < input2) ^ dir) * 4 + add3;\
+    input1 = shuffle2(input1, input2, as_uint4(comp));\
+    input2 = shuffle2(input2, temp, as_uint4(comp));\
+
+
+
+// Functions to be called from an actual kernel.
+
+/**
+ * Cooperative bitonic sort of one float8 per work-item ("file" variant).
+ *
+ * Each work-item first sorts its 8 values in registers, then the whole
+ * work-group merges through local memory (2 float4 slots per item in
+ * l_data). Even/odd items — and even/odd groups — sort in opposite
+ * directions, producing the bitonic sequences each merge stage needs.
+ *
+ * NOTE(review): local_size is presumably a power of two and l_data must
+ * hold 2 * local_size float4 — confirm against the calling kernels.
+ *
+ * :param local_id:   get_local_id() of the calling work-item
+ * :param group_id:   get_group_id() of the calling work-item
+ * :param local_size: get_local_size() of the work-group
+ * :param input:      the 8 values owned by this work-item
+ * :param l_data:     scratch local memory shared by the work-group
+ * :return: the 8 values this work-item owns after the group-wide sort
+ */
+static float8 my_sort_file(uint local_id, uint group_id, uint local_size,
+                           float8 input, local float4 *l_data)
+{
+    float4 input1, input2, temp;
+    float8 output;
+
+    int dir;
+    uint id, size, stride;
+    int4 comp;
+
+    uint4 mask1 = (uint4)(1, 0, 3, 2);
+    uint4 mask2 = (uint4)(2, 3, 0, 1);
+    uint4 mask3 = (uint4)(3, 2, 1, 0);
+
+    int4 add1 = (int4)(1, 1, 3, 3);
+    int4 add2 = (int4)(2, 3, 2, 3);
+    int4 add3 = (int4)(1, 2, 2, 3);
+
+    // retrieve input data
+    input1 = (float4)(input.s0, input.s1, input.s2, input.s3);
+    input2 = (float4)(input.s4, input.s5, input.s6, input.s7);
+
+    // Find global address
+    id = local_id * 2;
+
+    /* Sort input 1 - ascending */
+    comp = input1 < shuffle(input1, mask1);
+    input1 = shuffle(input1, as_uint4(comp + add1));
+    comp = input1 < shuffle(input1, mask2);
+    input1 = shuffle(input1, as_uint4(comp * 2 + add2));
+    comp = input1 < shuffle(input1, mask3);
+    input1 = shuffle(input1, as_uint4(comp + add3));
+
+    /* Sort input 2 - descending */
+    comp = input2 > shuffle(input2, mask1);
+    input2 = shuffle(input2, as_uint4(comp + add1));
+    comp = input2 > shuffle(input2, mask2);
+    input2 = shuffle(input2, as_uint4(comp * 2 + add2));
+    comp = input2 > shuffle(input2, mask3);
+    input2 = shuffle(input2, as_uint4(comp + add3));
+
+    /* Swap corresponding elements of input 1 and 2 */
+    add3 = (int4)(4, 5, 6, 7);
+    dir = - (int) (local_id % 2);  // 0 = ascending, -1 (all bits) = descending
+    temp = input1;
+    comp = ((input1 < input2) ^ dir) * 4 + add3;
+    input1 = shuffle2(input1, input2, as_uint4(comp));
+    input2 = shuffle2(input2, temp, as_uint4(comp));
+
+    /* Sort data and store in local memory */
+    VECTOR_SORT_FILE(input1, dir);
+    VECTOR_SORT_FILE(input2, dir);
+    l_data[id] = input1;
+    l_data[id+1] = input2;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    /* Create bitonic set */
+    for(size = 2; size < local_size; size <<= 1) {
+        dir = - (int) (local_id/size & 1) ;
+
+        for(stride = size; stride > 1; stride >>= 1) {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            id = local_id + (local_id/stride)*stride;
+            VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir)
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+        id = local_id * 2;
+        input1 = l_data[id];
+        input2 = l_data[id+1];
+        temp = input1;
+        comp = ((input1 < input2) ^ dir) * 4 + add3;
+        input1 = shuffle2(input1, input2, as_uint4(comp));
+        input2 = shuffle2(input2, temp, as_uint4(comp));
+        VECTOR_SORT_FILE(input1, dir);
+        VECTOR_SORT_FILE(input2, dir);
+        l_data[id] = input1;
+        l_data[id+1] = input2;
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    /* Perform bitonic merge */
+    dir = - (int) (group_id % 2);
+    for(stride = local_size; stride > 1; stride >>= 1)
+    {
+        barrier(CLK_LOCAL_MEM_FENCE);
+        id = local_id + (local_id/stride)*stride;
+        VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir)
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    /* Perform final sort */
+    id = local_id * 2;
+    input1 = l_data[id];
+    input2 = l_data[id+1];
+    temp = input1;
+    comp = ((input1 < input2) ^ dir) * 4 + add3;
+    input1 = shuffle2(input1, input2, as_uint4(comp));
+    input2 = shuffle2(input2, temp, as_uint4(comp));
+    VECTOR_SORT_FILE(input1, dir);
+    VECTOR_SORT_FILE(input2, dir);
+
+    // setup output and return it
+    output = (float8)(input1, input2);
+    return output;
+}
+
+/**
+ * Cooperative bitonic sort of one float8 per work-item ("book" variant
+ * from "OpenCL in Action": uint4 masks/adds and the *_BOOK macros).
+ * Same contract as my_sort_file — see that function for parameter details.
+ */
+static float8 my_sort_book(uint local_id, uint group_id, uint local_size,
+                           float8 input, local float4 *l_data)
+{
+    float4 input1, input2, temp;
+    float8 output;
+    uint4 comp, swap, mask1, mask2, add1, add2, add3;
+    uint id, dir, size, stride;
+    mask1 = (uint4)(1, 0, 3, 2);
+    swap = (uint4)(0, 0, 1, 1);
+    add1 = (uint4)(0, 0, 2, 2);
+    mask2 = (uint4)(2, 3, 0, 1);
+    add2 = (uint4)(0, 1, 0, 1);
+    add3 = (uint4)(0, 1, 2, 3);
+
+    // retrieve input data
+    input1 = (float4)(input.s0, input.s1, input.s2, input.s3);
+    input2 = (float4)(input.s4, input.s5, input.s6, input.s7);
+
+    // Find global address
+    id = local_id * 2;
+
+    //Sort first vector
+
+    comp = abs(input1 > shuffle(input1, mask1));
+    input1 = shuffle(input1, comp ^ swap + add1);
+    comp = abs(input1 > shuffle(input1, mask2));
+    input1 = shuffle(input1, comp * 2 + add2);
+    comp = abs(input1 > shuffle(input1, mask1));
+    input1 = shuffle(input1, comp + add1);
+
+    //Sort second vector
+    comp = abs(input2 < shuffle(input2, mask1));
+    input2 = shuffle(input2, comp ^ swap + add1);
+    comp = abs(input2 < shuffle(input2, mask2));
+    input2 = shuffle(input2, comp * 2 + add2);
+    comp = abs(input2 < shuffle(input2, mask1));
+    input2 = shuffle(input2, comp + add1);
+
+    // Swap elements: odd items sort descending to build bitonic pairs
+    dir = local_id % 2;
+    temp = input1;
+    comp = (abs(input1 > input2) ^ dir) * 4 + add3;
+    input1 = shuffle2(input1, input2, comp);
+    input2 = shuffle2(input2, temp, comp);
+    VECTOR_SORT_BOOK(input1, dir);
+    VECTOR_SORT_BOOK(input2, dir);
+    l_data[id] = input1;
+    l_data[id+1] = input2;
+
+    // Perform upper stages
+    for(size = 2; size < local_size; size <<= 1)
+    {
+        dir = local_id/size & 1;
+
+        //Perform lower stages
+        for(stride = size; stride > 1; stride >>= 1)
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            id = local_id + (local_id/stride)*stride;
+            VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir)
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        id = local_id * 2;
+        input1 = l_data[id];
+        input2 = l_data[id+1];
+        temp = input1;
+        comp = (abs(input1 > input2) ^ dir) * 4 + add3;
+        input1 = shuffle2(input1, input2, comp);
+        input2 = shuffle2(input2, temp, comp);
+        VECTOR_SORT_BOOK(input1, dir);
+        VECTOR_SORT_BOOK(input2, dir);
+        l_data[id] = input1;
+        l_data[id+1] = input2;
+    }
+    dir = group_id % 2;
+
+    // Perform bitonic merge
+    for(stride = local_size; stride > 1; stride >>= 1)
+    {
+        barrier(CLK_LOCAL_MEM_FENCE);
+        id = local_id + (local_id/stride)*stride;
+        VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir)
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    id = local_id * 2;
+    input1 = l_data[id]; input2 = l_data[id+1];
+    temp = input1;
+    comp = (abs(input1 > input2) ^ dir) * 4 + add3;
+    input1 = shuffle2(input1, input2, comp);
+    input2 = shuffle2(input2, temp, comp);
+    VECTOR_SORT_BOOK(input1, dir);
+    VECTOR_SORT_BOOK(input2, dir);
+
+    // setup output and return it
+    output = (float8)(input1, input2);
+    return output;
+}
+
+
+
+//////////////
+// Kernels
+//////////////
+
+// Sort the whole 1D array: each work-item handles one float8 (two
+// consecutive float4 elements of g_data).
+// dim0: global size = number_of_elements / 8
+kernel void bsort_all(global float4 *g_data,
+                      local float4 *l_data)
+{
+    uint lid = get_local_id(0);
+    // First of the two float4 slots owned by this work-item.
+    uint start = (get_group_id(0) * get_local_size(0) + lid) * 2;
+
+    float8 packed = (float8)(g_data[start], g_data[start + 1]);
+    float8 sorted = my_sort_file(lid, get_group_id(0), get_local_size(0),
+                                 packed, l_data);
+
+    g_data[start]     = (float4)(sorted.s0, sorted.s1, sorted.s2, sorted.s3);
+    g_data[start + 1] = (float4)(sorted.s4, sorted.s5, sorted.s6, sorted.s7);
+}
+
+
+// Sort each row of a 2D image independently.
+// dim0 = y: one work-group per row
+// dim1 = x: global size = row_length / 8
+kernel void bsort_horizontal(global float *g_data,
+                             local float4 *l_data)
+{
+    // 8 consecutive floats per work-item; rows are get_global_size(1)*8
+    // floats wide.
+    uint start = (get_global_size(1) * get_global_id(0)
+                  + get_global_id(1)) * 8;
+
+    float8 packed = (float8)(g_data[start    ], g_data[start + 1],
+                             g_data[start + 2], g_data[start + 3],
+                             g_data[start + 4], g_data[start + 5],
+                             g_data[start + 6], g_data[start + 7]);
+
+    float8 sorted = my_sort_file(get_local_id(1), get_group_id(1),
+                                 get_local_size(1), packed, l_data);
+
+    g_data[start    ] = sorted.s0;
+    g_data[start + 1] = sorted.s1;
+    g_data[start + 2] = sorted.s2;
+    g_data[start + 3] = sorted.s3;
+    g_data[start + 4] = sorted.s4;
+    g_data[start + 5] = sorted.s5;
+    g_data[start + 6] = sorted.s6;
+    g_data[start + 7] = sorted.s7;
+}
+
+
+// Sort along the vertical axis: each work-item handles 8 floats spaced by
+// one full image row (width = get_global_size(1)).
+// dim0 = y: global size = number_of_rows / 8
+// dim1 = x: one column per work-item
+// NOTE: transposing + bsort_horizontal might be more efficient — to check.
+kernel void bsort_vertical(global float *g_data,
+                           local float4 *l_data)
+{
+    uint row_pitch = get_global_size(1);    // one image row, in floats
+    uint start = get_global_id(0) * 8 * row_pitch + get_global_id(1);
+
+    float8 packed = (float8)(g_data[start                ],
+                             g_data[start +     row_pitch],
+                             g_data[start + 2 * row_pitch],
+                             g_data[start + 3 * row_pitch],
+                             g_data[start + 4 * row_pitch],
+                             g_data[start + 5 * row_pitch],
+                             g_data[start + 6 * row_pitch],
+                             g_data[start + 7 * row_pitch]);
+
+    float8 sorted = my_sort_file(get_local_id(0), get_group_id(0),
+                                 get_local_size(0), packed, l_data);
+
+    g_data[start                ] = sorted.s0;
+    g_data[start +     row_pitch] = sorted.s1;
+    g_data[start + 2 * row_pitch] = sorted.s2;
+    g_data[start + 3 * row_pitch] = sorted.s3;
+    g_data[start + 4 * row_pitch] = sorted.s4;
+    g_data[start + 5 * row_pitch] = sorted.s5;
+    g_data[start + 6 * row_pitch] = sorted.s6;
+    g_data[start + 7 * row_pitch] = sorted.s7;
+}
+
+
+// Tested working reference kernel from the book. This only works under Linux.
+// Self-contained bitonic sort of one work-group's span of g_data
+// (two float4 per work-item), "book" formulation.
+kernel void bsort_book(global float4 *g_data,
+                      local float4 *l_data) {
+   float4 input1, input2, temp;
+   uint4 comp, swap, mask1, mask2, add1, add2, add3;
+   uint id, dir, global_start, size, stride;
+   mask1 = (uint4)(1, 0, 3, 2);
+   swap = (uint4)(0, 0, 1, 1);
+   add1 = (uint4)(0, 0, 2, 2);
+   mask2 = (uint4)(2, 3, 0, 1);
+   add2 = (uint4)(0, 1, 0, 1);
+   add3 = (uint4)(0, 1, 2, 3);
+
+   // Find global address
+   id = get_local_id(0) * 2;
+   global_start = get_group_id(0) * get_local_size(0) * 2 + id;
+
+   //Sort first vector
+   input1 = g_data[global_start];
+   input2 = g_data[global_start+1];
+   comp = abs(input1 > shuffle(input1, mask1));
+   input1 = shuffle(input1, comp ^ swap + add1);
+   comp = abs(input1 > shuffle(input1, mask2));
+   input1 = shuffle(input1, comp * 2 + add2);
+   comp = abs(input1 > shuffle(input1, mask1));
+   input1 = shuffle(input1, comp + add1);
+
+   //Sort second vector
+   comp = abs(input2 < shuffle(input2, mask1));
+   input2 = shuffle(input2, comp ^ swap + add1);
+   comp = abs(input2 < shuffle(input2, mask2));
+   input2 = shuffle(input2, comp * 2 + add2);
+   comp = abs(input2 < shuffle(input2, mask1));
+   input2 = shuffle(input2, comp + add1);
+
+   // Swap elements: odd items sort descending to build bitonic pairs
+   dir = get_local_id(0) % 2;
+   temp = input1;
+   comp = (abs(input1 > input2) ^ dir) * 4 + add3;
+   input1 = shuffle2(input1, input2, comp);
+   input2 = shuffle2(input2, temp, comp);
+   VECTOR_SORT_BOOK(input1, dir);
+   VECTOR_SORT_BOOK(input2, dir);
+   l_data[id] = input1;
+   l_data[id+1] = input2;
+
+   // Perform upper stages
+   for(size = 2; size < get_local_size(0); size <<= 1) {
+      dir = get_local_id(0)/size & 1;
+
+      //Perform lower stages
+      for(stride = size; stride > 1; stride >>= 1) {
+         barrier(CLK_LOCAL_MEM_FENCE);
+         id = get_local_id(0) +
+            (get_local_id(0)/stride)*stride;
+         VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir)
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      id = get_local_id(0) * 2;
+      input1 = l_data[id];
+      input2 = l_data[id+1];
+      temp = input1;
+      comp = (abs(input1 > input2) ^ dir) * 4 + add3;
+      input1 = shuffle2(input1, input2, comp);
+      input2 = shuffle2(input2, temp, comp);
+      VECTOR_SORT_BOOK(input1, dir);
+      VECTOR_SORT_BOOK(input2, dir);
+      l_data[id] = input1;
+      l_data[id+1] = input2;
+   }
+   dir = get_group_id(0) % 2;
+   // Perform bitonic merge
+   for(stride = get_local_size(0); stride > 1; stride >>= 1) {
+      barrier(CLK_LOCAL_MEM_FENCE);
+      id = get_local_id(0) +
+         (get_local_id(0)/stride)*stride;
+      VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir)
+   }
+   barrier(CLK_LOCAL_MEM_FENCE);
+
+   id = get_local_id(0) * 2;
+   input1 = l_data[id]; input2 = l_data[id+1];
+   temp = input1;
+   comp = (abs(input1 > input2) ^ dir) * 4 + add3;
+   input1 = shuffle2(input1, input2, comp);
+   input2 = shuffle2(input2, temp, comp);
+   VECTOR_SORT_BOOK(input1, dir);
+   VECTOR_SORT_BOOK(input2, dir);
+   g_data[global_start] = input1;
+   g_data[global_start+1] = input2;
+  }
+
+// Tested working reference kernel from the additional (supplementary) files.
+// This one works under any operating system.
+/* Self-contained bitonic sort of one work-group's span of g_data
+   (two float4 per work-item), "file" formulation. */
+kernel void bsort_file(global float4 *g_data, local float4 *l_data) {
+
+   int dir;
+   uint id, global_start, size, stride;
+   float4 input1, input2, temp;
+   int4 comp;
+
+   uint4 mask1 = (uint4)(1, 0, 3, 2);
+   uint4 mask2 = (uint4)(2, 3, 0, 1);
+   uint4 mask3 = (uint4)(3, 2, 1, 0);
+
+   int4 add1 = (int4)(1, 1, 3, 3);
+   int4 add2 = (int4)(2, 3, 2, 3);
+   int4 add3 = (int4)(1, 2, 2, 3);
+
+   id = get_local_id(0) * 2;
+   global_start = get_group_id(0) * get_local_size(0) * 2 + id;
+
+   input1 = g_data[global_start];
+   input2 = g_data[global_start+1];
+
+   /* Sort input 1 - ascending */
+   comp = input1 < shuffle(input1, mask1);
+   input1 = shuffle(input1, as_uint4(comp + add1));
+   comp = input1 < shuffle(input1, mask2);
+   input1 = shuffle(input1, as_uint4(comp * 2 + add2));
+   comp = input1 < shuffle(input1, mask3);
+   input1 = shuffle(input1, as_uint4(comp + add3));
+
+   /* Sort input 2 - descending */
+   comp = input2 > shuffle(input2, mask1);
+   input2 = shuffle(input2, as_uint4(comp + add1));
+   comp = input2 > shuffle(input2, mask2);
+   input2 = shuffle(input2, as_uint4(comp * 2 + add2));
+   comp = input2 > shuffle(input2, mask3);
+   input2 = shuffle(input2, as_uint4(comp + add3));
+
+   /* Swap corresponding elements of input 1 and 2 */
+   add3 = (int4)(4, 5, 6, 7);
+   dir = - (int)(get_local_id(0) % 2);  // 0 = ascending, -1 = descending
+   temp = input1;
+   comp = ((input1 < input2) ^ dir) * 4 + add3;
+   input1 = shuffle2(input1, input2, as_uint4(comp));
+   input2 = shuffle2(input2, temp, as_uint4(comp));
+
+   /* Sort data and store in local memory */
+   VECTOR_SORT_FILE(input1, dir);
+   VECTOR_SORT_FILE(input2, dir);
+   l_data[id] = input1;
+   l_data[id+1] = input2;
+
+   /* Create bitonic set */
+   for(size = 2; size < get_local_size(0); size <<= 1) {
+      dir = - (int)(get_local_id(0)/size & 1);
+
+      for(stride = size; stride > 1; stride >>= 1) {
+         barrier(CLK_LOCAL_MEM_FENCE);
+         id = get_local_id(0) + (get_local_id(0)/stride)*stride;
+         VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir)
+      }
+
+      barrier(CLK_LOCAL_MEM_FENCE);
+      id = get_local_id(0) * 2;
+      input1 = l_data[id]; input2 = l_data[id+1];
+      temp = input1;
+      comp = ((input1 < input2) ^ dir) * 4 + add3;
+      input1 = shuffle2(input1, input2, as_uint4(comp));
+      input2 = shuffle2(input2, temp, as_uint4(comp));
+      VECTOR_SORT_FILE(input1, dir);
+      VECTOR_SORT_FILE(input2, dir);
+      l_data[id] = input1;
+      l_data[id+1] = input2;
+   }
+
+   /* Perform bitonic merge */
+   dir = - (int)(get_group_id(0) % 2);
+   for(stride = get_local_size(0); stride > 1; stride >>= 1) {
+      barrier(CLK_LOCAL_MEM_FENCE);
+      id = get_local_id(0) + (get_local_id(0)/stride)*stride;
+      VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir)
+   }
+   barrier(CLK_LOCAL_MEM_FENCE);
+
+   /* Perform final sort */
+   id = get_local_id(0) * 2;
+   input1 = l_data[id]; input2 = l_data[id+1];
+   temp = input1;
+   comp = ((input1 < input2) ^ dir) * 4 + add3;
+   input1 = shuffle2(input1, input2, as_uint4(comp));
+   input2 = shuffle2(input2, temp, as_uint4(comp));
+   VECTOR_SORT_FILE(input1, dir);
+   VECTOR_SORT_FILE(input2, dir);
+   g_data[global_start] = input1;
+   g_data[global_start+1] = input2;
+}
+
diff --git a/src/silx/resources/opencl/codec/byte_offset.cl b/src/silx/resources/opencl/codec/byte_offset.cl
new file mode 100644
index 0000000..56a24c4
--- /dev/null
+++ b/src/silx/resources/opencl/codec/byte_offset.cl
@@ -0,0 +1,235 @@
+/*
+ * Project: SILX: A data analysis tool-kit
+ *
+ * Copyright (C) 2017 European Synchrotron Radiation Facility
+ * Grenoble, France
+ *
+ * Principal authors: J. Kieffer (kieffer@esrf.fr)
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/* To decompress CBF byte-offset compressed in parallel on GPU one needs to:
+ * - Set all values in mask and exception counter to zero.
+ * - Mark regions with exceptions and set values without exception.
+ * This generates the values (zeros for exceptions), the exception mask,
+ * counts the number of exception region and provides a start position for
+ * each exception.
+ * - Treat exceptions. For this, one thread in a workgroup treats a complete
+ * masked region in a serial fashion. All regions are treated in parallel.
+ * Values written at this stage are marked in the mask with -1.
+ * - Double scan: inclusive cum sum for values, exclusive cum sum to generate
+ * indices in output array. Values with mask = 1 are considered as 0.
+ * - Compact and copy output by removing duplicated values in exceptions.
+ */
+
+// Pass 1 of CBF byte-offset decompression: one work-item per input byte.
+// Plain 8-bit deltas are copied to `values`; each -128 escape byte marks the
+// bytes of its exception zone in `mask`, atomically bumps the exception
+// counter `cnt` and records the zone start position in `exc`.
+// Indices in [size, full_size) are padding: masked, with value forced to 0.
+kernel void mark_exceptions(global char* raw,
+ int size,
+ int full_size,
+ global int* mask,
+ global int* values,
+ global int* cnt,
+ global int* exc)
+{
+ int gid;
+ gid = get_global_id(0);
+ if (gid<size)
+ {
+ int value, position;
+ value = raw[gid];
+ if (value == -128)
+ { // -128 is the escape byte: the actual value follows as >= 16 bits
+ int maxi;
+ values[gid] = 0;
+ // atomic_inc returns the previous value: a unique slot in `exc`
+ position = atomic_inc(cnt);
+ exc[position] = gid;
+ maxi = size - 1;
+ // Mask the escape byte plus the 2 bytes of the 16-bit payload;
+ // min() clamps so the writes stay inside the buffer.
+ mask[gid] = 1;
+ mask[min(maxi, gid+1)] = 1;
+ mask[min(maxi, gid+2)] = 1;
+
+ // Little-endian 16-bit payload equal to -32768 (0x8000) is itself an
+ // escape announcing a 32-bit value: mask the 4 extra payload bytes.
+ if (((int) raw[min(gid+1, maxi)] == 0) &&
+ ((int) raw[min(gid+2, maxi)] == -128))
+ {
+ mask[min(maxi, gid+3)] = 1;
+ mask[min(maxi, gid+4)] = 1;
+ mask[min(maxi, gid+5)] = 1;
+ mask[min(maxi, gid+6)] = 1;
+ }
+ }
+ else
+ { // plain 8-bit value: store as-is
+
+ values[gid] = value;
+ }
+ }
+ else if (gid<full_size)
+ {
+ mask[gid]=1;
+ values[gid] = 0;
+ }
+}
+
+// Run with workgroup size 1 and as many work-items as there are exceptions:
+// each work-item decodes one contiguous masked region serially.
+kernel void treat_exceptions(global char* raw, //raw compressed stream
+ int size, //size of the raw compressed stream
+ global int* mask, //tells if the value is masked
+ global int* exc, //array storing the position of the start of exception zones
+ global int* values)// stores decompressed values.
+{
+ int gid = get_global_id(0);
+ int inp_pos = exc[gid];
+ // Only the work-item whose exception starts a masked region decodes it:
+ // if the previous byte is also masked, a neighbour owns this region.
+ if ((inp_pos<=0) || ((int)mask[inp_pos - 1] == 0))
+ {
+ int value, is_masked, next_value, inc;
+ is_masked = (mask[inp_pos] != 0);
+ while ((is_masked) && (inp_pos<size))
+ {
+ value = (int) raw[inp_pos];
+ if (value == -128)
+ { // this corresponds to a 16-bit exception (little-endian payload)
+ uchar low_byte = raw[inp_pos+1];
+ char high_byte = raw[inp_pos+2] ;
+ next_value = high_byte<<8 | low_byte;
+ if (next_value == -32768)
+ { // this corresponds to a 32-bit exception (little-endian payload)
+ uchar low_byte1 = raw[inp_pos+3],
+ low_byte2 = raw[inp_pos+4],
+ low_byte3 = raw[inp_pos+5];
+ char high_byte4 = raw[inp_pos+6] ;
+ value = high_byte4<<24 | low_byte3<<16 | low_byte2<<8 | low_byte1;
+ inc = 7;
+ }
+ else
+ {
+ value = next_value;
+ inc = 3;
+ }
+ }
+ else
+ {
+ inc = 1;
+ }
+ values[inp_pos] = value;
+ mask[inp_pos] = -1; // mark the processed data as valid in the mask
+ inp_pos += inc;
+ // NOTE(review): this read happens before the `inp_pos<size` check, so
+ // mask[size] (or beyond) may be read at the end of the stream —
+ // presumably mask is padded to full_size by the host; confirm.
+ is_masked = (mask[inp_pos] != 0);
+ }
+ }
+}
+
+// copy the values of the elements to definitive position
+// `indexes` is the exclusive cumulative sum of the mask: duplicated bytes of
+// an exception share an index, and only the last one (current < next) wins.
+kernel void copy_result_int(global int* values,
+ global int* indexes,
+ int in_size,
+ int out_size,
+ global int* output
+ )
+{
+ int gid = get_global_id(0);
+ if (gid < in_size)
+ {
+ int current = max(indexes[gid], 0),
+ next = (gid >= (in_size - 1)) ? in_size + 1 : indexes[gid + 1];
+ //we keep always the last element
+ // NOTE(review): the bound uses `<= out_size` here but `< out_size` in
+ // copy_result_float; `<=` would allow a write at output[out_size] —
+ // confirm which is intended.
+ if ((current <= out_size) && (current < next))
+ {
+ output[current] = values[gid];
+ }
+ }
+}
+
+// copy the values of the elements to definitive position
+// Same compaction as copy_result_int, with a cast to float on the way out.
+kernel void copy_result_float(global int* values,
+ global int* indexes,
+ int in_size,
+ int out_size,
+ global float* output
+ )
+{
+ int gid = get_global_id(0);
+ if (gid<in_size)
+ {
+ int current = max(indexes[gid], 0),
+ next = (gid >= (in_size - 1)) ? in_size + 1 : indexes[gid + 1];
+ // keep only the last (compacted) element of each duplicated run
+ if ((current < out_size) && (current < next))
+ {
+ output[current] = (float) values[gid];
+ }
+ }
+}
+
+
+// combined memset for all arrays used for Byte Offset decompression
+// Zeroes raw/index/result over [0, full_size); mask is zeroed over the valid
+// range [0, actual_size) and set to 1 for the padding tail, matching the
+// padding convention of mark_exceptions.
+kernel void byte_offset_memset(global char* raw,
+ global int* mask,
+ global int* index,
+ global int* result,
+ int full_size,
+ int actual_size
+ )
+{
+ int gid = get_global_id(0);
+ if (gid < full_size)
+ {
+ raw[gid] = 0;
+ index[gid] = 0;
+ result[gid] = 0;
+ if (gid<actual_size)
+ {
+ mask[gid] = 0;
+ }
+ else
+ { // padding region: pre-masked
+ mask[gid] = 1;
+ }
+
+ }
+}
+
+
+//Simple memset kernel for char arrays
+// Fills ary[start_at, size) with `pattern`; indices below start_at untouched.
+kernel void fill_char_mem(global char* ary,
+ int size,
+ char pattern,
+ int start_at)
+{
+ int gid = get_global_id(0);
+ if ((gid >= start_at) && (gid < size))
+ {
+ ary[gid] = pattern;
+ }
+}
+
+//Simple memset kernel for int arrays
+// Fills ary[start_at, size) with `pattern`; indices below start_at untouched.
+kernel void fill_int_mem(global int* ary,
+ int size,
+ int pattern,
+ int start_at)
+{
+ int gid = get_global_id(0);
+ if ((gid >= start_at) && (gid < size))
+ {
+ ary[gid] = pattern;
+ }
+}
+
diff --git a/src/silx/resources/opencl/convolution.cl b/src/silx/resources/opencl/convolution.cl
new file mode 100644
index 0000000..629b8fc
--- /dev/null
+++ b/src/silx/resources/opencl/convolution.cl
@@ -0,0 +1,312 @@
+// NOTE(review): MAX_CONST_SIZE is not referenced by the kernels in this file;
+// presumably consumed by the host-side code generator — confirm.
+#define MAX_CONST_SIZE 16384
+
+/******************************************************************************/
+/**************************** Macros ******************************************/
+/******************************************************************************/
+
+// Get the center index of the filter,
+// and the "half-Left" and "half-Right" lengths.
+// In the case of an even-sized filter, the center is shifted to the left.
+// Expands in-place: requires `c`, `hL`, `hR` to be declared by the caller.
+// NOTE(review): the backslash after the closing brace continues the macro
+// onto the following (blank) line — harmless today, but fragile.
+#define GET_CENTER_HL(hlen){\
+ if (hlen & 1) {\
+ c = hlen/2;\
+ hL = c;\
+ hR = c;\
+ }\
+ else {\
+ c = hlen/2 - 1;\
+ hL = c;\
+ hR = c+1;\
+ }\
+}\
+
+// Boundary handling modes
+#define CONV_MODE_REFLECT 0 // cba|abcd|dcb
+#define CONV_MODE_NEAREST 1 // aaa|abcd|ddd
+#define CONV_MODE_WRAP 2 // bcd|abcd|abc
+#define CONV_MODE_CONSTANT 3 // 000|abcd|000
+#ifndef USED_CONV_MODE
+ #define USED_CONV_MODE CONV_MODE_NEAREST
+#endif
+
+// Each CONV_*_IDX_* macro declares and clamps/wraps an index variable
+// (idx_x / idx_y / idx_z) from the loop counter jx/jy/jz and center offset c.
+#define CONV_PERIODIC_IDX_X int idx_x = gidx - c + jx; if (idx_x < 0) idx_x += Nx; if (idx_x >= Nx) idx_x -= Nx;
+#define CONV_PERIODIC_IDX_Y int idx_y = gidy - c + jy; if (idx_y < 0) idx_y += Ny; if (idx_y >= Ny) idx_y -= Ny;
+#define CONV_PERIODIC_IDX_Z int idx_z = gidz - c + jz; if (idx_z < 0) idx_z += Nz; if (idx_z >= Nz) idx_z -= Nz;
+
+#define CONV_NEAREST_IDX_X int idx_x = clamp((int) (gidx - c + jx), 0, Nx-1);
+#define CONV_NEAREST_IDX_Y int idx_y = clamp((int) (gidy - c + jy), 0, Ny-1);
+#define CONV_NEAREST_IDX_Z int idx_z = clamp((int) (gidz - c + jz), 0, Nz-1);
+
+#define CONV_REFLECT_IDX_X int idx_x = gidx - c + jx; if (idx_x < 0) idx_x = -idx_x-1; if (idx_x >= Nx) idx_x = Nx-(idx_x-(Nx-1));
+#define CONV_REFLECT_IDX_Y int idx_y = gidy - c + jy; if (idx_y < 0) idx_y = -idx_y-1; if (idx_y >= Ny) idx_y = Ny-(idx_y-(Ny-1));
+#define CONV_REFLECT_IDX_Z int idx_z = gidz - c + jz; if (idx_z < 0) idx_z = -idx_z-1; if (idx_z >= Nz) idx_z = Nz-(idx_z-(Nz-1));
+
+
+// Select the index macros once, according to the compile-time boundary mode.
+#if USED_CONV_MODE == CONV_MODE_REFLECT
+ #define CONV_IDX_X CONV_REFLECT_IDX_X
+ #define CONV_IDX_Y CONV_REFLECT_IDX_Y
+ #define CONV_IDX_Z CONV_REFLECT_IDX_Z
+#elif USED_CONV_MODE == CONV_MODE_NEAREST
+ #define CONV_IDX_X CONV_NEAREST_IDX_X
+ #define CONV_IDX_Y CONV_NEAREST_IDX_Y
+ #define CONV_IDX_Z CONV_NEAREST_IDX_Z
+#elif USED_CONV_MODE == CONV_MODE_WRAP
+ #define CONV_IDX_X CONV_PERIODIC_IDX_X
+ #define CONV_IDX_Y CONV_PERIODIC_IDX_Y
+ #define CONV_IDX_Z CONV_PERIODIC_IDX_Z
+#elif USED_CONV_MODE == CONV_MODE_CONSTANT
+ #error "constant not implemented yet"
+#else
+ #error "Unknown convolution mode"
+#endif
+
+
+
+// Image access patterns: the volume is stored C-contiguous as [z][y][x].
+#define READ_IMAGE_1D_X input[(gidz*Ny + gidy)*Nx + idx_x]
+#define READ_IMAGE_1D_Y input[(gidz*Ny + idx_y)*Nx + gidx]
+#define READ_IMAGE_1D_Z input[(idx_z*Ny + gidy)*Nx + gidx]
+
+#define READ_IMAGE_2D_XY input[(gidz*Ny + idx_y)*Nx + idx_x]
+#define READ_IMAGE_2D_XZ input[(idx_z*Ny + gidy)*Nx + idx_x]
+#define READ_IMAGE_2D_YZ input[(idx_z*Ny + idx_y)*Nx + gidx]
+
+#define READ_IMAGE_3D_XYZ input[(idx_z*Ny + idx_y)*Nx + idx_x]
+
+
+
+/******************************************************************************/
+/**************************** 1D Convolution **********************************/
+/******************************************************************************/
+
+
+// Convolution with 1D kernel along axis "X" (fast dimension)
+// Works for batched 1D on 2D and batched 2D on 3D, along axis "X".
+// True convolution (filter indexed as filter[L-1 - jx], i.e. reversed),
+// not cross-correlation.
+__kernel void convol_1D_X(
+ const __global float * input,
+ __global float * output,
+ __global float * filter,
+ int L, // filter size
+ int Nx, // input/output number of columns
+ int Ny, // input/output number of rows
+ int Nz // input/output depth
+)
+{
+ uint gidx = get_global_id(0);
+ uint gidy = get_global_id(1);
+ uint gidz = get_global_id(2);
+ // out-of-range work-items (grid padding) exit early
+ if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+ int c, hL, hR;
+ GET_CENTER_HL(L);
+ float sum = 0.0f;
+
+ // hL + hR + 1 == L taps in total
+ for (int jx = 0; jx <= hR+hL; jx++) {
+ CONV_IDX_X; // Get index "x"
+ sum += READ_IMAGE_1D_X * filter[L-1 - jx];
+ }
+ output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+
+
+// Convolution with 1D kernel along axis "Y"
+// Works for batched 1D on 2D and batched 2D on 3D, along axis "Y".
+// Same structure as convol_1D_X, with the sliding window along rows.
+__kernel void convol_1D_Y(
+ const __global float * input,
+ __global float * output,
+ __global float * filter,
+ int L, // filter size
+ int Nx, // input/output number of columns
+ int Ny, // input/output number of rows
+ int Nz // input/output depth
+)
+{
+ uint gidx = get_global_id(0);
+ uint gidy = get_global_id(1);
+ uint gidz = get_global_id(2);
+ if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+ int c, hL, hR;
+ GET_CENTER_HL(L);
+ float sum = 0.0f;
+
+ for (int jy = 0; jy <= hR+hL; jy++) {
+ CONV_IDX_Y; // Get index "y"
+ sum += READ_IMAGE_1D_Y * filter[L-1 - jy];
+ }
+ output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+
+
+// Convolution with 1D kernel along axis "Z"
+// Works for batched 1D on 2D and batched 2D on 3D, along axis "Z".
+// Same structure as convol_1D_X, with the sliding window along the depth.
+__kernel void convol_1D_Z(
+ const __global float * input,
+ __global float * output,
+ __global float * filter,
+ int L, // filter size
+ int Nx, // input/output number of columns
+ int Ny, // input/output number of rows
+ int Nz // input/output depth
+)
+{
+ uint gidx = get_global_id(0);
+ uint gidy = get_global_id(1);
+ uint gidz = get_global_id(2);
+ if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+ int c, hL, hR;
+ GET_CENTER_HL(L);
+ float sum = 0.0f;
+
+ for (int jz = 0; jz <= hR+hL; jz++) {
+ CONV_IDX_Z; // Get index "z"
+ sum += READ_IMAGE_1D_Z * filter[L-1 - jz];
+ }
+ output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+
+
+/******************************************************************************/
+/**************************** 2D Convolution **********************************/
+/******************************************************************************/
+
+// Convolution with 2D kernel
+// Works for batched 2D on 3D.
+// NOTE(review): the center offset c comes from GET_CENTER_HL(Lx) and is used
+// for both axes, and both loops run over hR+hL taps — this assumes Lx == Ly
+// (square filter); confirm the host enforces it.
+__kernel void convol_2D_XY(
+ const __global float * input,
+ __global float * output,
+ __global float * filter,
+ int Lx, // filter number of columns,
+ int Ly, // filter number of rows,
+ int Nx, // input/output number of columns
+ int Ny, // input/output number of rows
+ int Nz // input/output depth
+)
+{
+ uint gidx = get_global_id(0);
+ uint gidy = get_global_id(1);
+ uint gidz = get_global_id(2);
+ if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+ int c, hL, hR;
+ GET_CENTER_HL(Lx);
+ float sum = 0.0f;
+
+ for (int jy = 0; jy <= hR+hL; jy++) {
+ CONV_IDX_Y; // Get index "y"
+ for (int jx = 0; jx <= hR+hL; jx++) {
+ CONV_IDX_X; // Get index "x"
+ sum += READ_IMAGE_2D_XY * filter[(Ly-1-jy)*Lx + (Lx-1 - jx)];
+ }
+ }
+ output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+
+
+// Convolution with 2D kernel
+// Works for batched 2D on 3D.
+// NOTE(review): as in convol_2D_XY, c is derived from Lx only, so Lx == Lz
+// (square filter) is assumed.
+__kernel void convol_2D_XZ(
+ const __global float * input,
+ __global float * output,
+ __global float * filter,
+ int Lx, // filter number of columns,
+ int Lz, // filter number of rows,
+ int Nx, // input/output number of columns
+ int Ny, // input/output number of rows
+ int Nz // input/output depth
+)
+{
+ uint gidx = get_global_id(0);
+ uint gidy = get_global_id(1);
+ uint gidz = get_global_id(2);
+ if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+ int c, hL, hR;
+ GET_CENTER_HL(Lx);
+ float sum = 0.0f;
+
+ for (int jz = 0; jz <= hR+hL; jz++) {
+ CONV_IDX_Z; // Get index "z"
+ for (int jx = 0; jx <= hR+hL; jx++) {
+ CONV_IDX_X; // Get index "x"
+ sum += READ_IMAGE_2D_XZ * filter[(Lz-1-jz)*Lx + (Lx-1 - jx)];
+ }
+ }
+ output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+
+
+// Convolution with 2D kernel
+// Works for batched 2D on 3D.
+// NOTE(review): c is derived from Ly only, so Ly == Lz (square filter) is
+// assumed.
+__kernel void convol_2D_YZ(
+ const __global float * input,
+ __global float * output,
+ __global float * filter,
+ int Ly, // filter number of columns,
+ int Lz, // filter number of rows,
+ int Nx, // input/output number of columns
+ int Ny, // input/output number of rows
+ int Nz // input/output depth
+)
+{
+ uint gidx = get_global_id(0);
+ uint gidy = get_global_id(1);
+ uint gidz = get_global_id(2);
+ if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+ int c, hL, hR;
+ GET_CENTER_HL(Ly);
+ float sum = 0.0f;
+
+ for (int jz = 0; jz <= hR+hL; jz++) {
+ CONV_IDX_Z; // Get index "z"
+ for (int jy = 0; jy <= hR+hL; jy++) {
+ CONV_IDX_Y; // Get index "y"
+ sum += READ_IMAGE_2D_YZ * filter[(Lz-1-jz)*Ly + (Ly-1 - jy)];
+ }
+ }
+ output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+
+
+
+/******************************************************************************/
+/**************************** 3D Convolution **********************************/
+/******************************************************************************/
+
+// Convolution with 3D kernel
+// NOTE(review): c is derived from Lx only and all three loops use hR+hL, so
+// Lx == Ly == Lz (cubic filter) is assumed.
+__kernel void convol_3D_XYZ(
+ const __global float * input,
+ __global float * output,
+ __global float * filter,
+ int Lx, // filter number of columns,
+ int Ly, // filter number of rows,
+ int Lz, // filter number of rows,
+ int Nx, // input/output number of columns
+ int Ny, // input/output number of rows
+ int Nz // input/output depth
+)
+{
+ uint gidx = get_global_id(0);
+ uint gidy = get_global_id(1);
+ uint gidz = get_global_id(2);
+ if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+ int c, hL, hR;
+ GET_CENTER_HL(Lx);
+ float sum = 0.0f;
+
+ for (int jz = 0; jz <= hR+hL; jz++) {
+ CONV_IDX_Z; // Get index "z"
+ for (int jy = 0; jy <= hR+hL; jy++) {
+ CONV_IDX_Y; // Get index "y"
+ for (int jx = 0; jx <= hR+hL; jx++) {
+ CONV_IDX_X; // Get index "x"
+ sum += READ_IMAGE_3D_XYZ * filter[((Lz-1-jz)*Ly + (Ly-1-jy))*Lx + (Lx-1 - jx)];
+ }
+ }
+ }
+ output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
diff --git a/src/silx/resources/opencl/convolution_textures.cl b/src/silx/resources/opencl/convolution_textures.cl
new file mode 100644
index 0000000..517a67c
--- /dev/null
+++ b/src/silx/resources/opencl/convolution_textures.cl
@@ -0,0 +1,374 @@
+/******************************************************************************/
+/**************************** Macros ******************************************/
+/******************************************************************************/
+
+// Error handling
+#ifndef IMAGE_DIMS
+ #error "IMAGE_DIMS must be defined"
+#endif
+#ifndef FILTER_DIMS
+ #error "FILTER_DIMS must be defined"
+#endif
+#if FILTER_DIMS > IMAGE_DIMS
+ #error "Filter cannot have more dimensions than image"
+#endif
+
+// Boundary handling modes, mapped to OpenCL sampler addressing modes.
+// MIRRORED_REPEAT and REPEAT require normalized coordinates (USE_NORM_COORDS).
+#define CONV_MODE_REFLECT 0 // CLK_ADDRESS_MIRRORED_REPEAT : cba|abcd|dcb
+#define CONV_MODE_NEAREST 1 // CLK_ADDRESS_CLAMP_TO_EDGE : aaa|abcd|ddd
+#define CONV_MODE_WRAP 2 // CLK_ADDRESS_REPEAT : bcd|abcd|abc
+#define CONV_MODE_CONSTANT 3 // CLK_ADDRESS_CLAMP : 000|abcd|000
+#ifndef USED_CONV_MODE
+ #define USED_CONV_MODE CONV_MODE_NEAREST
+#endif
+#if USED_CONV_MODE == CONV_MODE_REFLECT
+ #define CLK_BOUNDARY CLK_ADDRESS_MIRRORED_REPEAT
+ #define CLK_COORDS CLK_NORMALIZED_COORDS_TRUE
+ #define USE_NORM_COORDS
+#elif USED_CONV_MODE == CONV_MODE_NEAREST
+ #define CLK_BOUNDARY CLK_ADDRESS_CLAMP_TO_EDGE
+ #define CLK_COORDS CLK_NORMALIZED_COORDS_FALSE
+#elif USED_CONV_MODE == CONV_MODE_WRAP
+ #define CLK_BOUNDARY CLK_ADDRESS_REPEAT
+ #define CLK_COORDS CLK_NORMALIZED_COORDS_TRUE
+ #define USE_NORM_COORDS
+#elif USED_CONV_MODE == CONV_MODE_CONSTANT
+ #define CLK_BOUNDARY CLK_ADDRESS_CLAMP
+ #define CLK_COORDS CLK_NORMALIZED_COORDS_FALSE
+#else
+ #error "Unknown convolution mode"
+#endif
+
+
+
+// Convolution index for filter (reversed: true convolution).
+// NOTE(review): Lx is used for every dimension, so non-1D filters are assumed
+// square/cubic (Lx == Ly == Lz); confirm the host enforces it.
+#define FILTER_INDEX(j) (Lx - 1 - j)
+
+// Filter access patterns
+// NOTE(review): these macros carry a trailing ';' inside the expansion, which
+// yields an empty statement at each use site — works, but fragile in
+// expression contexts.
+#define READ_FILTER_1D(j) read_imagef(filter, (int2) (FILTER_INDEX(j), 0)).x;
+#define READ_FILTER_2D(jx, jy) read_imagef(filter, (int2) (FILTER_INDEX(jx), FILTER_INDEX(jy))).x;
+#define READ_FILTER_3D(jx, jy, jz) read_imagef(filter, (int4) (FILTER_INDEX(jx), FILTER_INDEX(jy), FILTER_INDEX(jz), 0)).x;
+
+
+// Convolution index for image.
+// With normalized coordinates, +0.5f targets the texel center before dividing
+// by the image extent; with integer coordinates the raw index is used.
+#ifdef USE_NORM_COORDS
+ #define IMAGE_INDEX_X (gidx*1.0f +0.5f - c + jx)/Nx
+ #define IMAGE_INDEX_Y (gidy*1.0f +0.5f - c + jy)/Ny
+ #define IMAGE_INDEX_Z (gidz*1.0f +0.5f - c + jz)/Nz
+ #define RET_TYPE_1 float
+ #define RET_TYPE_2 float2
+ #define RET_TYPE_4 float4
+ #define C_ZERO 0.5f
+ #define GIDX (gidx*1.0f + 0.5f)/Nx
+ #define GIDY (gidy*1.0f + 0.5f)/Ny
+ #define GIDZ (gidz*1.0f + 0.5f)/Nz
+#else
+ #define IMAGE_INDEX_X (gidx - c + jx)
+ #define IMAGE_INDEX_Y (gidy - c + jy)
+ #define IMAGE_INDEX_Z (gidz - c + jz)
+ #define RET_TYPE_1 int
+ #define RET_TYPE_2 int2
+ #define RET_TYPE_4 int4
+ #define C_ZERO 0
+ #define GIDX gidx
+ #define GIDY gidy
+ #define GIDZ gidz
+#endif
+
+// Sampler shared by all kernels; nearest filtering, boundary mode from above.
+static const sampler_t sampler = CLK_COORDS | CLK_BOUNDARY | CLK_FILTER_NEAREST;
+
+// Image access patterns
+#define READ_IMAGE_1D read_imagef(input, sampler, (RET_TYPE_2) (IMAGE_INDEX_X, C_ZERO)).x
+
+#define READ_IMAGE_2D_X read_imagef(input, sampler, (RET_TYPE_2) (IMAGE_INDEX_X , GIDY)).x
+#define READ_IMAGE_2D_Y read_imagef(input, sampler, (RET_TYPE_2) (GIDX, IMAGE_INDEX_Y)).x
+#define READ_IMAGE_2D_XY read_imagef(input, sampler, (RET_TYPE_2) (IMAGE_INDEX_X, IMAGE_INDEX_Y)).x
+
+#define READ_IMAGE_3D_X read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, GIDY, GIDZ, C_ZERO)).x
+#define READ_IMAGE_3D_Y read_imagef(input, sampler, (RET_TYPE_4) (GIDX, IMAGE_INDEX_Y, GIDZ, C_ZERO)).x
+#define READ_IMAGE_3D_Z read_imagef(input, sampler, (RET_TYPE_4) (GIDX, GIDY, IMAGE_INDEX_Z, C_ZERO)).x
+#define READ_IMAGE_3D_XY read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, IMAGE_INDEX_Y, GIDZ, C_ZERO)).x
+#define READ_IMAGE_3D_XZ read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, GIDY, IMAGE_INDEX_Z, C_ZERO)).x
+#define READ_IMAGE_3D_YZ read_imagef(input, sampler, (RET_TYPE_4) (GIDX, IMAGE_INDEX_Y, IMAGE_INDEX_Z, C_ZERO)).x
+#define READ_IMAGE_3D_XYZ read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, IMAGE_INDEX_Y, IMAGE_INDEX_Z, C_ZERO)).x
+
+
+// NOTE: pyopencl and OpenCL < 1.2 do not support image1d_t
+// (hence 1D filters/images are stored as Nx1 2D images)
+#if FILTER_DIMS == 1
+ #define FILTER_TYPE image2d_t
+ #define READ_FILTER_VAL(j) READ_FILTER_1D(j)
+#elif FILTER_DIMS == 2
+ #define FILTER_TYPE image2d_t
+ #define READ_FILTER_VAL(jx, jy) READ_FILTER_2D(jx, jy)
+#elif FILTER_DIMS == 3
+ #define FILTER_TYPE image3d_t
+ #define READ_FILTER_VAL(jx, jy, jz) READ_FILTER_3D(jx, jy, jz)
+#endif
+
+#if IMAGE_DIMS == 1
+ #define IMAGE_TYPE image2d_t
+ #define READ_IMAGE_X READ_IMAGE_1D
+#elif IMAGE_DIMS == 2
+ #define IMAGE_TYPE image2d_t
+ #define READ_IMAGE_X READ_IMAGE_2D_X
+ #define READ_IMAGE_Y READ_IMAGE_2D_Y
+ #define READ_IMAGE_XY READ_IMAGE_2D_XY
+#elif IMAGE_DIMS == 3
+ #define IMAGE_TYPE image3d_t
+ #define READ_IMAGE_X READ_IMAGE_3D_X
+ #define READ_IMAGE_Y READ_IMAGE_3D_Y
+ #define READ_IMAGE_Z READ_IMAGE_3D_Z
+ #define READ_IMAGE_XY READ_IMAGE_3D_XY
+ #define READ_IMAGE_XZ READ_IMAGE_3D_XZ
+ #define READ_IMAGE_YZ READ_IMAGE_3D_YZ
+ #define READ_IMAGE_XYZ READ_IMAGE_3D_XYZ
+#endif
+
+
+// Get the center index of the filter,
+// and the "half-Left" and "half-Right" lengths.
+// In the case of an even-sized filter, the center is shifted to the left.
+// Expands in-place: requires `c`, `hL`, `hR` to be declared by the caller.
+#define GET_CENTER_HL(hlen){\
+ if (hlen & 1) {\
+ c = hlen/2;\
+ hL = c;\
+ hR = c;\
+ }\
+ else {\
+ c = hlen/2 - 1;\
+ hL = c;\
+ hR = c+1;\
+ }\
+}\
+
+
+
+/******************************************************************************/
+/**************************** 1D Convolution **********************************/
+/******************************************************************************/
+
+#if FILTER_DIMS == 1
+// Convolution with 1D kernel along axis "X" (fast dimension)
+// Works for batched 1D on 2D and batched 2D on 3D, along axis "X".
+// Boundary handling is delegated to the sampler (see CLK_BOUNDARY above).
+__kernel void convol_1D_X_tex(
+ read_only IMAGE_TYPE input,
+ __global float * output,
+ read_only FILTER_TYPE filter,
+ int Lx, // filter size
+ int Nx, // input/output number of columns
+ int Ny, // input/output number of rows
+ int Nz // input/output depth
+)
+{
+ uint gidx = get_global_id(0);
+ uint gidy = get_global_id(1);
+ uint gidz = get_global_id(2);
+ if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+ int c, hL, hR;
+ GET_CENTER_HL(Lx);
+ float sum = 0.0f;
+
+ // hL + hR + 1 == Lx taps; filter is read reversed via FILTER_INDEX
+ for (int jx = 0; jx <= hR+hL; jx++) {
+ sum += READ_IMAGE_X * READ_FILTER_VAL(jx);
+ }
+ output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+
+
+#if IMAGE_DIMS >= 2
+// Convolution with 1D kernel along axis "Y"
+// Works for batched 1D on 2D and batched 2D on 3D, along axis "Y".
+__kernel void convol_1D_Y_tex(
+ read_only IMAGE_TYPE input,
+ __global float * output,
+ read_only FILTER_TYPE filter,
+ int Lx, // filter size
+ int Nx, // input/output number of columns
+ int Ny, // input/output number of rows
+ int Nz // input/output depth
+)
+{
+ uint gidx = get_global_id(0);
+ uint gidy = get_global_id(1);
+ uint gidz = get_global_id(2);
+ if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+ int c, hL, hR;
+ GET_CENTER_HL(Lx);
+ float sum = 0.0f;
+
+ for (int jy = 0; jy <= hR+hL; jy++) {
+ sum += READ_IMAGE_Y * READ_FILTER_VAL(jy);
+ }
+ output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+#endif
+
+#if IMAGE_DIMS == 3
+// Convolution with 1D kernel along axis "Z"
+// Works for batched 1D on 2D and batched 2D on 3D, along axis "Z".
+__kernel void convol_1D_Z_tex(
+ read_only IMAGE_TYPE input,
+ __global float * output,
+ read_only FILTER_TYPE filter,
+ int Lx, // filter size
+ int Nx, // input/output number of columns
+ int Ny, // input/output number of rows
+ int Nz // input/output depth
+)
+{
+ uint gidx = get_global_id(0);
+ uint gidy = get_global_id(1);
+ uint gidz = get_global_id(2);
+ if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+ int c, hL, hR;
+ GET_CENTER_HL(Lx);
+ float sum = 0.0f;
+
+ for (int jz = 0; jz <= hR+hL; jz++) {
+ sum += READ_IMAGE_Z * READ_FILTER_VAL(jz);
+ }
+ output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+#endif
+#endif
+
+/******************************************************************************/
+/**************************** 2D Convolution **********************************/
+/******************************************************************************/
+
+#if IMAGE_DIMS >= 2 && FILTER_DIMS == 2
+// Convolution with 2D kernel
+// Works for batched 2D on 3D.
+// NOTE(review): c is derived from Lx only and FILTER_INDEX also uses Lx for
+// the y axis, so Lx == Ly (square filter) is assumed.
+__kernel void convol_2D_XY_tex(
+ read_only IMAGE_TYPE input,
+ __global float * output,
+ read_only FILTER_TYPE filter,
+ int Lx, // filter number of columns,
+ int Ly, // filter number of rows,
+ int Nx, // input/output number of columns
+ int Ny, // input/output number of rows
+ int Nz // input/output depth
+)
+{
+ uint gidx = get_global_id(0);
+ uint gidy = get_global_id(1);
+ uint gidz = get_global_id(2);
+ if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+ int c, hL, hR;
+ GET_CENTER_HL(Lx);
+ float sum = 0.0f;
+
+ for (int jy = 0; jy <= hR+hL; jy++) {
+ for (int jx = 0; jx <= hR+hL; jx++) {
+ sum += READ_IMAGE_XY * READ_FILTER_VAL(jx, jy);
+ }
+ }
+ output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+#endif
+
+#if IMAGE_DIMS == 3 && FILTER_DIMS == 2
+// Convolution with 2D kernel
+// Works for batched 2D on 3D.
+// NOTE(review): Lx == Lz (square filter) is assumed, as in convol_2D_XY_tex.
+__kernel void convol_2D_XZ_tex(
+ read_only IMAGE_TYPE input,
+ __global float * output,
+ read_only FILTER_TYPE filter,
+ int Lx, // filter number of columns,
+ int Lz, // filter number of rows,
+ int Nx, // input/output number of columns
+ int Ny, // input/output number of rows
+ int Nz // input/output depth
+)
+{
+ uint gidx = get_global_id(0);
+ uint gidy = get_global_id(1);
+ uint gidz = get_global_id(2);
+ if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+ int c, hL, hR;
+ GET_CENTER_HL(Lx);
+ float sum = 0.0f;
+
+ for (int jz = 0; jz <= hR+hL; jz++) {
+ for (int jx = 0; jx <= hR+hL; jx++) {
+ sum += READ_IMAGE_XZ * READ_FILTER_VAL(jx, jz);
+ }
+ }
+ output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+
+
+// Convolution with 2D kernel
+// Works for batched 2D on 3D.
+// NOTE(review): parameters are named Lx/Lz although the convolved axes are
+// y and z; a square filter is assumed (c from Lx only).
+__kernel void convol_2D_YZ_tex(
+ read_only IMAGE_TYPE input,
+ __global float * output,
+ read_only FILTER_TYPE filter,
+ int Lx, // filter number of columns,
+ int Lz, // filter number of rows,
+ int Nx, // input/output number of columns
+ int Ny, // input/output number of rows
+ int Nz // input/output depth
+)
+{
+ uint gidx = get_global_id(0);
+ uint gidy = get_global_id(1);
+ uint gidz = get_global_id(2);
+ if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+ int c, hL, hR;
+ GET_CENTER_HL(Lx);
+ float sum = 0.0f;
+
+ for (int jz = 0; jz <= hR+hL; jz++) {
+ for (int jy = 0; jy <= hR+hL; jy++) {
+ sum += READ_IMAGE_YZ * READ_FILTER_VAL(jy, jz);
+ }
+ }
+ output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+#endif
+
+
+/******************************************************************************/
+/**************************** 3D Convolution **********************************/
+/******************************************************************************/
+
+#if IMAGE_DIMS == 3 && FILTER_DIMS == 3
+// Convolution with 3D kernel
+// NOTE(review): c is derived from Lx only and FILTER_INDEX uses Lx for all
+// three axes, so Lx == Ly == Lz (cubic filter) is assumed.
+__kernel void convol_3D_XYZ_tex(
+ read_only IMAGE_TYPE input,
+ __global float * output,
+ read_only FILTER_TYPE filter,
+ int Lx, // filter number of columns,
+ int Ly, // filter number of rows,
+ int Lz, // filter number of rows,
+ int Nx, // input/output number of columns
+ int Ny, // input/output number of rows
+ int Nz // input/output depth
+)
+{
+ uint gidx = get_global_id(0);
+ uint gidy = get_global_id(1);
+ uint gidz = get_global_id(2);
+ if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
+
+ int c, hL, hR;
+ GET_CENTER_HL(Lx);
+ float sum = 0.0f;
+
+ for (int jz = 0; jz <= hR+hL; jz++) {
+ for (int jy = 0; jy <= hR+hL; jy++) {
+ for (int jx = 0; jx <= hR+hL; jx++) {
+ sum += READ_IMAGE_XYZ * READ_FILTER_VAL(jx, jy, jz);
+ }
+ }
+ }
+ output[(gidz*Ny + gidy)*Nx + gidx] = sum;
+}
+#endif
diff --git a/src/silx/resources/opencl/doubleword.cl b/src/silx/resources/opencl/doubleword.cl
new file mode 100644
index 0000000..a0ebfda
--- /dev/null
+++ b/src/silx/resources/opencl/doubleword.cl
@@ -0,0 +1,115 @@
+/*
+ * OpenCL library for double word floating point calculation using compensated arithmetics
+ *
+ * The theoretical basis can be found in Valentina Popescu's PhD thesis:
+ * Towards fast and certified multi-precision libraries
+ * Reference LYSEN036
+ * http://www.theses.fr/2017LYSEN036
+ * All page and equation numbers refer to this document.
+ *
+ * The precision of the calculation (bounds) is provided in ULP (smallest possible mantissa)
+ * and comes from the table 2.2 (page 68 of the thesis).
+ * The number of equivalent FLOP is taken from the table 2.3 (page 69 of the thesis).
+ * Note that FLOP are not all equal: a division is much more expensive than an addition.
+ */
+
+//This library can be expanded to double-double by redefining fp, fp2 and one to double, double2 and 1.0.
+// fp  = base floating-point type; fp2 packs a double-word (hi, lo) pair.
+#ifdef DOUBLEDOUBLE
+#define fp double
+#define fp2 double2
+#define one 1.0
+#else
+#define fp float
+#define fp2 float2
+#define one 1.0f
+#endif
+
+/* Note: i386 builds may evaluate in x87 registers, which are wider than
+ * 32-bit precision and can invalidate the error-compensation mechanism.
+ *
+ * We use the trick of declaring some variables "volatile" to enforce the
+ * actual precision reduction of those intermediate values.
+*/
+
+// Host code may define X87_VOLATILE to `volatile`; it is empty by default.
+#ifndef X87_VOLATILE
+# define X87_VOLATILE
+#endif
+
+//Algorithm 1, p23, theorem 1.1.12. Requires e_x > e_y, valid if |x| > |y|
+// "Fast2Sum": returns (s, e) with s = fl(x + y) and e the exact rounding
+// error, so x + y == s + e exactly (only when |x| >= |y|). 3 flops.
+inline fp2 fast_fp_plus_fp(fp x, fp y){
+ X87_VOLATILE fp s = x + y;
+ X87_VOLATILE fp z = s - x;
+ fp e = y - z;
+ return (fp2)(s, e);
+}
+
+//Algorithm 2, p24, same as fast_fp_plus_fp without the condition on e_x and e_y
+// "2Sum": branch-free exact addition, valid for any magnitudes. 6 flops.
+inline fp2 fp_plus_fp(fp x, fp y){
+ X87_VOLATILE fp s = x + y;
+ X87_VOLATILE fp xp = s - y;
+ X87_VOLATILE fp yp = s - xp;
+ X87_VOLATILE fp dx = x - xp;
+ X87_VOLATILE fp dy = y - yp;
+ return (fp2)(s, dx+dy);
+}
+
+//Algorithm 3, p24: multiplication with a FMA
+// "2ProdFMA": p = fl(x*y) and e = exact rounding error via fused multiply-add.
+inline fp2 fp_times_fp(fp x, fp y){
+ fp p = x * y;
+ fp e = fma(x, y, -p);
+ return (fp2)(p, e);
+}
+
+//Algorithm 7, p38: Addition of a FP to a DW. 10flop bounds:2u²+5u³
+// x is a double-word (hi=x.s0, lo=x.s1); result is renormalized.
+inline fp2 dw_plus_fp(fp2 x, fp y){
+ fp2 s = fp_plus_fp(x.s0, y);
+ X87_VOLATILE fp v = x.s1 + s.s1;
+ return fast_fp_plus_fp(s.s0, v);
+}
+
+//Algorithm 9, p40: addition of two DW: 20flop bounds:3u²+13u³
+// Sums highs and lows exactly, then renormalizes twice (AccurateDWPlusDW).
+inline fp2 dw_plus_dw(fp2 x, fp2 y){
+ fp2 s = fp_plus_fp(x.s0, y.s0);
+ fp2 t = fp_plus_fp(x.s1, y.s1);
+ fp2 v = fast_fp_plus_fp(s.s0, s.s1 + t.s0);
+ return fast_fp_plus_fp(v.s0, t.s1 + v.s1);
+}
+
+//Algorithm 12, p49: Multiplication FP*DW: 6flops bounds:2u²
+// Exact product of the high part, low-part contribution folded in via fma.
+inline fp2 dw_times_fp(fp2 x, fp y){
+ fp2 c = fp_times_fp(x.s0, y);
+ return fast_fp_plus_fp(c.s0, fma(x.s1, y, c.s1));
+}
+
+//Algorithm 14, p52: Multiplication DW*DW, 8 flops bounds:6u²
+// Cross terms x.lo*y.hi and x.hi*y.lo are combined in one fma; the tiny
+// x.lo*y.lo term is dropped (covered by the 6u² bound).
inline fp2 dw_times_dw(fp2 x, fp2 y){
+ fp2 c = fp_times_fp(x.s0, y.s0);
+ X87_VOLATILE fp l = fma(x.s1, y.s0, x.s0 * y.s1);
+ return fast_fp_plus_fp(c.s0, c.s1 + l);
+}
+
+//Algorithm 17, p55: Division DW / FP, 10flops bounds: 3.5u²
+// th = approximate quotient; d = x - th*y computed exactly via 2ProdFMA,
+// then the residual is divided once more to correct the low word.
+inline fp2 dw_div_fp(fp2 x, fp y){
+ X87_VOLATILE fp th = x.s0 / y;
+ fp2 pi = fp_times_fp(th, y);
+ fp2 d = x - pi; // component-wise fp2 subtraction: (x.hi-pi.hi, x.lo-pi.lo)
+ X87_VOLATILE fp delta = d.s0 + d.s1;
+ X87_VOLATILE fp tl = delta/y;
+ return fast_fp_plus_fp(th, tl);
+}
+
+//Derived from algorithm 20, p64: Inversion 1/ DW, 22 flops
+// One Newton-like correction of the scalar reciprocal th = 1/y.hi:
+// e = 1 - y*th (computed exactly via fma), result = th + e*th.
+inline fp2 inv_dw(fp2 y){
+ X87_VOLATILE fp th = one/y.s0;
+ X87_VOLATILE fp rh = fma(-y.s0, th, one);
+ X87_VOLATILE fp rl = -y.s1 * th;
+ fp2 e = fast_fp_plus_fp(rh, rl);
+ fp2 delta = dw_times_fp(e, th);
+ return dw_plus_fp(delta, th);
+}
+
+//Algorithm 20, p64: Division DW / DW, 30 flops: bounds:9.8u²
+// Implemented as x * (1/y) rather than a direct division algorithm.
+inline fp2 dw_div_dw(fp2 x, fp2 y){
+ return dw_times_dw(x, inv_dw(y));
+}
+
diff --git a/src/silx/resources/opencl/image/cast.cl b/src/silx/resources/opencl/image/cast.cl
new file mode 100644
index 0000000..9e23a82
--- /dev/null
+++ b/src/silx/resources/opencl/image/cast.cl
@@ -0,0 +1,181 @@
+/*
 * Project: SILX: Algorithms for image processing
+ *
+ * Copyright (C) 2013-2017 European Synchrotron Radiation Facility
+ * Grenoble, France
+ *
+ * Principal authors: J. Kieffer (kieffer@esrf.fr)
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef NB_COLOR
+ #define NB_COLOR 1
+#endif
+
+
/**
 * \brief Cast values of an array of uint8 into a float output array.
 *
 * :param array_input: Pointer to global memory with the input data as unsigned8 array
 * :param array_float: Pointer to global memory with the output data as float array
 * :param width: Width of the image
 * :param height: Height of the image
 */
kernel void u8_to_float( global unsigned char *array_input,
                         global float *array_float,
                         const int width,
                         const int height)
{
    int col = get_global_id(0);
    int row = get_global_id(1);
    // Memory guard: threads scheduled outside the image do nothing
    if ((col >= width) || (row >= height))
        return;
    int base = NB_COLOR * (col + width * row);
    for (int c=0; c<NB_COLOR; c++)
    {
        array_float[base + c] = (float) array_input[base + c];
    }
}
+
/**
 * \brief Cast values of an array of int8 into a float output array.
 *
 * :param array_input: Pointer to global memory with the input data as signed8 array
 * :param array_float: Pointer to global memory with the output data as float array
 * :param width: Width of the image
 * :param height: Height of the image
 */
kernel void s8_to_float( global char *array_input,
                         global float *array_float,
                         const int width,
                         const int height)
{
    int col = get_global_id(0);
    int row = get_global_id(1);
    // Memory guard: threads scheduled outside the image do nothing
    if ((col >= width) || (row >= height))
        return;
    int base = NB_COLOR * (col + width * row);
    for (int c=0; c<NB_COLOR; c++)
    {
        array_float[base + c] = (float) array_input[base + c];
    }
}
+
+
/**
 * \brief Cast values of an array of uint16 into a float output array.
 *
 * :param array_input: Pointer to global memory with the input data as unsigned16 array
 * :param array_float: Pointer to global memory with the output data as float array
 * :param width: Width of the image
 * :param height: Height of the image
 */
kernel void u16_to_float(global unsigned short *array_input,
                         global float *array_float,
                         const int width,
                         const int height)
{
    int col = get_global_id(0);
    int row = get_global_id(1);
    // Memory guard: threads scheduled outside the image do nothing
    if ((col >= width) || (row >= height))
        return;
    int base = NB_COLOR * (col + width * row);
    for (int c=0; c<NB_COLOR; c++)
    {
        array_float[base + c] = (float) array_input[base + c];
    }
}
+
/**
 * \brief Cast values of an array of int16 into a float output array.
 *
 * :param array_input: Pointer to global memory with the input data as signed16 array
 * :param array_float: Pointer to global memory with the output data as float array
 * :param width: Width of the image
 * :param height: Height of the image
 */
kernel void s16_to_float(global short *array_input,
                         global float *array_float,
                         const int width,
                         const int height)
{
    int col = get_global_id(0);
    int row = get_global_id(1);
    // Memory guard: threads scheduled outside the image do nothing
    if ((col >= width) || (row >= height))
        return;
    int base = NB_COLOR * (col + width * row);
    for (int c=0; c<NB_COLOR; c++)
    {
        array_float[base + c] = (float) array_input[base + c];
    }
}
+
/**
 * \brief Cast values of an array of uint32 into a float output array.
 *
 * :param array_input: Pointer to global memory with the input data as unsigned32 array
 * :param array_float: Pointer to global memory with the output data as float array
 * :param width: Width of the image
 * :param height: Height of the image
 */
kernel void u32_to_float(global unsigned int *array_input,
                         global float *array_float,
                         const int width,
                         const int height)
{
    int col = get_global_id(0);
    int row = get_global_id(1);
    // Memory guard: threads scheduled outside the image do nothing
    if ((col >= width) || (row >= height))
        return;
    int base = NB_COLOR * (col + width * row);
    for (int c=0; c<NB_COLOR; c++)
    {
        array_float[base + c] = (float) array_input[base + c];
    }
}
+
+
/**
 * \brief Cast values of an array of int32 into a float output array.
 *
 * :param array_input: Pointer to global memory with the input data as signed32 array
 * :param array_float: Pointer to global memory with the output data as float array
 * :param width: Width of the image
 * :param height: Height of the image
 */
kernel void s32_to_float(global int *array_input,
                         global float *array_float,
                         const int width,
                         const int height)
{
    int col = get_global_id(0);
    int row = get_global_id(1);
    // Memory guard: threads scheduled outside the image do nothing
    if ((col >= width) || (row >= height))
        return;
    int base = NB_COLOR * (col + width * row);
    for (int c=0; c<NB_COLOR; c++)
    {
        array_float[base + c] = (float) array_input[base + c];
    }
}
+
diff --git a/src/silx/resources/opencl/image/histogram.cl b/src/silx/resources/opencl/image/histogram.cl
new file mode 100644
index 0000000..7fb1485
--- /dev/null
+++ b/src/silx/resources/opencl/image/histogram.cl
@@ -0,0 +1,178 @@
+//CL//
+
+/*
 * Project: SILX: Algorithms for image processing
+ *
+ * Copyright (C) 2013-2018 European Synchrotron Radiation Facility
+ * Grenoble, France
+ *
+ * Principal authors: J. Kieffer (kieffer@esrf.fr)
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/* Single kernel histogram for float array.
+ *
+ * Can perform histograming in log-scale (using the arcsinh)
+
+Parameters:
+ - data: buffer with the image content in float (input)
+ - data_size: input
+ - mini: Lower bound of the first bin
 - maxi: upper bound of the last bin
 - map_operation: Type of pre-processor to use. If set to !=0, use log scale
 - hist: resulting histogram (output)
 - hist_size: number of bins
 - tmp_hist: temporary storage of size hist_size*num_groups
 - processed: temporary storage of size 1 initially set to 0
+ - local_hist: local storage of size hist_size
+
+
+Grid information:
+ * use the largest WG size. If it is larger than the number of bins (hist_size),
+ take the power of 2 immediately above
 *Schedule as many WG as the device has compute engines. No need to schedule more,
 it is just a waste of memory
+ * The histogram should fit in shared (local) memory and tmp_hist can be large!
+
+Assumes:
+ hist and local_hist have size hist_size
+ edges has size hist_size+2
+ tmp_hist has size hist_size*num_groups
+ processed is of size one and initially set to 0
+
+*/
+
+
// Optional pre-scaling of a value before histogram binning:
// code != 0 selects arcsinh (log-like, but defined for negative values),
// code == 0 leaves the value unchanged.
static float preprocess(float value, int code)
{
    //This function can be modified to have more scales
    return (code != 0) ? asinh(value) : value;
}
+
/* Single-kernel histogram for a float array; see the file-level comment
 * for the parameter contract and the required grid configuration.
 * Fix: removed a stray double semicolon on the final edges[] store.
 */
kernel void histogram(global float *data,
                      int data_size,
                      float mini,
                      float maxi,
                      int map_operation,
                      global int *hist,
                      global float *edges,
                      int hist_size,
                      global int *tmp_hist,
                      global int *processed,
                      local int *local_hist)
{
    // each thread
    int lid = get_local_id(0);
    int ws = get_local_size(0);
    int nb_engine = get_num_groups(0);
    int engine_id = get_group_id(0);

    // memset the local_hist array
    for (int i=0; i<hist_size; i+=ws)
    {
        int j = i + lid;
        if (j<hist_size)
        {
            local_hist[j] = 0;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Process the local data and accumulate in shared memory.
    // Each workgroup ("engine") handles one contiguous slice of the data.
    int bloc_size = (int) ceil((float)data_size/(float)nb_engine);
    int start = bloc_size * engine_id;
    int stop = min(start + bloc_size, data_size);
    float vmini = preprocess(mini, map_operation);
    float vscale = (float)hist_size/(preprocess(maxi, map_operation) -vmini);
    for (int i = start; i<stop; i+=ws)
    {
        int address = i + lid;
        if (address < stop)
        {
            float value = data[address];
            // NaNs and out-of-range values are silently discarded
            if ((!isnan(value)) && (value>=mini) && (value<=maxi))
            {
                float vvalue = (preprocess(value, map_operation)-vmini)*vscale;
                int idx = clamp((int) vvalue, 0, hist_size - 1);
                atomic_inc(&local_hist[idx]);
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    //Now copy result into the right place and reset the first value of the shared array
    for (int i=0; i<hist_size; i+=ws)
    {
        int j = i + lid;
        if (j<hist_size)
        {
            tmp_hist[hist_size * engine_id + j] = local_hist[j];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    local_hist[0] = 0;

    barrier(CLK_LOCAL_MEM_FENCE);

    //Increment the system wide shared variable processed and share the result with the workgroup
    if (lid == 0)
    {
        local_hist[0] = atomic_inc(processed);
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // If we are the engine last to work, perform the concatenation of all results

    if ((local_hist[0] + 1) == nb_engine)
    {
        for (int i=0; i<hist_size; i+=ws)
        {
            int j = i + lid;
            int lsum = 0;
            if (j<hist_size)
            {
                for (int k=0; k<nb_engine; k++)
                {
                    lsum += tmp_hist[hist_size * k + j];
                }
                hist[j] = lsum;
                edges[j] = vmini + j/vscale; // left edge of bin j, in pre-processed scale
            }
        }
        // Finally reset the counter and store the right edge of the last bin
        if (lid == 0)
        {
            processed[0] = 0;
            edges[hist_size] = vmini + hist_size/vscale;
        }

    }
}
diff --git a/src/silx/resources/opencl/image/map.cl b/src/silx/resources/opencl/image/map.cl
new file mode 100644
index 0000000..804a5a1
--- /dev/null
+++ b/src/silx/resources/opencl/image/map.cl
@@ -0,0 +1,85 @@
+/*
+ * Project: SILX: Data analysis library
+ * kernel for maximum and minimum calculation
+ *
+ *
+ * Copyright (C) 2013-2017 European Synchrotron Radiation Facility
+ * Grenoble, France
+ *
+ * Principal authors: J. Kieffer (kieffer@esrf.fr)
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ *
+ */
+
+#ifndef NB_COLOR
+ #define NB_COLOR 1
+#endif
+
+
+/**
+ * \brief Linear scale of signal in an image (maybe multi-color)
+ *
+ * :param image: contains the input image
+ * :param output: contains the output image after scaling
+ * :param max_min_input: 2-floats containing the maximum and the minimum value of image
+ * :param minimum: float scalar with the minimum of the output
+ * :param maximum: float scalar with the maximum of the output
+ * :param width: integer, number of columns the matrices
+ * :param height: integer, number of lines of the matrices
+ *
+ *
+ */
+
+
kernel void normalize_image(global float* image,
                            global float* output,
                            int width,
                            int height,
                            global float* max_min_input,
                            float minimum,
                            float maximum
                            )
{
    int col = get_global_id(0);
    int row = get_global_id(1);
    // Memory guard: skip threads scheduled outside the image
    if ((col >= width) || (row >= height))
        return;

    int idx = NB_COLOR * (col + width * row);
    float maxi_in = max_min_input[0];
    float mini_in = max_min_input[1];
    // Degenerate input range: propagate NaN instead of dividing by zero
    float scale = (maxi_in == mini_in) ? NAN
                                       : (maximum - minimum) / (maxi_in - mini_in);

    for (int c=0; c<NB_COLOR; c++)
    {
        output[idx+c] = fma(scale, image[idx+c]-mini_in, minimum);
    } // end color loop
}
diff --git a/src/silx/resources/opencl/image/max_min.cl b/src/silx/resources/opencl/image/max_min.cl
new file mode 100644
index 0000000..246cd48
--- /dev/null
+++ b/src/silx/resources/opencl/image/max_min.cl
@@ -0,0 +1,207 @@
+/*
+ * Project: SILX: Data analysis library
+ * kernel for maximum and minimum calculation
+ *
+ *
+ * Copyright (C) 2013-2017 European Synchrotron Radiation Facility
+ * Grenoble, France
+ *
+ * Principal authors: J. Kieffer (kieffer@esrf.fr)
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ *
+ */
+
+#ifndef NB_COLOR
+ #define NB_COLOR 1
+#endif
+
+
+
+#define REDUCE(a, b) ((float2) (fmax(a.x, b.x), fmin(a.y, b.y)))
+
// Read one pixel (NB_COLOR channels) at flattened pixel index idx and
// return (max, min) over its channels as a float2 (x = max, y = min).
static float2 read_and_map(int idx,
                           global float* data)
{
    idx *= NB_COLOR; // convert pixel index to element index
    float tmp = data[idx];
    float2 res = (float2) (tmp, tmp);
    if (NB_COLOR > 1)
    {
        for (int c=1; c<NB_COLOR; c++)
        {
            tmp = data[idx + c];
            res = (float2) (fmax(res.x, tmp), fmin(res.y, tmp));
        }
    }
    return res;
}
+
+
+/**
 * \brief max_min_global_stage1: Look for the maximum and the minimum of an array. stage1
+ *
+ * optimal workgroup size: 2^n greater than sqrt(size), limited to 512
+ * optimal total item size: (workgroup size)^2
+ * if size >total item size: adjust seq_count.
+ *
+ * :param data: Float pointer to global memory storing the vector of data.
+ * :param out: Float2 pointer to global memory storing the temporary results (workgroup size)
+ * :param seq_count: how many blocksize each thread should read
+ * :param size: size of the problem (number of element in the image
+ * :param l_data Shared memory: 2 float per thread in workgroup
+ *
+**/
+
+
kernel void max_min_reduction_stage1( global const float *data,
                                      global float2 *out,
                                      int size,
                                      local float2 *l_data)// local storage 2 float per thread
{
    int group_size = get_local_size(0);
    int lid = get_local_id(0);
    float2 acc;
    int big_block = group_size * get_num_groups(0);
    int i = lid + group_size * get_group_id(0);

    // Seed the accumulator; out-of-range threads re-read element 0,
    // which is harmless filler since fmax/fmin are idempotent.
    if (lid<size)
        acc = read_and_map(lid, data);
    else
        acc = read_and_map(0, data);

    // Linear pre-reduction stage 0
    // Each thread strides over the whole launch grid (step = big_block)
    // and reduces its elements sequentially into acc.
    while (i<size){
        acc = REDUCE(acc, read_and_map(i, data));
        i += big_block;
    }

    // parallel reduction stage 1
    // Tree reduction in shared memory; the loop stops at block==2 and
    // the final pair is merged by thread 0 below.
    l_data[lid] = acc;
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int block=group_size/2; block>1; block/=2)
    {
        if ((lid < block) && ((lid + block)<group_size)){
            l_data[lid] = REDUCE(l_data[lid], l_data[lid + block]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (lid == 0)
    {
        if (group_size > 1)
        {
            acc = REDUCE(l_data[0], l_data[1]);
        }
        else
        {
            acc = l_data[0];
        }
        // one partial (max, min) result per workgroup
        out[get_group_id(0)] = acc;
    }
}
+
+
+/**
 * \brief global_max_min: Look for the maximum and the minimum of an array.
+ *
+ *
+ *
+ * :param data2: Float2 pointer to global memory storing the vector of pre-reduced data (workgroup size).
 * :param maxmin: Float2 pointer to global memory receiving the (maximum, minimum) pair
+ * :param l_data Shared memory: 2 float per thread in workgroup
+ *
+**/
+
kernel void max_min_reduction_stage2(
        global const float2 *data2,
        global float2 *maxmin,
        local float2 *l_data)// local storage 2 float per thread
{
    int lid = get_local_id(0);
    int group_size = get_local_size(0);
    float2 acc = (float2)(-1.0f, -1.0f);
    // NOTE(review): lid < group_size always holds for a work-item, so
    // this condition is always true and the else branch is dead code.
    // It presumably intended to guard against reading past the number
    // of stage-1 partial results in data2 — confirm that the stage-1
    // group count matches this kernel's workgroup size.
    if (lid<=group_size)
    {
        l_data[lid] = data2[lid];
    }
    else
    {
        l_data[lid] = acc;
    }

    // parallel reduction stage 2
    // Same tree reduction as stage 1; stops at block==2 and thread 0
    // merges the last pair below.

    barrier(CLK_LOCAL_MEM_FENCE);
    for (int block=group_size/2; block>1; block/=2)
    {
        if ((lid < block) && ((lid + block)<group_size))
        {
            l_data[lid] = REDUCE(l_data[lid], l_data[lid + block]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);

    }

    if (lid == 0 )
    {
        if ( group_size > 1)
        {
            acc = REDUCE(l_data[0], l_data[1]);
        }
        else
        {
            acc = l_data[0];
        }
        // final (maximum, minimum) of the whole array
        maxmin[0] = acc;
    }
}
+
+/*This is the serial version of the min_max kernel.
+ *
+ * It has to be launched with WG=1 and only 1 WG has to be launched !
+ *
+ * :param data: Float pointer to global memory storing the vector of data.
 * :param size: number of elements in the data array
 * :param maxmin: Float2 pointer to global memory receiving the (maximum, minimum) pair
+ *
+ *
+ */
/* Serial (single work-item) max/min reduction; launch with exactly one
 * workgroup of size 1. Writes (maximum, minimum) to maxmin[0].
 */
kernel void max_min_serial(
        global const float *data,
        unsigned int size,
        global float2 *maxmin
        )
{
    float2 acc = read_and_map(0, data);
    // unsigned loop index: matches the type of `size`, avoiding a
    // signed/unsigned comparison in the loop condition
    for (unsigned int i=1; i<size; i++)
    {
        acc = REDUCE(acc, read_and_map(i, data));
    }

    maxmin[0] = acc;
}
diff --git a/src/silx/resources/opencl/kahan.cl b/src/silx/resources/opencl/kahan.cl
new file mode 100644
index 0000000..c23d84d
--- /dev/null
+++ b/src/silx/resources/opencl/kahan.cl
@@ -0,0 +1,143 @@
+/*
+ * OpenCL library for 32-bits floating point calculation using compensated arithmetics
+ */
+
+/* Nota: i386 computer use x87 registers which are larger than the 32bits precision
+ * which can invalidate the error compensation mechanism.
+ *
+ * We use the trick to declare some variable "volatile" to enforce the actual
+ * precision reduction of those variables.
+*/
+
+#ifndef X87_VOLATILE
+# define X87_VOLATILE
+#endif
+
// calculate acc.s0+value with error compensation
// see https://en.wikipedia.org/wiki/Kahan_summation_algorithm
// unlike wikipedia, here the exact value = acc.s0 + acc.s1 (performed in higher precision)

static inline float2 kahan_sum(float2 acc, float value)
{
    if (value)
    {
        float sum = acc.s0;
        float err = acc.s1;
        // the two-sum below needs |first operand| >= |second|: swap if needed
        if (fabs(value) > fabs(sum))
        {
            float tmp = sum;
            sum = value;
            value = tmp;
        }

        float cor = value + err;
        X87_VOLATILE float target = sum + cor;
        err = cor - (target - sum); // rounding error of the addition
        return (float2)(target, err);
    }
    else
    {
        // adding zero changes nothing: skip the compensation arithmetic
        return acc;
    }
}
+
// calculate a*b with error compensation
// see https://hal.archives-ouvertes.fr/hal-01367769/document
// Returns (x, y) with x = round(a*b) and y the exact rounding error,
// so a*b == x + y (relies on a correctly rounded fma).
static inline float2 comp_prod(float a, float b)
{
    X87_VOLATILE float x = a * b;
    float y = fma(a, b, -x);
    return (float2)(x, y);
}
+
// calculate a + b with error compensation
// Two-sum of the high parts with both low parts folded into the
// correction term; same scheme as kahan_sum but both inputs are pairs.
static inline float2 compensated_sum(float2 a, float2 b)
{
    float err = a.s1 + b.s1;
    float first = a.s0;
    float second = b.s0;
    if (fabs(second) > fabs(first))
    {//swap first and second
        float tmp = first;
        first = second;
        second = tmp;
    }
    float cor = second + err;
    X87_VOLATILE float target = first + cor;
    err = cor - (target - first); // rounding error of the addition
    return (float2)(target, err);
}
+
// calculate a * b with error compensation
// Accumulates the partial products: the pair (a.s0*b.s0, a.s0*b.s1)
// seeds the accumulator, then the two remaining cross terms are added
// with compensated summation.
static inline float2 compensated_mul(float2 a, float2 b)
{
    float2 tmp;
    tmp = kahan_sum((float2)(a.s0*b.s0, a.s0*b.s1), a.s1*b.s0);
    tmp = kahan_sum(tmp, a.s1*b.s1);
    return tmp;
}
+
// calculate 1/a with error compensation (Needs validation)
// First-order expansion: 1/(main + err) ≈ 1/main - err/main²
static inline float2 compensated_inv(float2 a)
{
    float main = a.s0;
    float err = a.s1;
    return (float2)(1.0f/main, -err/(main*main));
}
+
// calculate a/b with error compensation (Needs validation)
// Uses the first-order expansion 1/(bh+bl) ≈ (1 - bl/bh)/bh, i.e.
// a/b ≈ (a - a*(bl/bh)) / bh, accumulated with compensated sums.
// (cleanup: removed the unused locals ah/al)
static inline float2 compensated_div(float2 a, float2 b)
{
    float bh = b.s0;
    float bl = b.s1;
    float bl_over_bh = bl/bh;
    // dividing a float2 by a scalar divides both components
    return kahan_sum(kahan_sum(a, -a.s1 * bl_over_bh), -a.s0 * bl_over_bh) / bh;
}
+
+
// calculate a.b where a and b are float2
// Returns the dot product as a (value, error) compensated pair.
static inline float2 comp_dot2(float2 a, float2 b)
{
    return compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1));
}
+
// calculate a.b where a and b are float3
// Returns the dot product as a (value, error) compensated pair.
static inline float2 comp_dot3(float3 a, float3 b)
{
    return compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)), comp_prod(a.s2, b.s2));
}
+
// calculate a.b where a and b are float4
// Returns the dot product as a (value, error) compensated pair;
// partial products are combined pairwise (tree shape).
static inline float2 comp_dot4(float4 a, float4 b)
{
    return compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)),
                           compensated_sum(comp_prod(a.s2, b.s2), comp_prod(a.s3, b.s3)));
}
+
// calculate a.b where a and b are float8
// Returns the dot product as a (value, error) compensated pair;
// partial products are combined pairwise (tree shape).
static inline float2 comp_dot8(float8 a, float8 b)
{
    return compensated_sum(
            compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)),
                            compensated_sum(comp_prod(a.s2, b.s2), comp_prod(a.s3, b.s3))),
            compensated_sum(compensated_sum(comp_prod(a.s4, b.s4), comp_prod(a.s5, b.s5)),
                            compensated_sum(comp_prod(a.s6, b.s6), comp_prod(a.s7, b.s7))));
}
+
// calculate a.b where a and b are float16
// Returns the dot product as a (value, error) compensated pair;
// partial products are combined pairwise (tree shape).
static inline float2 comp_dot16(float16 a, float16 b)
{
    return compensated_sum(
            compensated_sum(
                    compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)),
                                    compensated_sum(comp_prod(a.s2, b.s2), comp_prod(a.s3, b.s3))),
                    compensated_sum(compensated_sum(comp_prod(a.s4, b.s4), comp_prod(a.s5, b.s5)),
                                    compensated_sum(comp_prod(a.s6, b.s6), comp_prod(a.s7, b.s7)))),
            compensated_sum(
                    compensated_sum(compensated_sum(comp_prod(a.s8, b.s8), comp_prod(a.s9, b.s9)),
                                    compensated_sum(comp_prod(a.sa, b.sa), comp_prod(a.sb, b.sb))),
                    compensated_sum(compensated_sum(comp_prod(a.sc, b.sc), comp_prod(a.sd, b.sd)),
                                    compensated_sum(comp_prod(a.se, b.se), comp_prod(a.sf, b.sf)))));
}
diff --git a/src/silx/resources/opencl/linalg.cl b/src/silx/resources/opencl/linalg.cl
new file mode 100644
index 0000000..8710528
--- /dev/null
+++ b/src/silx/resources/opencl/linalg.cl
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2017 European Synchrotron Radiation Facility
+ * Grenoble, France
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+
+/**
+ *
+ * Compute the spatial gradient of an image.
+ *
+ * slice: input image
+ * slice_grad: output gradient
+ * sizeX: number of columns of the image
+ * sizeY: number of rows of the image
+ *
+ **/
kernel void kern_gradient2D(
        global float* slice,
        global float2* slice_grad,
        int sizeX,
        int sizeY)
{
    int col = (int) get_global_id(0);
    int row = (int) get_global_id(1);
    if ((col >= sizeX) || (row >= sizeY))
        return;

    // Note the direction inconstancy ! (JK 07/2018)
    int pos = row * sizeX + col;
    // forward differences, zero on the last column/row respectively
    float grad_y = (col == (sizeX-1)) ? 0 : slice[pos + 1] - slice[pos];
    float grad_x = (row == (sizeY-1)) ? 0 : slice[pos + sizeX] - slice[pos];

    slice_grad[pos].x = grad_x;
    slice_grad[pos].y = grad_y;
}
+
+/**
+ *
+ * Compute the spatial divergence of an image gradient.
+ *
+ * slice_grad: input gradient-like image
+ * slice: output image
+ * sizeX: number of columns of the input
+ * sizeY: number of rows of the input
+ *
+ **/
kernel void kern_divergence2D(
        global float2* slice_grad,
        global float* slice,
        int sizeX,
        int sizeY)
{
    int col = (int) get_global_id(0);
    int row = (int) get_global_id(1);
    if ((col >= sizeX) || (row >= sizeY))
        return;

    int pos = row * sizeX + col;
    // backward differences; on the first column/row the value itself is kept
    float div_y = slice_grad[pos].y;
    if (col > 0)
        div_y -= slice_grad[pos - 1].y;
    float div_x = slice_grad[pos].x;
    if (row > 0)
        div_x -= slice_grad[pos - sizeX].x;
    slice[pos] = div_x + div_y;
}
diff --git a/src/silx/resources/opencl/medfilt.cl b/src/silx/resources/opencl/medfilt.cl
new file mode 100644
index 0000000..0680230
--- /dev/null
+++ b/src/silx/resources/opencl/medfilt.cl
@@ -0,0 +1,141 @@
+/*
+ * Project: Azimuthal regroupping OpenCL kernel for PyFAI.
+ * Median filter for 1D, 2D and 3D datasets, only 2D for now
+ *
+ *
+ * Copyright (C) 2017-2017 European Synchrotron Radiation Facility
+ * Grenoble, France
+ *
+ * Principal authors: J. Kieffer (kieffer@esrf.fr)
+ * Last revision: 07/02/2017
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+
+/*
+ * Needs to be concatenated with bitonic.cl prior to compilation
+*/
+
+/*
 * Perform the 2D median filtering of a 2D image
+ *
+ * dim0 => wg=number_of_element in the tile /8
+ * dim1 = x: wg=1
+
+ *
 * Actually the workgroup size is a bit more complicated:
+ * if window size = 1,3,5,7: WG1=8
+ * if window size = 9,11,13,15: WG1=32
+ * if window size = 17, ...,21: WG1=64
+ *
+ * More Generally the workgroup size must be: at least: kfs2*(kfs1+7)/8
+ * Each thread treats 8 values aligned vertically, this allows (almost)
+ * coalesced reading between threads in one line of the tile.
+ *
+ * Later on it will be more efficient to re-use the same tile
+ * and slide it vertically by one line.
 * The additional need for shared memory will be kfs2 floats and a float8 as register.
 *
 * Theoretically, it should be possible to handle window sizes up to 83x83
+ */
__kernel void medfilt2d(__global float *image,  // input image
                        __global float *result, // output array
                        __local float4 *l_data, // local storage 4x the number of threads
                        int khs1,   // Kernel half-size along dim1 (nb lines)
                        int khs2,   // Kernel half-size along dim2 (nb columns)
                        int height, // Image size along dim1 (nb lines)
                        int width)  // Image size along dim2 (nb columns)
{
    int threadid = get_local_id(0);
    //int wg = get_local_size(0);
    int x = get_global_id(1); // one workgroup per image column

    if (x < width)
    {
        // Union view: the float8 is what my_sort_file sorts, the array
        // view gives indexed access to the 8 values held per thread.
        union
        {
            float  ary[8];
            float8 vec;
        } output, input;
        input.vec = (float8)(MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT);
        int kfs1 = 2 * khs1 + 1; //definition of kernel full size
        int kfs2 = 2 * khs2 + 1;
        int nbands = (kfs1 + 7) / 8; // 8 elements per thread, aligned vertically in 1 column
        for (int y=0; y<height; y++)
        {
            //Select only the active threads, some may remain inactive
            int nb_threads =  (nbands * kfs2);
            int band_nr =  threadid / kfs2;
            int band_id =  threadid % kfs2;
            // clamp() replicates the border pixels outside the image
            int pos_x = clamp((int)(x + band_id - khs2), (int) 0, (int) width-1);
            int max_vec = clamp(kfs1 - 8 * band_nr, 0, 8);
            if (y == 0)
            {
                // first line: load the complete window column from the image
                for (int i=0; i<max_vec; i++)
                {
                    if (threadid<nb_threads)
                    {
                        int pos_y = clamp((int)(y + 8 * band_nr + i - khs1), (int) 0, (int) height-1);
                        input.ary[i] = image[pos_x + width * pos_y];
                    }
                }
            }
            else
            {
                //store storage.s0 to some shared memory to retrieve it from another thread.
                l_data[threadid].s0 = input.vec.s0;

                //Offset to the bottom
                // slide the window down one line: shift the 8 values up by one
                input.vec = (float8)(input.vec.s1,
                                     input.vec.s2,
                                     input.vec.s3,
                                     input.vec.s4,
                                     input.vec.s5,
                                     input.vec.s6,
                                     input.vec.s7,
                                     MAXFLOAT);

                barrier(CLK_LOCAL_MEM_FENCE);

                // refill the freed slot from the band below, or from the
                // image when this thread holds the last band of the column
                int read_from = threadid + kfs2;
                if (read_from < nb_threads)
                    input.vec.s7 = l_data[read_from].s0;
                else if (threadid < nb_threads) //we are on the last band
                {
                    int pos_y = clamp((int)(y + 8 * band_nr + max_vec - 1 - khs1), (int) 0, (int) height-1);
                    input.ary[max_vec - 1] = image[pos_x + width * pos_y];
                }

            }

            //This function is defined in bitonic.cl
            output.vec = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0),
                                      input.vec, l_data);

            // the median is element (kfs1*kfs2)/2 of the sorted sequence:
            // it lives in slot target%8 of thread target/8
            size_t target = (kfs1 * kfs2) / 2;
            if (threadid == (target / 8))
            {
                result[y * width + x] = output.ary[target % 8];
            }

        }
    }
}
+
diff --git a/src/silx/resources/opencl/preprocess.cl b/src/silx/resources/opencl/preprocess.cl
new file mode 100644
index 0000000..de35c48
--- /dev/null
+++ b/src/silx/resources/opencl/preprocess.cl
@@ -0,0 +1,567 @@
+/*
+ * Project: Azimuthal regroupping OpenCL kernel for PyFAI.
+ * Preprocessing program
+ *
+ *
+ * Copyright (C) 2012-2017 European Synchrotron Radiation Facility
+ * Grenoble, France
+ *
+ * Principal authors: J. Kieffer (kieffer@esrf.fr)
+ * Last revision: 19/01/2017
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+/**
+ * \file
+ *
+ * \brief OpenCL kernels for image array casting, array mem-setting and normalizing
+ *
+ * Constant to be provided at build time:
+ * NIMAGE: size of the image
+ *
+ */
+
+#include "for_eclipse.h"
+
+/**
+ * \brief cast values of an array of int8 into a float output array.
+ *
+ * - array_s8: Pointer to global memory with the input data as signed8 array
+ * - array_float: Pointer to global memory with the output data as float array
+ */
+__kernel void
+s8_to_float(__global char *array_s8,
+            __global float *array_float
+)
+{
+    // One work-item per element: widen the signed 8-bit value to float.
+    int i = get_global_id(0);
+    //Global memory guard for padding
+    if (i < NIMAGE)
+        array_float[i] = (float)array_s8[i];
+}
+
+
+/**
+ * \brief cast values of an array of uint8 into a float output array.
+ *
+ * - array_u8: Pointer to global memory with the input data as unsigned8 array
+ * - array_float: Pointer to global memory with the output data as float array
+ */
+__kernel void
+u8_to_float(__global unsigned char *array_u8,
+            __global float *array_float
+)
+{
+    // One work-item per element: widen the unsigned 8-bit value to float.
+    int i = get_global_id(0);
+    //Global memory guard for padding
+    if (i < NIMAGE)
+        array_float[i] = (float)array_u8[i];
+}
+
+
+/**
+ * \brief cast values of an array of int16 into a float output array.
+ *
+ * - array_s16: Pointer to global memory with the input data as signed16 array
+ * - array_float: Pointer to global memory with the output data as float array
+ */
+__kernel void
+s16_to_float(__global short *array_s16,
+             __global float *array_float
+)
+{
+    // One work-item per element: widen the signed 16-bit value to float.
+    int i = get_global_id(0);
+    //Global memory guard for padding
+    if (i < NIMAGE)
+        array_float[i] = (float)array_s16[i];
+}
+
+
+/**
+ * \brief cast values of an array of uint16 into a float output array.
+ *
+ * - array_u16: Pointer to global memory with the input data as unsigned16 array
+ * - array_float: Pointer to global memory with the output data as float array
+ */
+__kernel void
+u16_to_float(__global unsigned short *array_u16,
+             __global float *array_float
+)
+{
+    // One work-item per element: widen the unsigned 16-bit value to float.
+    int i = get_global_id(0);
+    //Global memory guard for padding
+    if (i < NIMAGE)
+        array_float[i] = (float)array_u16[i];
+}
+
+/**
+ * \brief cast values of an array of uint32 into a float output array.
+ *
+ * - array_u32: Pointer to global memory with the input data as unsigned32 array
+ * - array_float: Pointer to global memory with the output data as float array
+ */
+__kernel void
+u32_to_float(__global unsigned int *array_u32,
+             __global float *array_float
+)
+{
+    // One work-item per element: convert the unsigned 32-bit value to float.
+    // NOTE(review): values above 2^24 lose precision in a 32-bit float.
+    int i = get_global_id(0);
+    //Global memory guard for padding
+    if (i < NIMAGE)
+        array_float[i] = (float)array_u32[i];
+}
+
+/**
+ * \brief convert values of an array of int32 into a float output array.
+ *
+ * - array_int: Pointer to global memory with the data as unsigned32 array
+ * - array_float: Pointer to global memory with the data float array
+ */
+__kernel void
+s32_to_float(__global int *array_int,
+             __global float *array_float
+)
+{
+    // One work-item per element: convert the signed 32-bit value to float.
+    // NOTE(review): magnitudes above 2^24 lose precision in a 32-bit float.
+    int i = get_global_id(0);
+    //Global memory guard for padding
+    if (i < NIMAGE)
+        array_float[i] = (float)(array_int[i]);
+}
+
+
+/**
+ * Functions to be called from an actual kernel.
+ * \brief Performs the normalization of input image by dark subtraction,
+ * variance is propagated to second member of the float3
+ * flatfield, solid angle, polarization and absorption are stored in
+ * third member of the float3 returned.
+ *
+ * Invalid/Dummy pixels will all have the third-member set to 0, i.e. no weight
+ *
+ * - image Float pointer to global memory storing the input image.
+ * - do_dark Bool/int: shall dark-current correction be applied ?
+ * - dark Float pointer to global memory storing the dark image.
+ * - do_flat Bool/int: shall flat-field correction be applied ?
+ * - flat Float pointer to global memory storing the flat image.
+ * - do_solidangle Bool/int: shall solid-angle correction be applied ?
+ * - solidangle Float pointer to global memory storing the solid angle of each pixel.
+ * - do_polarization Bool/int: shall polarization correction be applied ?
+ * - polarization Float pointer to global memory storing the polarization of each pixel.
+ * - do_absorption Bool/int: shall absorption correction be applied ?
+ * - absorption     Float pointer to global memory storing the effective absorption of each pixel.
+ * - do_mask perform mask correction ?
+ * - mask Bool/char pointer to mask array
+ * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
+ * - dummy Float: value for bad pixels
+ * - delta_dummy Float: precision for bad pixel value
+ * - normalization_factor : divide the input by this value
+ *
+**/
+
+static float3 _preproc3(const __global float *image,
+                        const __global float *variance,
+                        const char do_dark,
+                        const __global float *dark,
+                        const char do_dark_variance,
+                        const __global float *dark_variance,
+                        const char do_flat,
+                        const __global float *flat,
+                        const char do_solidangle,
+                        const __global float *solidangle,
+                        const char do_polarization,
+                        const __global float *polarization,
+                        const char do_absorption,
+                        const __global float *absorption,
+                        const char do_mask,
+                        const __global char *mask,
+                        const char do_dummy,
+                        const float dummy,
+                        const float delta_dummy,
+                        const float normalization_factor)
+{
+    // Returns (signal, variance, normalization). A zero third component marks
+    // an invalid pixel; callers test result.s2 to detect it.
+    size_t i= get_global_id(0);
+    float3 result = (float3)(0.0, 0.0, 0.0);
+    if (i < NIMAGE)
+    {
+        // Masked pixels keep the all-zero default (no weight).
+        if ((!do_mask) || (!mask[i]))
+        {
+            result.s0 = image[i];
+            // Optional variance channel: a null pointer disables it.
+            if (variance != 0)
+                result.s1 = variance[i];
+            result.s2 = normalization_factor;
+            // Dummy test: delta_dummy == 0 requests an exact comparison,
+            // otherwise a tolerance band around the dummy value is used.
+            if ( (!do_dummy)
+                  ||((delta_dummy != 0.0f) && (fabs(result.s0-dummy) > delta_dummy))
+                  ||((delta_dummy == 0.0f) && (result.s0 != dummy)))
+            {
+                if (do_dark)
+                    result.s0 -= dark[i];
+                // Variances add under subtraction of an independent dark frame.
+                if (do_dark_variance)
+                    result.s1 += dark_variance[i];
+                if (do_flat)
+                {
+                    float one_flat = flat[i];
+                    // A dummy value in the flat field invalidates the pixel
+                    // by zeroing its normalization.
+                    if ( (!do_dummy)
+                         ||((delta_dummy != 0.0f) && (fabs(one_flat-dummy) > delta_dummy))
+                         ||((delta_dummy == 0.0f) && (one_flat != dummy)))
+                        result.s2 *= one_flat;
+                    else
+                        result.s2 = 0.0f;
+                }
+                // All multiplicative corrections accumulate in the
+                // normalization (third) component.
+                if (do_solidangle)
+                    result.s2 *= solidangle[i];
+                if (do_polarization)
+                    result.s2 *= polarization[i];
+                if (do_absorption)
+                    result.s2 *= absorption[i];
+                // Any NaN, or a null normalization, invalidates the whole triplet.
+                if (isnan(result.s0) || isnan(result.s1) || isnan(result.s2) || (result.s2 == 0.0f))
+                    result = (float3)(0.0, 0.0, 0.0);
+            }
+            else
+            {
+                // Pixel matches the dummy value: mark it invalid.
+                result = (float3)(0.0, 0.0, 0.0);
+            }//end if do_dummy
+        } // end if mask
+    };//end if NIMAGE
+    return result;
+};//end function
+
+
+/**
+ * \brief Performs the normalization of input image by dark subtraction,
+ * flatfield, solid angle, polarization and absorption division.
+ *
+ * Intensities of images are corrected by:
+ * - dark (read-out) noise subtraction
+ * - Solid angle correction (division)
+ * - polarization correction (division)
+ * - flat-field correction (division)
+ * Corrections are made in place unless the pixel is dummy.
+ * Dummy pixels are left untouched so that they remain dummy
+ *
+ * - image Float pointer to global memory storing the input image.
+ * - do_dark Bool/int: shall dark-current correction be applied ?
+ * - dark Float pointer to global memory storing the dark image.
+ * - do_flat Bool/int: shall flat-field correction be applied ?
+ * - flat Float pointer to global memory storing the flat image.
+ * - do_solidangle Bool/int: shall solid-angle correction be applied ?
+ * - solidangle Float pointer to global memory storing the solid angle of each pixel.
+ * - do_polarization Bool/int: shall polarization correction be applied ?
+ * - polarization Float pointer to global memory storing the polarization of each pixel.
+ * - do_absorption Bool/int: shall absorption correction be applied ?
+ * - absorption     Float pointer to global memory storing the effective absorption of each pixel.
+ * - do_mask perform mask correction ?
+ * - mask Bool/char pointer to mask array
+ * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
+ * - dummy Float: value for bad pixels
+ * - delta_dummy Float: precision for bad pixel value
+ * - normalization_factor : divide the input by this value
+ *
+**/
+
+__kernel void
+corrections(const __global float *image,
+            const char do_dark,
+            const __global float *dark,
+            const char do_flat,
+            const __global float *flat,
+            const char do_solidangle,
+            const __global float *solidangle,
+            const char do_polarization,
+            const __global float *polarization,
+            const char do_absorption,
+            const __global float *absorption,
+            const char do_mask,
+            const __global char *mask,
+            const char do_dummy,
+            const float dummy,
+            const float delta_dummy,
+            const float normalization_factor,
+            __global float *output
+            )
+{
+    // Scalar variant: writes corrected intensity = signal / normalization,
+    // or `dummy` where the pixel is invalid (normalization == 0).
+    size_t i= get_global_id(0);
+    float3 result = (float3)(0.0, 0.0, 0.0);
+    if (i < NIMAGE)
+    {
+        // Variance processing is disabled: null pointers/flags are passed
+        // for the variance and dark-variance channels of _preproc3.
+        result = _preproc3(image,
+                           0,
+                           do_dark,
+                           dark,
+                           0,
+                           0,
+                           do_flat,
+                           flat,
+                           do_solidangle,
+                           solidangle,
+                           do_polarization,
+                           polarization,
+                           do_absorption,
+                           absorption,
+                           do_mask,
+                           mask,
+                           do_dummy,
+                           dummy,
+                           delta_dummy,
+                           normalization_factor);
+        if (result.s2 != 0.0f)
+            output[i] = result.s0 / result.s2;
+        else
+            output[i] = dummy;
+    };//end if NIMAGE
+
+};//end kernel
+
+
+/**
+ * \brief Performs Normalization of input image with float2 output (num,denom)
+ *
+ * Intensities of images are corrected by:
+ * - dark (read-out) noise subtraction for the data
+ * - Solid angle correction (denominator)
+ * - polarization correction (denominator)
+ * - flat fiels correction (denominator)
+ *
+ * Corrections are made out of place.
+ * Dummy pixels set both the numerator and denominator to 0
+ *
+ * - image Float pointer to global memory storing the input image.
+ * - do_dark Bool/int: shall dark-current correction be applied ?
+ * - dark Float pointer to global memory storing the dark image.
+ * - do_flat Bool/int: shall flat-field correction be applied ?
+ * - flat Float pointer to global memory storing the flat image.
+ * - do_solidangle Bool/int: shall flat-field correction be applied ?
+ * - solidangle Float pointer to global memory storing the solid angle of each pixel.
+ * - do_polarization Bool/int: shall flat-field correction be applied ?
+ * - polarization Float pointer to global memory storing the polarization of each pixel.
+ * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
+ * - dummy Float: value for bad pixels
+ * - delta_dummy Float: precision for bad pixel value
+ * - normalization_factor : divide the input by this value
+ *
+ *
+**/
+__kernel void
+corrections2(const __global float *image,
+             const char do_dark,
+             const __global float *dark,
+             const char do_flat,
+             const __global float *flat,
+             const char do_solidangle,
+             const __global float *solidangle,
+             const char do_polarization,
+             const __global float *polarization,
+             const char do_absorption,
+             const __global float *absorption,
+             const char do_mask,
+             const __global char *mask,
+             const char do_dummy,
+             const float dummy,
+             const float delta_dummy,
+             const float normalization_factor,
+             __global float2 *output
+             )
+{
+    // Pair variant: writes (numerator, denominator) = (signal, normalization)
+    // without performing the division; invalid pixels yield (0, 0).
+    size_t i= get_global_id(0);
+    float3 result = (float3)(0.0, 0.0, 0.0);
+    if (i < NIMAGE)
+    {
+        // Variance processing is disabled (null variance channels).
+        result = _preproc3(image,
+                           0,
+                           do_dark,
+                           dark,
+                           0,
+                           0,
+                           do_flat,
+                           flat,
+                           do_solidangle,
+                           solidangle,
+                           do_polarization,
+                           polarization,
+                           do_absorption,
+                           absorption,
+                           do_mask,
+                           mask,
+                           do_dummy,
+                           dummy,
+                           delta_dummy,
+                           normalization_factor);
+        // Keep only signal (s0) and normalization (s2); s1 is unused here.
+        output[i] = (float2)(result.s0, result.s2);
+    };//end if NIMAGE
+};//end kernel
+
+/**
+ * \brief Performs Normalization of input image with float3 output (signal, variance, normalization) assuming poissonian signal
+ *
+ * Intensities of images are corrected by:
+ * - dark (read-out) noise subtraction for the data
+ * - Solid angle correction (denominator)
+ * - polarization correction (denominator)
+ * - flat fiels correction (denominator)
+ *
+ * Corrections are made out of place.
+ * Dummy pixels set both the numerator and denominator to 0
+ *
+ * - image Float pointer to global memory storing the input image.
+ * - do_dark Bool/int: shall dark-current correction be applied ?
+ * - dark Float pointer to global memory storing the dark image.
+ * - do_flat Bool/int: shall flat-field correction be applied ?
+ * - flat Float pointer to global memory storing the flat image.
+ * - do_solidangle Bool/int: shall flat-field correction be applied ?
+ * - solidangle Float pointer to global memory storing the solid angle of each pixel.
+ * - do_polarization Bool/int: shall flat-field correction be applied ?
+ * - polarization Float pointer to global memory storing the polarization of each pixel.
+ * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
+ * - dummy Float: value for bad pixels
+ * - delta_dummy Float: precision for bad pixel value
+ * - normalization_factor : divide the input by this value
+ *
+ *
+**/
+__kernel void
+corrections3Poisson( const __global float *image,
+                     const char do_dark,
+                     const __global float *dark,
+                     const char do_flat,
+                     const __global float *flat,
+                     const char do_solidangle,
+                     const __global float *solidangle,
+                     const char do_polarization,
+                     const __global float *polarization,
+                     const char do_absorption,
+                     const __global float *absorption,
+                     const char do_mask,
+                     const __global char *mask,
+                     const char do_dummy,
+                     const float dummy,
+                     const float delta_dummy,
+                     const float normalization_factor,
+                     __global float3 *output
+                     )
+{
+    // Poisson variant: writes (signal, variance, normalization) where the
+    // variance is taken equal to the raw signal (Poisson statistics), hence
+    // `image` is also passed as the variance array and `dark` as its variance.
+    size_t i= get_global_id(0);
+    float3 result = (float3)(0.0, 0.0, 0.0);
+    if (i < NIMAGE)
+    {
+        result = _preproc3(image,
+                           image,
+                           do_dark,
+                           dark,
+                           do_dark,
+                           dark,
+                           do_flat,
+                           flat,
+                           do_solidangle,
+                           solidangle,
+                           do_polarization,
+                           polarization,
+                           do_absorption,
+                           absorption,
+                           do_mask,
+                           mask,
+                           do_dummy,
+                           dummy,
+                           delta_dummy,
+                           normalization_factor);
+        output[i] = result;
+    };//end if NIMAGE
+};//end kernel
+
+
+/**
+ * \brief Performs Normalization of input image with float3 output (signal, variance, normalization)
+ *
+ * Intensities of images are corrected by:
+ * - dark (read-out) noise subtraction for the data
+ * - Solid angle correction (division)
+ * - polarization correction (division)
+ * - flat fiels correction (division)
+ * Corrections are made in place unless the pixel is dummy.
+ * Dummy pixels are left untouched so that they remain dummy
+ *
+ * - image Float pointer to global memory storing the input image.
+ * - do_dark Bool/int: shall dark-current correction be applied ?
+ * - dark Float pointer to global memory storing the dark image.
+ * - do_flat Bool/int: shall flat-field correction be applied ?
+ * - flat Float pointer to global memory storing the flat image.
+ * - do_solidangle Bool/int: shall flat-field correction be applied ?
+ * - solidangle Float pointer to global memory storing the solid angle of each pixel.
+ * - do_polarization Bool/int: shall flat-field correction be applied ?
+ * - polarization Float pointer to global memory storing the polarization of each pixel.
+ * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
+ * - dummy Float: value for bad pixels
+ * - delta_dummy Float: precision for bad pixel value
+ * - normalization_factor : divide the input by this value
+ *
+ *
+**/
+
+__kernel void
+corrections3(const __global float *image,
+             const __global float *variance,
+             const char do_dark,
+             const __global float *dark,
+             const char do_dark_variance,
+             const __global float *dark_variance,
+             const char do_flat,
+             const __global float *flat,
+             const char do_solidangle,
+             const __global float *solidangle,
+             const char do_polarization,
+             const __global float *polarization,
+             const char do_absorption,
+             const __global float *absorption,
+             const char do_mask,
+             const __global char *mask,
+             const char do_dummy,
+             const float dummy,
+             const float delta_dummy,
+             const float normalization_factor,
+             __global float3 *output
+             )
+{
+    // Full variant: propagates a caller-provided variance (and dark variance)
+    // through _preproc3 and writes the (signal, variance, normalization)
+    // triplet unchanged; invalid pixels yield (0, 0, 0).
+    size_t i= get_global_id(0);
+    float3 result = (float3)(0.0, 0.0, 0.0);
+    if (i < NIMAGE)
+    {
+        result = _preproc3( image,
+                            variance,
+                            do_dark,
+                            dark,
+                            do_dark_variance,
+                            dark_variance,
+                            do_flat,
+                            flat,
+                            do_solidangle,
+                            solidangle,
+                            do_polarization,
+                            polarization,
+                            do_absorption,
+                            absorption,
+                            do_mask,
+                            mask,
+                            do_dummy,
+                            dummy,
+                            delta_dummy,
+                            normalization_factor);
+        output[i] = result;
+    };//end if NIMAGE
+};//end kernel
+
+
diff --git a/src/silx/resources/opencl/proj.cl b/src/silx/resources/opencl/proj.cl
new file mode 100644
index 0000000..2a6d870
--- /dev/null
+++ b/src/silx/resources/opencl/proj.cl
@@ -0,0 +1,345 @@
+/*
+ * Copyright (C) 2017-2017 European Synchrotron Radiation Facility
+ * Grenoble, France
+ *
+ * Based on the projector of PyHST2 - https://forge.epn-campus.eu/projects/pyhst2
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+
+/*******************************************************************************/
+/************************ GPU VERSION (with textures) **************************/
+/*******************************************************************************/
+
+#ifndef DONT_USE_TEXTURES
+kernel void forward_kernel(
+    global float *d_Sino,
+    read_only image2d_t d_slice,
+    int dimslice,
+    int num_bins,
+    global float* angles_per_project ,
+    float axis_position,
+    global float *d_axis_corrections,
+    global int* d_beginPos ,
+    global int* d_strideJoseph,
+    global int* d_strideLine ,
+    int num_projections,
+    int dimrecx,
+    int dimrecy,
+    float cpu_offset_x,
+    float cpu_offset_y,
+    int josephnoclip,
+    int normalize)
+{
+    // Forward projection (GPU path): one work-item computes one sinogram
+    // sample, walking across the slice texture along the projection ray
+    // ("Joseph" loop over j). Assumes a 16x16 work-group: all local arrays
+    // and index computations hard-code the 16.
+    // NOTE(review): cpu_offset_y is never read in this kernel — confirm intended.
+    // Hardware bilinear interpolation via the sampler replaces tex2D of the
+    // original CUDA code.
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
+    const int tidx = get_local_id(0);
+    const int bidx = get_group_id(0);
+    const int tidy = get_local_id(1);
+    const int bidy = get_group_id(1);
+    float angle;
+    float cos_angle,sin_angle ;
+
+    // Per-projection geometry staged in local memory: each array holds the
+    // two halves (A/B) of the 16 projections handled by this work-group.
+    local float corrections[16];
+    local int beginPos[16*2];
+    local int strideJoseph[16*2];
+    local int strideLine[16*2];
+
+    // thread will use corrections[tidy]
+    // All are read by first warp
+    // Rows 0..6 of the work-group cooperatively fill the local arrays;
+    // rows 7..15 skip the switch and just wait at the barrier.
+    int offset, OFFSET;
+    switch(tidy) {
+        case 0:
+            corrections[ tidx ]= d_axis_corrections[ bidy*16+tidx];
+            break;
+        case 1:
+        case 2:
+            offset = 16*(tidy-1);
+            OFFSET = dimrecy*(tidy-1);
+            beginPos [offset + tidx ]= d_beginPos[ OFFSET+ bidy*16+tidx] ;
+            break;
+        case 3:
+        case 4:
+            offset = 16*(tidy-3);
+            OFFSET = dimrecy*(tidy-3);
+            strideJoseph[offset + tidx ]= d_strideJoseph[OFFSET + bidy*16+tidx] ;
+            break;
+        case 5:
+        case 6:
+            offset = 16*(tidy-5);
+            OFFSET = dimrecy*(tidy-5);
+            strideLine[offset + tidx ]= d_strideLine[OFFSET + bidy*16+tidx] ;
+            break;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    angle = angles_per_project[ bidy*16+tidy ] ;
+    cos_angle = cos(angle);
+    sin_angle = sin(angle);
+
+    // Fold the angle so that |cos_angle| >= 1/sqrt(2) (0.70710678f), i.e.
+    // the ray is traversed along its dominant axis; signs are adjusted so
+    // cos_angle stays positive.
+    if(fabs(cos_angle) > 0.70710678f ) {
+        if( cos_angle>0) {
+            cos_angle = cos(angle);
+            sin_angle = sin(angle);
+        }
+        else {
+            cos_angle = -cos(angle);
+            sin_angle = -sin(angle);
+        }
+    }
+    else {
+        if( sin_angle>0) {
+            cos_angle = sin(angle);
+            sin_angle = -cos(angle);
+        }
+        else {
+            cos_angle = -sin(angle);
+            sin_angle = cos(angle);
+        }
+    }
+    float res=0.0f;
+    float axis_corr = axis_position + corrections[ tidy ];
+    float axis = axis_position ;
+    float xpix = ( bidx*16+tidx )-cpu_offset_x;
+    float posx = axis*(1.0f-sin_angle/cos_angle ) +(xpix-(axis_corr) )/cos_angle ;
+
+    float shiftJ = sin_angle/cos_angle;
+    // NOTE(review): these initial values of x1/x2 are dead stores — both are
+    // overwritten inside the loops before any read.
+    float x1 = min(-sin_angle/cos_angle ,0.f);
+    float x2 = max(-sin_angle/cos_angle ,0.f);
+
+    // 1/cos_angle compensates the longer path length of an oblique ray.
+    float Area;
+    Area=1.0f/cos_angle;
+    int stlA, stlB , stlAJ, stlBJ ;
+    stlA=strideLine[16+tidy];
+    stlB=strideLine[tidy];
+    stlAJ=strideJoseph[16+tidy];
+    stlBJ=strideJoseph[tidy];
+
+    int beginA = beginPos[16+tidy ];
+    int beginB = beginPos[tidy ];
+    float add;
+    int l;
+
+    if(josephnoclip) {
+        // Fast path: the ray is known to stay inside the (padded) slice,
+        // no bounds test needed (the sampler clamps to edge anyway).
+        for(int j=0; j<dimslice; j++) { // j: Joseph
+            x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f;
+            x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f;
+            add = read_imagef(d_slice, sampler, (float2) (x1, x2)).x; // add = tex2D(texSlice, x1,x2);
+            res += add;
+            posx += shiftJ;
+        }
+    }
+    else {
+        for(int j=0; j<dimslice; j++) { // j: Joseph
+            x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f;
+            x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f;
+            // l is a 0/1 in-bounds predicate (branchless clipping).
+            l = (x1>=0.0f )*(x1<(dimslice+2))*( x2>=0.0f)*( x2<(dimslice+2) ) ;
+            add = read_imagef(d_slice, sampler, (float2) (x1, x2)).x; // add = tex2D(texSlice, x1,x2);
+            res += add*l;
+            posx += shiftJ;
+        }
+    }
+
+    // Guarded write: work-items mapped past the sinogram extent are dropped.
+    if((bidy*16 + tidy) < num_projections && (bidx*16 + tidx) < num_bins) {
+        res *= Area;
+        if (normalize)
+            res *= M_PI_F * 0.5f / num_projections;
+        d_Sino[dimrecx*(bidy*16 + tidy) + (bidx*16 + tidx)] = res;
+    }
+}
+#endif
+
+
+/*******************************************************************************/
+/********************* CPU VERSION (without textures) **************************/
+/*******************************************************************************/
+
+
+kernel void forward_kernel_cpu(
+    global float *d_Sino,
+    global float* d_slice,
+    int dimslice,
+    int num_bins,
+    global float* angles_per_project ,
+    float axis_position,
+    global float *d_axis_corrections,
+    global int* d_beginPos ,
+    global int* d_strideJoseph,
+    global int* d_strideLine ,
+    int num_projections,
+    int dimrecx,
+    int dimrecy,
+    float cpu_offset_x,
+    float cpu_offset_y,
+    int josephnoclip,
+    int normalize)
+{
+    // Forward projection (no-texture path): identical geometry to
+    // forward_kernel, but the slice is a plain buffer and the bilinear
+    // interpolation done by the texture sampler is re-implemented by hand.
+    // Assumes a 16x16 work-group (hard-coded 16s throughout).
+    // NOTE(review): cpu_offset_y is never read here either — confirm intended.
+
+    const int tidx = get_local_id(0);
+    const int bidx = get_group_id(0);
+    const int tidy = get_local_id(1);
+    const int bidy = get_group_id(1);
+    float angle;
+    float cos_angle,sin_angle ;
+
+    // Per-projection geometry staged in local memory (A/B halves).
+    local float corrections[16];
+    local int beginPos[16*2];
+    local int strideJoseph[16*2];
+    local int strideLine[16*2];
+
+    // thread will use corrections[tidy]
+    // All are read by first warp
+    // Rows 0..6 cooperatively fill the local arrays before the barrier.
+    int offset, OFFSET;
+    switch(tidy) {
+        case 0:
+            corrections[ tidx ]= d_axis_corrections[ bidy*16+tidx];
+            break;
+        case 1:
+        case 2:
+            offset = 16*(tidy-1);
+            OFFSET = dimrecy*(tidy-1);
+            beginPos [offset + tidx ]= d_beginPos[ OFFSET+ bidy*16+tidx] ;
+            break;
+        case 3:
+        case 4:
+            offset = 16*(tidy-3);
+            OFFSET = dimrecy*(tidy-3);
+            strideJoseph[offset + tidx ]= d_strideJoseph[OFFSET + bidy*16+tidx] ;
+            break;
+        case 5:
+        case 6:
+            offset = 16*(tidy-5);
+            OFFSET = dimrecy*(tidy-5);
+            strideLine[offset + tidx ]= d_strideLine[OFFSET + bidy*16+tidx] ;
+            break;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    angle = angles_per_project[ bidy*16+tidy ] ;
+    cos_angle = cos(angle);
+    sin_angle = sin(angle);
+
+    // Fold the angle so |cos_angle| >= 1/sqrt(2): traverse along the
+    // dominant axis, keeping cos_angle positive.
+    if(fabs(cos_angle) > 0.70710678f ) {
+        if( cos_angle>0) {
+            cos_angle = cos(angle);
+            sin_angle = sin(angle);
+        }
+        else {
+            cos_angle = -cos(angle);
+            sin_angle = -sin(angle);
+        }
+    }
+    else {
+        if( sin_angle>0) {
+            cos_angle = sin(angle);
+            sin_angle = -cos(angle);
+        }
+        else {
+            cos_angle = -sin(angle);
+            sin_angle = cos(angle);
+        }
+    }
+    float res=0.0f;
+    float axis_corr = axis_position + corrections[ tidy ];
+    float axis = axis_position ;
+    float xpix = ( bidx*16+tidx )-cpu_offset_x;
+    float posx = axis*(1.0f-sin_angle/cos_angle ) +(xpix-(axis_corr) )/cos_angle ;
+
+    float shiftJ = sin_angle/cos_angle;
+    // NOTE(review): dead stores — x1/x2 are overwritten in the loops.
+    float x1 = min(-sin_angle/cos_angle ,0.f);
+    float x2 = max(-sin_angle/cos_angle ,0.f);
+
+    // 1/cos_angle compensates the longer path length of an oblique ray.
+    float Area;
+    Area=1.0f/cos_angle;
+    int stlA, stlB , stlAJ, stlBJ ;
+    stlA=strideLine[16+tidy];
+    stlB=strideLine[tidy];
+    stlAJ=strideJoseph[16+tidy];
+    stlBJ=strideJoseph[tidy];
+
+    int beginA = beginPos[16+tidy ];
+    int beginB = beginPos[tidy ];
+    int l;
+
+    int ym, yp, xm, xp;
+    float yc, xc;
+    float val;
+    if(josephnoclip) {
+        for(int j=0; j<dimslice; j++) { // j: Joseph
+            // NOTE(review): +1.0f offset here vs +1.5f in the clipped branch
+            // and in the texture kernel — presumably accounts for the
+            // texel-center convention of the sampler; confirm consistency.
+            x1 = beginA +(posx)*stlA + (j)*stlAJ+1.0f;
+            x2 = beginB +(posx)*stlB + (j)*stlBJ+1.0f;
+            /*
+            Bilinear interpolation
+            */
+            // Clamp to the padded slice (dimslice+2 wide), then split into
+            // the integer cell corners (m/p) and fractional weights.
+            yc = fmin(fmax(x2, 0.0f), ((dimslice+2) - 1.0f)); // y_clipped
+            ym = (int) floor(yc); // y_minus
+            yp = (int) ceil(yc); // y_plus
+
+            xc = fmin(fmax(x1, 0.0f), ((dimslice+2) - 1.0f)); // x_clipped
+            xm = (int) floor(xc); // x_minus
+            xp = (int) ceil(xc); // x_plus
+
+            // Degenerate cases (exact integer coordinate) avoid 0-weight terms.
+            if ((ym == yp) && (xm == xp)) val = d_slice[ym*(dimslice+2) + xm];
+            else if (ym == yp) val = (d_slice[ym*(dimslice+2) + xm] * (xp - xc)) + (d_slice[ym*(dimslice+2) + xp] * (xc - xm));
+            else if (xm == xp) val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc)) + (d_slice[yp*(dimslice+2) + xm] * (yc - ym));
+            else val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc) * (xp - xc))
+                     + (d_slice[yp*(dimslice+2) + xm] * (yc - ym) * (xp - xc))
+                     + (d_slice[ym*(dimslice+2) + xp] * (yp - yc) * (xc - xm))
+                     + (d_slice[yp*(dimslice+2) + xp] * (yc - ym) * (xc - xm));
+            // ----------
+            res += val;
+            posx += shiftJ;
+        }
+    }
+    else {
+        for(int j=0; j<dimslice; j++) { // j: Joseph
+            x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f;
+            x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f;
+            // l is a 0/1 in-bounds predicate (branchless clipping).
+            l = (x1>=0.0f )*(x1<(dimslice+2))*( x2>=0.0f)*( x2<(dimslice+2) ) ;
+            /*
+            Bilinear interpolation
+            */
+            yc = fmin(fmax(x2, 0.0f), ((dimslice+2) - 1.0f)); // y_clipped
+            ym = (int) floor(yc); // y_minus
+            yp = (int) ceil(yc); // y_plus
+
+            xc = fmin(fmax(x1, 0.0f), ((dimslice+2) - 1.0f)); // x_clipped
+            xm = (int) floor(xc); // x_minus
+            xp = (int) ceil(xc); // x_plus
+
+            if ((ym == yp) && (xm == xp)) val = d_slice[ym*(dimslice+2) + xm];
+            else if (ym == yp) val = (d_slice[ym*(dimslice+2) + xm] * (xp - xc)) + (d_slice[ym*(dimslice+2) + xp] * (xc - xm));
+            else if (xm == xp) val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc)) + (d_slice[yp*(dimslice+2) + xm] * (yc - ym));
+            else val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc) * (xp - xc))
+                     + (d_slice[yp*(dimslice+2) + xm] * (yc - ym) * (xp - xc))
+                     + (d_slice[ym*(dimslice+2) + xp] * (yp - yc) * (xc - xm))
+                     + (d_slice[yp*(dimslice+2) + xp] * (yc - ym) * (xc - xm));
+            // ----------
+            res += val*l;
+            posx += shiftJ;
+        }
+    }
+
+    // Guarded write: work-items mapped past the sinogram extent are dropped.
+    if((bidy*16 + tidy) < num_projections && (bidx*16 + tidx) < num_bins) {
+        res *= Area;
+        if (normalize)
+            res *= M_PI_F * 0.5f / num_projections;
+        d_Sino[dimrecx*(bidy*16 + tidy) + (bidx*16 + tidx)] = res;
+    }
+}
diff --git a/src/silx/resources/opencl/sparse.cl b/src/silx/resources/opencl/sparse.cl
new file mode 100644
index 0000000..c99a0e9
--- /dev/null
+++ b/src/silx/resources/opencl/sparse.cl
@@ -0,0 +1,94 @@
+/*
+ * Copyright (C) 2016-2019 European Synchrotron Radiation Facility
+ * Grenoble, France
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef IMAGE_WIDTH
+ #error "Please define IMAGE_WIDTH parameter"
+#endif
+
+#ifndef DTYPE
+ #error "Please define DTYPE parameter"
+#endif
+
+#ifndef IDX_DTYPE
+ #error "Please define IDX_DTYPE parameter"
+#endif
+
+
+
+/**
+ * Densify a matrix from CSR format to "dense" 2D format.
+ * The input CSR data consists in 3 arrays: (data, ind, iptr).
+ * The output array is a 2D array of dimensions IMAGE_WIDTH * image_height.
+ *
+ * data: 1D array containing the nonzero data items.
+ * ind: 1D array containing the column indices of the nonzero data items.
+ * iptr: 1D array containing indirection indices, such that range
+ * [iptr[i], iptr[i+1]-1] of "data" and "ind" contain the relevant data
+ * of output row "i".
+ * output: 2D array containing the densified data.
+ * image_height: height (number of rows) of the output data.
+**/
+
+kernel void densify_csr(
+    const global DTYPE* data,
+    const global IDX_DTYPE* ind,
+    const global IDX_DTYPE* iptr,
+    global DTYPE* output,
+    int image_height
+)
+{
+    // One work-group densifies one CSR row into shared memory, then copies
+    // it to global memory. Work-items stride by the local size so any local
+    // size <= IMAGE_WIDTH works.
+    uint tid = get_local_id(0);
+    uint row_idx = get_global_id(1);
+    // NOTE(review): this early return happens before the barrier() calls
+    // below; if local size > IMAGE_WIDTH only part of the group returns,
+    // which is undefined behaviour in OpenCL — confirm launch configuration.
+    // Also note row_idx (uint) is compared against image_height (int).
+    if ((tid >= IMAGE_WIDTH) || (row_idx >= image_height)) return;
+
+    local DTYPE line[IMAGE_WIDTH];
+
+    // Memset
+    //~ #pragma unroll
+    for (int k = 0; tid+k < IMAGE_WIDTH; k += get_local_size(0)) {
+        // NOTE(review): this break is unreachable — the loop condition
+        // already guarantees tid+k < IMAGE_WIDTH.
+        if (tid+k >= IMAGE_WIDTH) break;
+        line[tid+k] = 0.0f;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+
+    // [iptr[row], iptr[row+1]) delimits this row's entries in data/ind.
+    uint start = iptr[row_idx], end = iptr[row_idx+1];
+    //~ #pragma unroll
+    for (int k = start; k < end; k += get_local_size(0)) {
+        // Current work group handles one line of the final array
+        // on the current line, write data[start+tid] at column index ind[start+tid]
+        if (k+tid < end)
+            line[ind[k+tid]] = data[k+tid];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // write the current line (shared mem) into the output array (global mem)
+    //~ #pragma unroll
+    for (int k = 0; tid+k < IMAGE_WIDTH; k += get_local_size(0)) {
+        output[row_idx*IMAGE_WIDTH + tid+k] = line[tid+k];
+        // NOTE(review): dead guard — the loop condition already ensures
+        // tid+k < IMAGE_WIDTH before each iteration.
+        if (k+tid >= IMAGE_WIDTH) return;
+    }
+}
diff --git a/src/silx/resources/opencl/statistics.cl b/src/silx/resources/opencl/statistics.cl
new file mode 100644
index 0000000..47d925b
--- /dev/null
+++ b/src/silx/resources/opencl/statistics.cl
@@ -0,0 +1,283 @@
+/*
+ * Project: Silx statistics calculation
+ *
+ *
+ *
+ * Copyright (C) 2012-2021 European Synchrotron Radiation Facility
+ * Grenoble, France
+ *
+ * Principal authors: J. Kieffer (kieffer@esrf.fr)
+ * Last revision: 17/05/2021
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+/**
+ * \file
+ *
+ * \brief OpenCL kernels for min, max, mean and std calculation
+ *
+ * This module provides two functions to perform the `map` and the `reduce`
+ * to be used with pyopencl reduction to calculate in a single pass the minimum,
+ * maximum, sum, count, mean and standard deviation for an array.
+ *
+ * So besides the reduction mechanism from pyopencl, this algorithm implements equations from
+ * https://dbs.ifi.uni-heidelberg.de/files/Team/eschubert/publications/SSDBM18-covariance-authorcopy.pdf
+ *
+ * let A and B be 2 disjoint partition of all elements
+ *
+ * Omega_A = sum_{i \in A}(omega_i) The sum of all weights
+ * V_A is the weighted sum of the signal over the partition
+ * VV_A is the weighted sum of squared deviations
+ *
+ * With this the mean is V / Omega and the variance equals VV / Omega.
+ *
+ * Reduction operator performs:
+ * Omega_{AB} = Omega_A + Omega_B
+ * V_{AB} = V_A + V_B
+ * VV{AB} = VV_A + VV_B + (Omega_A*V_B-Omega_B*V_A)² / (Omega_A * Omega_B * Omega_{AB})
+ *
+ * To avoid any numerical degradation, the doubleword library is used to perform all floating point operations.
+ *
+ */
+#include "for_eclipse.h"
+
+/* \brief read a value at given position and initialize the float8 for the reduction
+ *
+ * The float8 returned contains:
+ * s0: minimum value
+ * s1: maximum value
+ * s2: Omega_h count number of valid pixels
+ * s3: Omega_l error associated to the count
+ * s4: V_h sum of signal
+ * s5: V_l error associated to the sum of signal
+ * s6: VVh variance*count
+ * s7: VVl error associated to variance*count
+ *
+ */
+static inline float8 map_statistics(global float* data, int position)
+{
+ float value = data[position];
+ float8 result;
+
+ if (isfinite(value))
+ {
+ result = (float8)(value, value, 1.0f, 0.0f, value, 0.0f, 0.0f, 0.0f);
+ // min max cnt_h cnt_l sum_h sum_l M2_h M2_l
+ }
+ else
+ {
+ result = (float8)(FLT_MAX, -FLT_MAX, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
+ // min max cnt_h cnt_l sum_h sum_l M2_h M2_l
+ }
+ return result;
+}
+
+/* \brief reduction function associated to the statistics.
+ *
+ * this is described in:
+ * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
+ *
+ * The float8 used here contain contains:
+ * s0: minimum value
+ * s1: maximum value
+ * s2: count number of valid pixels
+ * s3: count (error associated to)
+ * s4: sum of valid pixels
+ * s5: sum (error associated to)
+ * s6: M=variance*(count-1)
+ * s7: M=variance*(count-1) (error associated to)
+ *
+ */
+
+static inline float8 reduce_statistics(float8 a, float8 b)
+{
+ float2 sum_a, sum_b, M_a, M_b, count_a, count_b;
+ float2 count, sum;
+ float2 M, delta, delta2, omega3;
+ //test on count
+ if (a.s2 == 0.0f)
+ {
+ return b;
+ }
+ else
+ {
+ count_a = (float2)(a.s2, a.s3);
+ sum_a = (float2)(a.s4, a.s5);
+ M_a = (float2)(a.s6, a.s7);
+ }
+ //test on count
+ if (b.s2 == 0.0f)
+ {
+ return a;
+ }
+ else
+ {
+ count_b = (float2)(b.s2, b.s3);
+ sum_b = (float2)(b.s4, b.s5);
+ M_b = (float2)(b.s6, b.s7);
+ }
+ // count = count_a + count_b
+ count = dw_plus_dw(count_a, count_b);
+ // sum = sum_a + sum_b
+ sum = dw_plus_dw(sum_a, sum_b);
+
+ // M2 = M_a + M_b + (Omega_A*V_B-Omega_B*V_A)² / (Omega_A * Omega_B * Omega_{AB})
+ M = dw_plus_dw(M_a, M_b);
+ delta = dw_plus_dw(dw_times_dw(count_b, sum_a),
+ -dw_times_dw(count_a, sum_b));
+ delta2 = dw_times_dw(delta, delta);
+ omega3 = dw_times_dw(count, dw_times_dw(count_a, count_b));
+ M = dw_plus_dw(M, dw_div_dw(delta2, omega3));
+
+ float8 result = (float8)(min(a.s0, b.s0), max(a.s1, b.s1),
+ count.s0, count.s1,
+ sum.s0, sum.s1,
+ M.s0, M.s1);
+ return result;
+}
+
+/* \brief reduction function associated to the statistics without compensated arithmetics.
+ *
+ * this is described in:
+ * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
+ *
+ * The float8 used here contain contains:
+ * s0: minimum value
+ * s1: maximum value
+ * s2: count number of valid pixels (major)
+ * s3: count number of valid pixels (minor)
+ * s4: sum of valid pixels (major)
+ * s5: sum of valid pixels (minor)
+ * s6: variance*count (major)
+ * s7: variance*count (minor)
+ *
+ */
+
+static inline float8 reduce_statistics_simple(float8 a, float8 b)
+{
+ float sum_a, sum_b, M_a, M_b, count_a, count_b;
+
+ //test on count
+ if (a.s2 == 0.0f)
+ {
+ return b;
+ }
+ else
+ {
+ count_a = a.s2;
+ sum_a = a.s4;
+ M_a = a.s6;
+ }
+ //test on count
+ if (b.s2 == 0.0f)
+ {
+ return a;
+ }
+ else
+ {
+ count_b = b.s2;
+ sum_b = b.s4;
+ M_b = b.s6;
+ }
+ float count = count_a + count_b;
+ float sum = sum_a + sum_b;
+ float delta = sum_a*count_b - sum_b*count_a;
+ float delta2 = delta * delta;
+ float M2 = M_a + M_b + delta2/(count*count_a*count_b);
+ float8 result = (float8)(min(a.s0, b.s0), max(a.s1, b.s1),
+ count, 0.0f,
+ sum, 0.0f,
+ M2, 0.0f);
+ return result;
+}
+
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+// unpack a double in two floats such as the sum of the two is the double number
+static inline float2 unpack_double(double inp){
+ float major = (float) inp;
+ float minor = (float) (inp - major);
+ return (float2)(major, minor);
+}
+
+// pack two floats into a double
+static inline double pack_double(float major, float minor){
+ return (double)major + (double)minor;
+}
+
+/* \brief reduction function associated to the statistics using double precision arithmetics
+ *
+ * this is described in:
+ * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
+ *
+ * The float8 used here contain contains:
+ * s0: minimum value
+ * s1: maximum value
+ * s2: count number of valid pixels (major)
+ * s3: count number of valid pixels (minor)
+ * s4: sum of valid pixels (major)
+ * s5: sum of valid pixels (minor)
+ * s6: variance*count (major)
+ * s7: variance*count (minor)
+ *
+ */
+
+static inline float8 reduce_statistics_double(float8 a, float8 b)
+{
+ double sum_a, sum_b, M_a, M_b, count_a, count_b;
+
+ //test on count
+ if (a.s2 == 0.0)
+ {
+ return b;
+ }
+ else
+ {
+ count_a = pack_double(a.s2, a.s3);
+ sum_a = pack_double(a.s4,a.s5);
+ M_a = pack_double(a.s6, a.s7);
+ }
+ //test on count
+ if (b.s2 == 0.0)
+ {
+ return a;
+ }
+ else
+ {
+ count_b = pack_double(b.s2, b.s3);
+ sum_b = pack_double(b.s4, b.s5);
+ M_b = pack_double(b.s6, b.s7);
+ }
+ double count = count_a + count_b;
+ double sum = sum_a + sum_b;
+ double delta = sum_a*count_b - sum_b*count_a;
+ double delta2 = delta * delta;
+ double M2 = M_a + M_b + delta2/(count*count_a*count_b);
+ float8 result = (float8)((float2)(min(a.s0, b.s0), max(a.s1, b.s1)),
+ unpack_double(count),
+ unpack_double( sum),
+ unpack_double( M2));
+ return result;
+}
+
+#endif \ No newline at end of file