summaryrefslogtreecommitdiff
path: root/silx/resources/opencl
diff options
context:
space:
mode:
Diffstat (limited to 'silx/resources/opencl')
-rw-r--r--silx/resources/opencl/addition.cl42
-rw-r--r--silx/resources/opencl/array_utils.cl73
-rw-r--r--silx/resources/opencl/backproj.cl232
-rw-r--r--silx/resources/opencl/backproj_helper.cl68
-rw-r--r--silx/resources/opencl/bitonic.cl569
-rw-r--r--silx/resources/opencl/codec/byte_offset.cl235
-rw-r--r--silx/resources/opencl/convolution.cl312
-rw-r--r--silx/resources/opencl/convolution_textures.cl374
-rw-r--r--silx/resources/opencl/doubleword.cl115
-rw-r--r--silx/resources/opencl/image/cast.cl181
-rw-r--r--silx/resources/opencl/image/histogram.cl178
-rw-r--r--silx/resources/opencl/image/map.cl85
-rw-r--r--silx/resources/opencl/image/max_min.cl207
-rw-r--r--silx/resources/opencl/kahan.cl143
-rw-r--r--silx/resources/opencl/linalg.cl88
-rw-r--r--silx/resources/opencl/medfilt.cl141
-rw-r--r--silx/resources/opencl/preprocess.cl567
-rw-r--r--silx/resources/opencl/proj.cl345
-rw-r--r--silx/resources/opencl/sparse.cl94
-rw-r--r--silx/resources/opencl/statistics.cl283
20 files changed, 0 insertions, 4332 deletions
diff --git a/silx/resources/opencl/addition.cl b/silx/resources/opencl/addition.cl
deleted file mode 100644
index 35d7996..0000000
--- a/silx/resources/opencl/addition.cl
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Project: SIFT: An algorithm for image alignement
- *
- * Copyright (C) 2013-2017 European Synchrotron Radiation Facility
- * Grenoble, France
- *
- * Principal authors: J. Kieffer (kieffer@esrf.fr)
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-
-// "Hello_world" kernel to test if OpenCL is actually working
-kernel void addition(global float* a,
- global float* b,
- global float* res,
- int N)
-{
- int i = get_global_id(0);
- if( i<N ){
- res[i] = a[i] + b[i];
- }
-}
diff --git a/silx/resources/opencl/array_utils.cl b/silx/resources/opencl/array_utils.cl
deleted file mode 100644
index 6f78921..0000000
--- a/silx/resources/opencl/array_utils.cl
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * 2D Memcpy for float* arrays,
- * replacing pyopencl "enqueue_copy" which does not work for rectangular copies.
- * ALL THE SIZES/OFFSETS ARE SPECIFIED IN PIXELS, NOT IN BYTES.
- * In the (x, y) convention, x is the fast index (as in CUDA).
- *
- * :param dst: destination array
- * :param src: source array
- * :param dst_width: width of the dst array
- * :param src_width: width of the src array
- * :param dst_offset: tuple with the offset (x, y) in the dst array
- * :param src_offset: tuple with the offset (x, y) in the src array
- * :param transfer_shape: shape of the transfer array in the form (x, y)
- *
- */
-kernel void cpy2d(
- global float* dst,
- global float* src,
- int dst_width,
- int src_width,
- int2 dst_offset,
- int2 src_offset,
- int2 transfer_shape)
-{
- int gidx = get_global_id(0), gidy = get_global_id(1);
- if (gidx < transfer_shape.x && gidy < transfer_shape.y) {
- dst[(dst_offset.y + gidy)*dst_width + (dst_offset.x + gidx)] = src[(src_offset.y + gidy)*src_width + (src_offset.x + gidx)];
- }
-}
-
-
-// Looks like cfloat_t and cfloat_mul are not working, yet specified in
-// pyopencl documentation. Here we are using float2 as in all available examples
-// #include <pyopencl-complex.h>
-// typedef cfloat_t complex;
-
-static inline float2 complex_mul(float2 a, float2 b) {
- float2 res = (float2) (0, 0);
- res.x = a.x * b.x - a.y * b.y;
- res.y = a.y * b.x + a.x * b.y;
- return res;
-}
-
-// arr2D *= arr1D (line by line, i.e along fast dim)
-kernel void inplace_complex_mul_2Dby1D(
- global float2* arr2D,
- global float2* arr1D,
- int width,
- int height)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- if ((x >= width) || (y >= height)) return;
- int i = y*width + x;
- arr2D[i] = complex_mul(arr2D[i], arr1D[x]);
-}
-
-
-// arr3D *= arr1D (along fast dim)
-kernel void inplace_complex_mul_3Dby1D(
- global float2* arr3D,
- global float2* arr1D,
- int width,
- int height,
- int depth)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- int z = get_global_id(2);
- if ((x >= width) || (y >= height) || (z >= depth)) return;
- int i = (z*height + y)*width + x;
- arr3D[i] = complex_mul(arr3D[i], arr1D[x]);
-}
diff --git a/silx/resources/opencl/backproj.cl b/silx/resources/opencl/backproj.cl
deleted file mode 100644
index da15131..0000000
--- a/silx/resources/opencl/backproj.cl
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Project: silx: filtered backprojection
- *
- * Copyright (C) 2016-2017 European Synchrotron Radiation Facility
- * Grenoble, France
- *
- * Principal authors: A. Mirone
- * P. Paleo
- *
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-
-/*******************************************************************************/
-/************************ GPU VERSION (with textures) **************************/
-/*******************************************************************************/
-
-#ifndef DONT_USE_TEXTURES
-kernel void backproj_kernel(
- int num_proj,
- int num_bins,
- float axis_position,
- global float *d_SLICE,
- read_only image2d_t d_sino,
- float gpu_offset_x,
- float gpu_offset_y,
- global float * d_cos_s, // precalculated cos(theta[i])
- global float* d_sin_s, // precalculated sin(theta[i])
- global float* d_axis_s, // array of axis positions (n_projs)
- local float* shared2) // 768B of local mem
-{
- const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
- const int tidx = get_local_id(0); //threadIdx.x;
- const int bidx = get_group_id(0); //blockIdx.x;
- const int tidy = get_local_id(1); //threadIdx.y;
- const int bidy = get_group_id(1); //blockIdx.y;
-
- local float sh_cos[256];
- local float sh_sin[256];
- local float sh_axis[256];
-
- float pcos, psin;
- float h0, h1, h2, h3;
- const float apos_off_x= gpu_offset_x - axis_position ;
- const float apos_off_y= gpu_offset_y - axis_position ;
- float acorr05;
- float res0 = 0, res1 = 0, res2 = 0, res3 = 0;
-
- const float bx00 = (32 * bidx + 2 * tidx + 0 + apos_off_x ) ;
- const float by00 = (32 * bidy + 2 * tidy + 0 + apos_off_y ) ;
-
- int read=0;
- for(int proj=0; proj<num_proj; proj++) {
- if(proj>=read) {
- barrier(CLK_LOCAL_MEM_FENCE);
- int ip = tidy*16+tidx;
- if( read+ip < num_proj) {
- sh_cos [ip] = d_cos_s[read+ip] ;
- sh_sin [ip] = d_sin_s[read+ip] ;
- sh_axis[ip] = d_axis_s[read+ip] ;
- }
- read=read+256; // 256=16*16 block size
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- pcos = sh_cos[256-read + proj] ;
- psin = sh_sin[256-read + proj] ;
-
- acorr05 = sh_axis[256 - read + proj] ;
-
- h0 = (acorr05 + bx00*pcos - by00*psin);
- h1 = (acorr05 + (bx00+0)*pcos - (by00+1)*psin);
- h2 = (acorr05 + (bx00+1)*pcos - (by00+0)*psin);
- h3 = (acorr05 + (bx00+1)*pcos - (by00+1)*psin);
-
- if(h0>=0 && h0<num_bins) res0 += read_imagef(d_sino, sampler, (float2) (h0 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h0 +0.5f,proj +0.5f);
- if(h1>=0 && h1<num_bins) res1 += read_imagef(d_sino, sampler, (float2) (h1 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h1 +0.5f,proj +0.5f);
- if(h2>=0 && h2<num_bins) res2 += read_imagef(d_sino, sampler, (float2) (h2 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h2 +0.5f,proj +0.5f);
- if(h3>=0 && h3<num_bins) res3 += read_imagef(d_sino, sampler, (float2) (h3 +0.5f,proj +0.5f)).x; // tex2D(texprojs,h3 +0.5f,proj +0.5f);
- }
- d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 0] = res0;
- d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 0] = res1;
- d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 1] = res2;
- d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 1] = res3;
-}
-#endif
-
-
-
-
-/*******************************************************************************/
-/********************* CPU VERSION (without textures) **************************/
-/*******************************************************************************/
-
-
-#define CLIP_MAX(x, N) (fmin(fmax(x, 0.0f), (N - 1.0f)))
-
-#define FLOORCEIL_x(x) {\
- xm = (int) floor(x);\
- xp = (int) ceil(x);\
-}
-
-#define ADJACENT_PIXELS_VALS(arr, Nx, y, xm, xp) ((float2) (arr[y*Nx+xm], arr[y*Nx+xp]))
-
-//Simple linear interpolator for working on the GPU
-static float linear_interpolation(float2 vals,
- float x,
- int xm,
- int xp)
-{
- if (xm == xp)
- return vals.s0;
- else
- return (vals.s0 * (xp - x)) + (vals.s1 * (x - xm));
-}
-
-/**
- *
- * Same kernel as backproj_kernel, but targets the CPU (no texture)
- *
-**/
-kernel void backproj_cpu_kernel(
- int num_proj,
- int num_bins,
- float axis_position,
- global float *d_SLICE,
- global float* d_sino,
- float gpu_offset_x,
- float gpu_offset_y,
- global float * d_cos_s, // precalculated cos(theta[i])
- global float * d_sin_s, // precalculated sin(theta[i])
- global float * d_axis_s, // array of axis positions (n_projs)
- local float* shared2) // 768B of local mem
-{
- const int tidx = get_local_id(0); //threadIdx.x;
- const int bidx = get_group_id(0); //blockIdx.x;
- const int tidy = get_local_id(1); //threadIdx.y;
- const int bidy = get_group_id(1); //blockIdx.y;
-
- local float sh_cos[256];
- local float sh_sin[256];
- local float sh_axis[256];
-
- float pcos, psin;
- float h0, h1, h2, h3;
- const float apos_off_x= gpu_offset_x - axis_position ;
- const float apos_off_y= gpu_offset_y - axis_position ;
- float acorr05;
- float res0 = 0, res1 = 0, res2 = 0, res3 = 0;
-
- const float bx00 = (32 * bidx + 2 * tidx + 0 + apos_off_x ) ;
- const float by00 = (32 * bidy + 2 * tidy + 0 + apos_off_y ) ;
-
- int read=0;
- for(int proj=0; proj<num_proj; proj++) {
- if(proj>=read) {
- barrier(CLK_LOCAL_MEM_FENCE);
- int ip = tidy*16+tidx;
- if( read+ip < num_proj) {
- sh_cos [ip] = d_cos_s[read+ip] ;
- sh_sin [ip] = d_sin_s[read+ip] ;
- sh_axis[ip] = d_axis_s[read+ip] ;
- }
- read=read+256; // 256=16*16 block size
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- pcos = sh_cos[256-read + proj] ;
- psin = sh_sin[256-read + proj] ;
-
- acorr05 = sh_axis[256 - read + proj] ;
-
- h0 = (acorr05 + bx00*pcos - by00*psin);
- h1 = (acorr05 + (bx00+0)*pcos - (by00+1)*psin);
- h2 = (acorr05 + (bx00+1)*pcos - (by00+0)*psin);
- h3 = (acorr05 + (bx00+1)*pcos - (by00+1)*psin);
-
-
- float x;
- int ym, xm, xp;
- ym = proj;
- float2 vals;
-
- if(h0>=0 && h0<num_bins) {
- x = CLIP_MAX(h0, num_bins);
- FLOORCEIL_x(x);
- vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp);
- res0 += linear_interpolation(vals, x, xm, xp);
- }
- if(h1>=0 && h1<num_bins) {
- x = CLIP_MAX(h1, num_bins);
- FLOORCEIL_x(x);
- vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp);
- res1 += linear_interpolation(vals, x, xm, xp);
- }
- if(h2>=0 && h2<num_bins) {
- x = CLIP_MAX(h2, num_bins);
- FLOORCEIL_x(x);
- vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp);
- res2 += linear_interpolation(vals, x, xm, xp);
- }
- if(h3>=0 && h3<num_bins) {
- x = CLIP_MAX(h3, num_bins);
- FLOORCEIL_x(x);
- vals = ADJACENT_PIXELS_VALS(d_sino, num_bins, ym, xm, xp);
- res3 += linear_interpolation(vals, x, xm, xp);
- }
- }
- d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 0] = res0;
- d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 0] = res1;
- d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+0) + bidx*32 + tidx*2 + 1] = res2;
- d_SLICE[ 32*get_num_groups(0)*(bidy*32+tidy*2+1) + bidx*32 + tidx*2 + 1] = res3;
-}
-
diff --git a/silx/resources/opencl/backproj_helper.cl b/silx/resources/opencl/backproj_helper.cl
deleted file mode 100644
index b1590f8..0000000
--- a/silx/resources/opencl/backproj_helper.cl
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Project: silx: backprojection helper functions
- *
- * Copyright (C) 2016-2017 European Synchrotron Radiation Facility
- * Grenoble, France
- *
- * Principal authors: P. Paleo
- * J. Kieffer (kieffer@esrf.fr)
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-
-kernel void mult( global float2* d_sino,
- global float2* d_filter,
- int num_bins,
- int num_projs)
-{
- int gid0 = get_global_id(0);
- int gid1 = get_global_id(1);
- if (gid0 < num_bins && gid1 < num_projs)
- {
- // d_sino[gid1*num_bins+gid0] *= d_filter[gid0];
- d_sino[gid1*num_bins+gid0].x *= d_filter[gid0].x;
- d_sino[gid1*num_bins+gid0].y *= d_filter[gid0].x;
- }
-}
-
-// copy only the real part of the valid data to the real array
-kernel void cpy2d_c2r(
- global float* d_sino,
- global float2* d_sino_complex,
- int num_bins,
- int num_projs,
- int fft_size)
-{
- int gid0 = get_global_id(0);
- int gid1 = get_global_id(1);
- if (gid0 < num_bins && gid1 < num_projs) {
- d_sino[gid1*num_bins+gid0] = d_sino_complex[gid1*fft_size+gid0].x;
- }
-}
-
-
-
-
-
-
-
diff --git a/silx/resources/opencl/bitonic.cl b/silx/resources/opencl/bitonic.cl
deleted file mode 100644
index 4096ce8..0000000
--- a/silx/resources/opencl/bitonic.cl
+++ /dev/null
@@ -1,569 +0,0 @@
-/*############################################################################
-# Sort elements within a vector by Matthew Scarpino,
-# Taken from his book "OpenCL in Action",
-# November 2011 ISBN 9781617290176
-# Original license for the code: "public domain"
-#
-# Originally this code is public domain. The MIT license has been added
-# by J. Kieffer (jerome.kieffer@esrf.eu) to provide a disclaimer.
-# J. Kieffer does not claim authorship of this code developed by .
-#
-# Copyright (c) 2011 Matthew Scarpino
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-#
-#############################################################################*/
-
-#define VECTOR_SORT_BOOK(input, dir) \
- comp = abs(input > shuffle(input, mask2)) ^ dir; \
- input = shuffle(input, comp * 2 + add2); \
- comp = abs(input > shuffle(input, mask1)) ^ dir; \
- input = shuffle(input, comp + add1); \
-
-
-#define VECTOR_SWAP_BOOK(in1, in2, dir) \
- input1 = in1; input2 = in2; \
- comp = (abs(input1 > input2) ^ dir) * 4 + add3; \
- in1 = shuffle2(input1, input2, comp); \
- in2 = shuffle2(input2, input1, comp); \
-
-
-// The _FILE extension correspond to the formula found in the "OpenCL in Action" supplementary files
-#define VECTOR_SORT_FILE(input, dir)\
- comp = (input < shuffle(input, mask2)) ^ dir;\
- input = shuffle(input, as_uint4(comp * 2 + add2));\
- comp = (input < shuffle(input, mask1)) ^ dir;\
- input = shuffle(input, as_uint4(comp + add1));\
-
-
-#define VECTOR_SWAP_FILE(input1, input2, dir)\
- temp = input1;\
- comp = ((input1 < input2) ^ dir) * 4 + add3;\
- input1 = shuffle2(input1, input2, as_uint4(comp));\
- input2 = shuffle2(input2, temp, as_uint4(comp));\
-
-
-
-// Functions to be called from an actual kernel.
-
-static float8 my_sort_file(uint local_id, uint group_id, uint local_size,
- float8 input, local float4 *l_data)
-{
- float4 input1, input2, temp;
- float8 output;
-
- int dir;
- uint id, size, stride;
- int4 comp;
-
- uint4 mask1 = (uint4)(1, 0, 3, 2);
- uint4 mask2 = (uint4)(2, 3, 0, 1);
- uint4 mask3 = (uint4)(3, 2, 1, 0);
-
- int4 add1 = (int4)(1, 1, 3, 3);
- int4 add2 = (int4)(2, 3, 2, 3);
- int4 add3 = (int4)(1, 2, 2, 3);
-
- // retrieve input data
- input1 = (float4)(input.s0, input.s1, input.s2, input.s3);
- input2 = (float4)(input.s4, input.s5, input.s6, input.s7);
-
- // Find global address
- id = local_id * 2;
-
- /* Sort input 1 - ascending */
- comp = input1 < shuffle(input1, mask1);
- input1 = shuffle(input1, as_uint4(comp + add1));
- comp = input1 < shuffle(input1, mask2);
- input1 = shuffle(input1, as_uint4(comp * 2 + add2));
- comp = input1 < shuffle(input1, mask3);
- input1 = shuffle(input1, as_uint4(comp + add3));
-
- /* Sort input 2 - descending */
- comp = input2 > shuffle(input2, mask1);
- input2 = shuffle(input2, as_uint4(comp + add1));
- comp = input2 > shuffle(input2, mask2);
- input2 = shuffle(input2, as_uint4(comp * 2 + add2));
- comp = input2 > shuffle(input2, mask3);
- input2 = shuffle(input2, as_uint4(comp + add3));
-
- /* Swap corresponding elements of input 1 and 2 */
- add3 = (int4)(4, 5, 6, 7);
- dir = - (int) (local_id % 2);
- temp = input1;
- comp = ((input1 < input2) ^ dir) * 4 + add3;
- input1 = shuffle2(input1, input2, as_uint4(comp));
- input2 = shuffle2(input2, temp, as_uint4(comp));
-
- /* Sort data and store in local memory */
- VECTOR_SORT_FILE(input1, dir);
- VECTOR_SORT_FILE(input2, dir);
- l_data[id] = input1;
- l_data[id+1] = input2;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- /* Create bitonic set */
- for(size = 2; size < local_size; size <<= 1) {
- dir = - (int) (local_id/size & 1) ;
-
- for(stride = size; stride > 1; stride >>= 1) {
- barrier(CLK_LOCAL_MEM_FENCE);
- id = local_id + (local_id/stride)*stride;
- VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir)
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
- id = local_id * 2;
- input1 = l_data[id];
- input2 = l_data[id+1];
- temp = input1;
- comp = ((input1 < input2) ^ dir) * 4 + add3;
- input1 = shuffle2(input1, input2, as_uint4(comp));
- input2 = shuffle2(input2, temp, as_uint4(comp));
- VECTOR_SORT_FILE(input1, dir);
- VECTOR_SORT_FILE(input2, dir);
- l_data[id] = input1;
- l_data[id+1] = input2;
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-
- /* Perform bitonic merge */
- dir = - (int) (group_id % 2);
- for(stride = local_size; stride > 1; stride >>= 1)
- {
- barrier(CLK_LOCAL_MEM_FENCE);
- id = local_id + (local_id/stride)*stride;
- VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir)
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- /* Perform final sort */
- id = local_id * 2;
- input1 = l_data[id];
- input2 = l_data[id+1];
- temp = input1;
- comp = ((input1 < input2) ^ dir) * 4 + add3;
- input1 = shuffle2(input1, input2, as_uint4(comp));
- input2 = shuffle2(input2, temp, as_uint4(comp));
- VECTOR_SORT_FILE(input1, dir);
- VECTOR_SORT_FILE(input2, dir);
-
- // setup output and return it
- output = (float8)(input1, input2);
- return output;
-}
-
-static float8 my_sort_book(uint local_id, uint group_id, uint local_size,
- float8 input, local float4 *l_data)
-{
- float4 input1, input2, temp;
- float8 output;
- uint4 comp, swap, mask1, mask2, add1, add2, add3;
- uint id, dir, size, stride;
- mask1 = (uint4)(1, 0, 3, 2);
- swap = (uint4)(0, 0, 1, 1);
- add1 = (uint4)(0, 0, 2, 2);
- mask2 = (uint4)(2, 3, 0, 1);
- add2 = (uint4)(0, 1, 0, 1);
- add3 = (uint4)(0, 1, 2, 3);
-
- // retrieve input data
- input1 = (float4)(input.s0, input.s1, input.s2, input.s3);
- input2 = (float4)(input.s4, input.s5, input.s6, input.s7);
-
- // Find global address
- id = local_id * 2;
-
- //Sort first vector
-
- comp = abs(input1 > shuffle(input1, mask1));
- input1 = shuffle(input1, comp ^ swap + add1);
- comp = abs(input1 > shuffle(input1, mask2));
- input1 = shuffle(input1, comp * 2 + add2);
- comp = abs(input1 > shuffle(input1, mask1));
- input1 = shuffle(input1, comp + add1);
-
- //Sort second vector
- comp = abs(input2 < shuffle(input2, mask1));
- input2 = shuffle(input2, comp ^ swap + add1);
- comp = abs(input2 < shuffle(input2, mask2));
- input2 = shuffle(input2, comp * 2 + add2);
- comp = abs(input2 < shuffle(input2, mask1));
- input2 = shuffle(input2, comp + add1);
-
- // Swap elements
- dir = local_id % 2;
- temp = input1;
- comp = (abs(input1 > input2) ^ dir) * 4 + add3;
- input1 = shuffle2(input1, input2, comp);
- input2 = shuffle2(input2, temp, comp);
- VECTOR_SORT_BOOK(input1, dir);
- VECTOR_SORT_BOOK(input2, dir);
- l_data[id] = input1;
- l_data[id+1] = input2;
-
- // Perform upper stages
- for(size = 2; size < local_size; size <<= 1)
- {
- dir = local_id/size & 1;
-
- //Perform lower stages
- for(stride = size; stride > 1; stride >>= 1)
- {
- barrier(CLK_LOCAL_MEM_FENCE);
- id = local_id + (local_id/stride)*stride;
- VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir)
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- id = local_id * 2;
- input1 = l_data[id];
- input2 = l_data[id+1];
- temp = input1;
- comp = (abs(input1 > input2) ^ dir) * 4 + add3;
- input1 = shuffle2(input1, input2, comp);
- input2 = shuffle2(input2, temp, comp);
- VECTOR_SORT_BOOK(input1, dir);
- VECTOR_SORT_BOOK(input2, dir);
- l_data[id] = input1;
- l_data[id+1] = input2;
- }
- dir = group_id % 2;
-
- // Perform bitonic merge
- for(stride = local_size; stride > 1; stride >>= 1)
- {
- barrier(CLK_LOCAL_MEM_FENCE);
- id = local_id + (local_id/stride)*stride;
- VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir)
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- id = local_id * 2;
- input1 = l_data[id]; input2 = l_data[id+1];
- temp = input1;
- comp = (abs(input1 > input2) ^ dir) * 4 + add3;
- input1 = shuffle2(input1, input2, comp);
- input2 = shuffle2(input2, temp, comp);
- VECTOR_SORT_BOOK(input1, dir);
- VECTOR_SORT_BOOK(input2, dir);
-
- // setup output and return it
- output = (float8)(input1, input2);
- return output;
-}
-
-
-
-//////////////
-// Kernels
-//////////////
-
-// Perform the sort on the whole array
-// dim0: wg=number_of_element/8
-
-kernel void bsort_all(global float4 *g_data,
- local float4 *l_data)
-{
- float4 input1, input2;
- float8 input, output;
- uint id, global_start;
- // Find global address
- id = get_local_id(0) * 2;
- global_start = get_group_id(0) * get_local_size(0) * 2 + id;
-
- input1 = g_data[global_start];
- input2 = g_data[global_start+1];
- input = (float8)(input1, input2);
- output = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0),
- input, l_data);
- input1 = (float4) (output.s0, output.s1, output.s2, output.s3);
- input2 = (float4) (output.s4, output.s5, output.s6, output.s7);
- g_data[global_start] = input1;
- g_data[global_start+1] = input2;
-}
-
-
-// Perform the sort along the horizontal axis of a 2D image
-// dim0 = y: wg=1
-// dim1 = x: wg=number_of_element/8
-kernel void bsort_horizontal(global float *g_data,
- local float4 *l_data)
-{
- float8 input, output;
- uint id, global_start, offset;
-
- // Find global address
- offset = get_global_size(1)*get_global_id(0)*8;
- id = get_local_id(1) * 8;
- global_start = offset + get_group_id(1) * get_local_size(1) * 8 + id;
-
- input = (float8)(g_data[global_start ],
- g_data[global_start + 1],
- g_data[global_start + 2],
- g_data[global_start + 3],
- g_data[global_start + 4],
- g_data[global_start + 5],
- g_data[global_start + 6],
- g_data[global_start + 7]);
-
- output = my_sort_file(get_local_id(1), get_group_id(1), get_local_size(1),
- input, l_data);
-
- g_data[global_start ] = output.s0;
- g_data[global_start + 1] = output.s1;
- g_data[global_start + 2] = output.s2;
- g_data[global_start + 3] = output.s3;
- g_data[global_start + 4] = output.s4;
- g_data[global_start + 5] = output.s5;
- g_data[global_start + 6] = output.s6;
- g_data[global_start + 7] = output.s7;
-}
-
-
-// Perform the sort along the vertical axis
-// dim0 = y: wg=number_of_element/8
-// dim1 = x: wg=1
-// check if transposing +bsort_horizontal is not more efficient ?
-
-kernel void bsort_vertical(global float *g_data,
- local float4 *l_data)
-{
- // we need to read 8 float position along the vertical axis
- float8 input, output;
- uint id, global_start, padding;
-
- // Find global address
- padding = get_global_size(1);
- id = get_local_id(0) * 8 * padding + get_global_id(1);
- global_start = get_group_id(0) * get_local_size(0) * 8 * padding + id;
-
- input = (float8)(g_data[global_start ],
- g_data[global_start + padding ],
- g_data[global_start + 2*padding],
- g_data[global_start + 3*padding],
- g_data[global_start + 4*padding],
- g_data[global_start + 5*padding],
- g_data[global_start + 6*padding],
- g_data[global_start + 7*padding]);
-
- output = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0),
- input, l_data);
- g_data[global_start ] = output.s0;
- g_data[global_start + padding ] = output.s1;
- g_data[global_start + 2*padding ] = output.s2;
- g_data[global_start + 3*padding ] = output.s3;
- g_data[global_start + 4*padding ] = output.s4;
- g_data[global_start + 5*padding ] = output.s5;
- g_data[global_start + 6*padding ] = output.s6;
- g_data[global_start + 7*padding ] = output.s7;
-}
-
-
-//Tested working reference kernel frm the book. This only works under Linux
-kernel void bsort_book(global float4 *g_data,
- local float4 *l_data) {
- float4 input1, input2, temp;
- uint4 comp, swap, mask1, mask2, add1, add2, add3;
- uint id, dir, global_start, size, stride;
- mask1 = (uint4)(1, 0, 3, 2);
- swap = (uint4)(0, 0, 1, 1);
- add1 = (uint4)(0, 0, 2, 2);
- mask2 = (uint4)(2, 3, 0, 1);
- add2 = (uint4)(0, 1, 0, 1);
- add3 = (uint4)(0, 1, 2, 3);
-
- // Find global address
- id = get_local_id(0) * 2;
- global_start = get_group_id(0) * get_local_size(0) * 2 + id;
-
- //Sort first vector
- input1 = g_data[global_start];
- input2 = g_data[global_start+1];
- comp = abs(input1 > shuffle(input1, mask1));
- input1 = shuffle(input1, comp ^ swap + add1);
- comp = abs(input1 > shuffle(input1, mask2));
- input1 = shuffle(input1, comp * 2 + add2);
- comp = abs(input1 > shuffle(input1, mask1));
- input1 = shuffle(input1, comp + add1);
-
- //Sort second vector
- comp = abs(input2 < shuffle(input2, mask1));
- input2 = shuffle(input2, comp ^ swap + add1);
- comp = abs(input2 < shuffle(input2, mask2));
- input2 = shuffle(input2, comp * 2 + add2);
- comp = abs(input2 < shuffle(input2, mask1));
- input2 = shuffle(input2, comp + add1);
-
- // Swap elements
- dir = get_local_id(0) % 2;
- temp = input1;
- comp = (abs(input1 > input2) ^ dir) * 4 + add3;
- input1 = shuffle2(input1, input2, comp);
- input2 = shuffle2(input2, temp, comp);
- VECTOR_SORT_BOOK(input1, dir);
- VECTOR_SORT_BOOK(input2, dir);
- l_data[id] = input1;
- l_data[id+1] = input2;
-
- // Perform upper stages
- for(size = 2; size < get_local_size(0); size <<= 1) {
- dir = get_local_id(0)/size & 1;
-
- //Perform lower stages
- for(stride = size; stride > 1; stride >>= 1) {
- barrier(CLK_LOCAL_MEM_FENCE);
- id = get_local_id(0) +
- (get_local_id(0)/stride)*stride;
- VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir)
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- id = get_local_id(0) * 2;
- input1 = l_data[id];
- input2 = l_data[id+1];
- temp = input1;
- comp = (abs(input1 > input2) ^ dir) * 4 + add3;
- input1 = shuffle2(input1, input2, comp);
- input2 = shuffle2(input2, temp, comp);
- VECTOR_SORT_BOOK(input1, dir);
- VECTOR_SORT_BOOK(input2, dir);
- l_data[id] = input1;
- l_data[id+1] = input2;
- }
- dir = get_group_id(0) % 2;
- // Perform bitonic merge
- for(stride = get_local_size(0); stride > 1; stride >>= 1) {
- barrier(CLK_LOCAL_MEM_FENCE);
- id = get_local_id(0) +
- (get_local_id(0)/stride)*stride;
- VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir)
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- id = get_local_id(0) * 2;
- input1 = l_data[id]; input2 = l_data[id+1];
- temp = input1;
- comp = (abs(input1 > input2) ^ dir) * 4 + add3;
- input1 = shuffle2(input1, input2, comp);
- input2 = shuffle2(input2, temp, comp);
- VECTOR_SORT_BOOK(input1, dir);
- VECTOR_SORT_BOOK(input2, dir);
- g_data[global_start] = input1;
- g_data[global_start+1] = input2;
- }
-
-//Tested working reference kernel from the addition files. This only works under any operating system
-/* Perform initial sort */
-kernel void bsort_file(global float4 *g_data, local float4 *l_data) {
-
- int dir;
- uint id, global_start, size, stride;
- float4 input1, input2, temp;
- int4 comp;
-
- uint4 mask1 = (uint4)(1, 0, 3, 2);
- uint4 mask2 = (uint4)(2, 3, 0, 1);
- uint4 mask3 = (uint4)(3, 2, 1, 0);
-
- int4 add1 = (int4)(1, 1, 3, 3);
- int4 add2 = (int4)(2, 3, 2, 3);
- int4 add3 = (int4)(1, 2, 2, 3);
-
- id = get_local_id(0) * 2;
- global_start = get_group_id(0) * get_local_size(0) * 2 + id;
-
- input1 = g_data[global_start];
- input2 = g_data[global_start+1];
-
- /* Sort input 1 - ascending */
- comp = input1 < shuffle(input1, mask1);
- input1 = shuffle(input1, as_uint4(comp + add1));
- comp = input1 < shuffle(input1, mask2);
- input1 = shuffle(input1, as_uint4(comp * 2 + add2));
- comp = input1 < shuffle(input1, mask3);
- input1 = shuffle(input1, as_uint4(comp + add3));
-
- /* Sort input 2 - descending */
- comp = input2 > shuffle(input2, mask1);
- input2 = shuffle(input2, as_uint4(comp + add1));
- comp = input2 > shuffle(input2, mask2);
- input2 = shuffle(input2, as_uint4(comp * 2 + add2));
- comp = input2 > shuffle(input2, mask3);
- input2 = shuffle(input2, as_uint4(comp + add3));
-
- /* Swap corresponding elements of input 1 and 2 */
- add3 = (int4)(4, 5, 6, 7);
- dir = - (int)(get_local_id(0) % 2);
- temp = input1;
- comp = ((input1 < input2) ^ dir) * 4 + add3;
- input1 = shuffle2(input1, input2, as_uint4(comp));
- input2 = shuffle2(input2, temp, as_uint4(comp));
-
- /* Sort data and store in local memory */
- VECTOR_SORT_FILE(input1, dir);
- VECTOR_SORT_FILE(input2, dir);
- l_data[id] = input1;
- l_data[id+1] = input2;
-
- /* Create bitonic set */
- for(size = 2; size < get_local_size(0); size <<= 1) {
- dir = - (int)(get_local_id(0)/size & 1);
-
- for(stride = size; stride > 1; stride >>= 1) {
- barrier(CLK_LOCAL_MEM_FENCE);
- id = get_local_id(0) + (get_local_id(0)/stride)*stride;
- VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir)
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
- id = get_local_id(0) * 2;
- input1 = l_data[id]; input2 = l_data[id+1];
- temp = input1;
- comp = ((input1 < input2) ^ dir) * 4 + add3;
- input1 = shuffle2(input1, input2, as_uint4(comp));
- input2 = shuffle2(input2, temp, as_uint4(comp));
- VECTOR_SORT_FILE(input1, dir);
- VECTOR_SORT_FILE(input2, dir);
- l_data[id] = input1;
- l_data[id+1] = input2;
- }
-
- /* Perform bitonic merge */
- dir = - (int)(get_group_id(0) % 2);
- for(stride = get_local_size(0); stride > 1; stride >>= 1) {
- barrier(CLK_LOCAL_MEM_FENCE);
- id = get_local_id(0) + (get_local_id(0)/stride)*stride;
- VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir)
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- /* Perform final sort */
- id = get_local_id(0) * 2;
- input1 = l_data[id]; input2 = l_data[id+1];
- temp = input1;
- comp = ((input1 < input2) ^ dir) * 4 + add3;
- input1 = shuffle2(input1, input2, as_uint4(comp));
- input2 = shuffle2(input2, temp, as_uint4(comp));
- VECTOR_SORT_FILE(input1, dir);
- VECTOR_SORT_FILE(input2, dir);
- g_data[global_start] = input1;
- g_data[global_start+1] = input2;
-}
-
diff --git a/silx/resources/opencl/codec/byte_offset.cl b/silx/resources/opencl/codec/byte_offset.cl
deleted file mode 100644
index 56a24c4..0000000
--- a/silx/resources/opencl/codec/byte_offset.cl
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
- * Project: SILX: A data analysis tool-kit
- *
- * Copyright (C) 2017 European Synchrotron Radiation Facility
- * Grenoble, France
- *
- * Principal authors: J. Kieffer (kieffer@esrf.fr)
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/* To decompress CBF byte-offset compressed in parallel on GPU one needs to:
- * - Set all values in mask and exception counter to zero.
- * - Mark regions with exceptions and set values without exception.
- * This generates the values (zeros for exceptions), the exception mask,
- * counts the number of exception region and provides a start position for
- * each exception.
- * - Treat exceptions. For this, one thread in a workgoup treats a complete
- * masked region in a serial fashion. All regions are treated in parallel.
- * Values written at this stage are marked in the mask with -1.
- * - Double scan: inclusive cum sum for values, exclusive cum sum to generate
- * indices in output array. Values with mask = 1 are considered as 0.
- * - Compact and copy output by removing duplicated values in exceptions.
- */
-
-kernel void mark_exceptions(global char* raw,
- int size,
- int full_size,
- global int* mask,
- global int* values,
- global int* cnt,
- global int* exc)
-{
- int gid;
- gid = get_global_id(0);
- if (gid<size)
- {
- int value, position;
- value = raw[gid];
- if (value == -128)
- {
- int maxi;
- values[gid] = 0;
- position = atomic_inc(cnt);
- exc[position] = gid;
- maxi = size - 1;
- mask[gid] = 1;
- mask[min(maxi, gid+1)] = 1;
- mask[min(maxi, gid+2)] = 1;
-
- if (((int) raw[min(gid+1, maxi)] == 0) &&
- ((int) raw[min(gid+2, maxi)] == -128))
- {
- mask[min(maxi, gid+3)] = 1;
- mask[min(maxi, gid+4)] = 1;
- mask[min(maxi, gid+5)] = 1;
- mask[min(maxi, gid+6)] = 1;
- }
- }
- else
- { // treat simple data
-
- values[gid] = value;
- }
- }
- else if (gid<full_size)
- {
- mask[gid]=1;
- values[gid] = 0;
- }
-}
-
-//run with WG=1, as may as exceptions
-kernel void treat_exceptions(global char* raw, //raw compressed stream
- int size, //size of the raw compressed stream
- global int* mask, //tells if the value is masked
- global int* exc, //array storing the position of the start of exception zones
- global int* values)// stores decompressed values.
-{
- int gid = get_global_id(0);
- int inp_pos = exc[gid];
- if ((inp_pos<=0) || ((int)mask[inp_pos - 1] == 0))
- {
- int value, is_masked, next_value, inc;
- is_masked = (mask[inp_pos] != 0);
- while ((is_masked) && (inp_pos<size))
- {
- value = (int) raw[inp_pos];
- if (value == -128)
- { // this correspond to 16 bits exception
- uchar low_byte = raw[inp_pos+1];
- char high_byte = raw[inp_pos+2] ;
- next_value = high_byte<<8 | low_byte;
- if (next_value == -32768)
- { // this correspond to 32 bits exception
- uchar low_byte1 = raw[inp_pos+3],
- low_byte2 = raw[inp_pos+4],
- low_byte3 = raw[inp_pos+5];
- char high_byte4 = raw[inp_pos+6] ;
- value = high_byte4<<24 | low_byte3<<16 | low_byte2<<8 | low_byte1;
- inc = 7;
- }
- else
- {
- value = next_value;
- inc = 3;
- }
- }
- else
- {
- inc = 1;
- }
- values[inp_pos] = value;
- mask[inp_pos] = -1; // mark the processed data as valid in the mask
- inp_pos += inc;
- is_masked = (mask[inp_pos] != 0);
- }
- }
-}
-
-// copy the values of the elements to definitive position
-kernel void copy_result_int(global int* values,
- global int* indexes,
- int in_size,
- int out_size,
- global int* output
- )
-{
- int gid = get_global_id(0);
- if (gid < in_size)
- {
- int current = max(indexes[gid], 0),
- next = (gid >= (in_size - 1)) ? in_size + 1 : indexes[gid + 1];
- //we keep always the last element
- if ((current <= out_size) && (current < next))
- {
- output[current] = values[gid];
- }
- }
-}
-
-// copy the values of the elements to definitive position
-kernel void copy_result_float(global int* values,
- global int* indexes,
- int in_size,
- int out_size,
- global float* output
- )
-{
- int gid = get_global_id(0);
- if (gid<in_size)
- {
- int current = max(indexes[gid], 0),
- next = (gid >= (in_size - 1)) ? in_size + 1 : indexes[gid + 1];
- if ((current < out_size) && (current < next))
- {
- output[current] = (float) values[gid];
- }
- }
-}
-
-
-// combined memset for all arrays used for Byte Offset decompression
-kernel void byte_offset_memset(global char* raw,
- global int* mask,
- global int* index,
- global int* result,
- int full_size,
- int actual_size
- )
-{
- int gid = get_global_id(0);
- if (gid < full_size)
- {
- raw[gid] = 0;
- index[gid] = 0;
- result[gid] = 0;
- if (gid<actual_size)
- {
- mask[gid] = 0;
- }
- else
- {
- mask[gid] = 1;
- }
-
- }
-}
-
-
-//Simple memset kernel for char arrays
-kernel void fill_char_mem(global char* ary,
- int size,
- char pattern,
- int start_at)
-{
- int gid = get_global_id(0);
- if ((gid >= start_at) && (gid < size))
- {
- ary[gid] = pattern;
- }
-}
-
-//Simple memset kernel for int arrays
-kernel void fill_int_mem(global int* ary,
- int size,
- int pattern,
- int start_at)
-{
- int gid = get_global_id(0);
- if ((gid >= start_at) && (gid < size))
- {
- ary[gid] = pattern;
- }
-}
-
diff --git a/silx/resources/opencl/convolution.cl b/silx/resources/opencl/convolution.cl
deleted file mode 100644
index 629b8fc..0000000
--- a/silx/resources/opencl/convolution.cl
+++ /dev/null
@@ -1,312 +0,0 @@
-#define MAX_CONST_SIZE 16384
-
-/******************************************************************************/
-/**************************** Macros ******************************************/
-/******************************************************************************/
-
-// Get the center index of the filter,
-// and the "half-Left" and "half-Right" lengths.
-// In the case of an even-sized filter, the center is shifted to the left.
-#define GET_CENTER_HL(hlen){\
- if (hlen & 1) {\
- c = hlen/2;\
- hL = c;\
- hR = c;\
- }\
- else {\
- c = hlen/2 - 1;\
- hL = c;\
- hR = c+1;\
- }\
-}\
-
-// Boundary handling modes
-#define CONV_MODE_REFLECT 0 // cba|abcd|dcb
-#define CONV_MODE_NEAREST 1 // aaa|abcd|ddd
-#define CONV_MODE_WRAP 2 // bcd|abcd|abc
-#define CONV_MODE_CONSTANT 3 // 000|abcd|000
-#ifndef USED_CONV_MODE
- #define USED_CONV_MODE CONV_MODE_NEAREST
-#endif
-
-#define CONV_PERIODIC_IDX_X int idx_x = gidx - c + jx; if (idx_x < 0) idx_x += Nx; if (idx_x >= Nx) idx_x -= Nx;
-#define CONV_PERIODIC_IDX_Y int idx_y = gidy - c + jy; if (idx_y < 0) idx_y += Ny; if (idx_y >= Ny) idx_y -= Ny;
-#define CONV_PERIODIC_IDX_Z int idx_z = gidz - c + jz; if (idx_z < 0) idx_z += Nz; if (idx_z >= Nz) idx_z -= Nz;
-
-#define CONV_NEAREST_IDX_X int idx_x = clamp((int) (gidx - c + jx), 0, Nx-1);
-#define CONV_NEAREST_IDX_Y int idx_y = clamp((int) (gidy - c + jy), 0, Ny-1);
-#define CONV_NEAREST_IDX_Z int idx_z = clamp((int) (gidz - c + jz), 0, Nz-1);
-
-#define CONV_REFLECT_IDX_X int idx_x = gidx - c + jx; if (idx_x < 0) idx_x = -idx_x-1; if (idx_x >= Nx) idx_x = Nx-(idx_x-(Nx-1));
-#define CONV_REFLECT_IDX_Y int idx_y = gidy - c + jy; if (idx_y < 0) idx_y = -idx_y-1; if (idx_y >= Ny) idx_y = Ny-(idx_y-(Ny-1));
-#define CONV_REFLECT_IDX_Z int idx_z = gidz - c + jz; if (idx_z < 0) idx_z = -idx_z-1; if (idx_z >= Nz) idx_z = Nz-(idx_z-(Nz-1));
-
-
-#if USED_CONV_MODE == CONV_MODE_REFLECT
- #define CONV_IDX_X CONV_REFLECT_IDX_X
- #define CONV_IDX_Y CONV_REFLECT_IDX_Y
- #define CONV_IDX_Z CONV_REFLECT_IDX_Z
-#elif USED_CONV_MODE == CONV_MODE_NEAREST
- #define CONV_IDX_X CONV_NEAREST_IDX_X
- #define CONV_IDX_Y CONV_NEAREST_IDX_Y
- #define CONV_IDX_Z CONV_NEAREST_IDX_Z
-#elif USED_CONV_MODE == CONV_MODE_WRAP
- #define CONV_IDX_X CONV_PERIODIC_IDX_X
- #define CONV_IDX_Y CONV_PERIODIC_IDX_Y
- #define CONV_IDX_Z CONV_PERIODIC_IDX_Z
-#elif USED_CONV_MODE == CONV_MODE_CONSTANT
- #error "constant not implemented yet"
-#else
- #error "Unknown convolution mode"
-#endif
-
-
-
-// Image access patterns
-#define READ_IMAGE_1D_X input[(gidz*Ny + gidy)*Nx + idx_x]
-#define READ_IMAGE_1D_Y input[(gidz*Ny + idx_y)*Nx + gidx]
-#define READ_IMAGE_1D_Z input[(idx_z*Ny + gidy)*Nx + gidx]
-
-#define READ_IMAGE_2D_XY input[(gidz*Ny + idx_y)*Nx + idx_x]
-#define READ_IMAGE_2D_XZ input[(idx_z*Ny + gidy)*Nx + idx_x]
-#define READ_IMAGE_2D_YZ input[(idx_z*Ny + idx_y)*Nx + gidx]
-
-#define READ_IMAGE_3D_XYZ input[(idx_z*Ny + idx_y)*Nx + idx_x]
-
-
-
-/******************************************************************************/
-/**************************** 1D Convolution **********************************/
-/******************************************************************************/
-
-
-// Convolution with 1D kernel along axis "X" (fast dimension)
-// Works for batched 1D on 2D and batched 2D on 3D, along axis "X".
-__kernel void convol_1D_X(
- const __global float * input,
- __global float * output,
- __global float * filter,
- int L, // filter size
- int Nx, // input/output number of columns
- int Ny, // input/output number of rows
- int Nz // input/output depth
-)
-{
- uint gidx = get_global_id(0);
- uint gidy = get_global_id(1);
- uint gidz = get_global_id(2);
- if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
- int c, hL, hR;
- GET_CENTER_HL(L);
- float sum = 0.0f;
-
- for (int jx = 0; jx <= hR+hL; jx++) {
- CONV_IDX_X; // Get index "x"
- sum += READ_IMAGE_1D_X * filter[L-1 - jx];
- }
- output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-
-
-// Convolution with 1D kernel along axis "Y"
-// Works for batched 1D on 2D and batched 2D on 3D, along axis "Y".
-__kernel void convol_1D_Y(
- const __global float * input,
- __global float * output,
- __global float * filter,
- int L, // filter size
- int Nx, // input/output number of columns
- int Ny, // input/output number of rows
- int Nz // input/output depth
-)
-{
- uint gidx = get_global_id(0);
- uint gidy = get_global_id(1);
- uint gidz = get_global_id(2);
- if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
- int c, hL, hR;
- GET_CENTER_HL(L);
- float sum = 0.0f;
-
- for (int jy = 0; jy <= hR+hL; jy++) {
- CONV_IDX_Y; // Get index "y"
- sum += READ_IMAGE_1D_Y * filter[L-1 - jy];
- }
- output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-
-
-// Convolution with 1D kernel along axis "Z"
-// Works for batched 1D on 2D and batched 2D on 3D, along axis "Z".
-__kernel void convol_1D_Z(
- const __global float * input,
- __global float * output,
- __global float * filter,
- int L, // filter size
- int Nx, // input/output number of columns
- int Ny, // input/output number of rows
- int Nz // input/output depth
-)
-{
- uint gidx = get_global_id(0);
- uint gidy = get_global_id(1);
- uint gidz = get_global_id(2);
- if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
- int c, hL, hR;
- GET_CENTER_HL(L);
- float sum = 0.0f;
-
- for (int jz = 0; jz <= hR+hL; jz++) {
- CONV_IDX_Z; // Get index "z"
- sum += READ_IMAGE_1D_Z * filter[L-1 - jz];
- }
- output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-
-
-/******************************************************************************/
-/**************************** 2D Convolution **********************************/
-/******************************************************************************/
-
-// Convolution with 2D kernel
-// Works for batched 2D on 3D.
-__kernel void convol_2D_XY(
- const __global float * input,
- __global float * output,
- __global float * filter,
- int Lx, // filter number of columns,
- int Ly, // filter number of rows,
- int Nx, // input/output number of columns
- int Ny, // input/output number of rows
- int Nz // input/output depth
-)
-{
- uint gidx = get_global_id(0);
- uint gidy = get_global_id(1);
- uint gidz = get_global_id(2);
- if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
- int c, hL, hR;
- GET_CENTER_HL(Lx);
- float sum = 0.0f;
-
- for (int jy = 0; jy <= hR+hL; jy++) {
- CONV_IDX_Y; // Get index "y"
- for (int jx = 0; jx <= hR+hL; jx++) {
- CONV_IDX_X; // Get index "x"
- sum += READ_IMAGE_2D_XY * filter[(Ly-1-jy)*Lx + (Lx-1 - jx)];
- }
- }
- output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-
-
-// Convolution with 2D kernel
-// Works for batched 2D on 3D.
-__kernel void convol_2D_XZ(
- const __global float * input,
- __global float * output,
- __global float * filter,
- int Lx, // filter number of columns,
- int Lz, // filter number of rows,
- int Nx, // input/output number of columns
- int Ny, // input/output number of rows
- int Nz // input/output depth
-)
-{
- uint gidx = get_global_id(0);
- uint gidy = get_global_id(1);
- uint gidz = get_global_id(2);
- if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
- int c, hL, hR;
- GET_CENTER_HL(Lx);
- float sum = 0.0f;
-
- for (int jz = 0; jz <= hR+hL; jz++) {
- CONV_IDX_Z; // Get index "z"
- for (int jx = 0; jx <= hR+hL; jx++) {
- CONV_IDX_X; // Get index "x"
- sum += READ_IMAGE_2D_XZ * filter[(Lz-1-jz)*Lx + (Lx-1 - jx)];
- }
- }
- output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-
-
-// Convolution with 2D kernel
-// Works for batched 2D on 3D.
-__kernel void convol_2D_YZ(
- const __global float * input,
- __global float * output,
- __global float * filter,
- int Ly, // filter number of columns,
- int Lz, // filter number of rows,
- int Nx, // input/output number of columns
- int Ny, // input/output number of rows
- int Nz // input/output depth
-)
-{
- uint gidx = get_global_id(0);
- uint gidy = get_global_id(1);
- uint gidz = get_global_id(2);
- if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
- int c, hL, hR;
- GET_CENTER_HL(Ly);
- float sum = 0.0f;
-
- for (int jz = 0; jz <= hR+hL; jz++) {
- CONV_IDX_Z; // Get index "z"
- for (int jy = 0; jy <= hR+hL; jy++) {
- CONV_IDX_Y; // Get index "y"
- sum += READ_IMAGE_2D_YZ * filter[(Lz-1-jz)*Ly + (Ly-1 - jy)];
- }
- }
- output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-
-
-
-/******************************************************************************/
-/**************************** 3D Convolution **********************************/
-/******************************************************************************/
-
-// Convolution with 3D kernel
-__kernel void convol_3D_XYZ(
- const __global float * input,
- __global float * output,
- __global float * filter,
- int Lx, // filter number of columns,
- int Ly, // filter number of rows,
- int Lz, // filter number of rows,
- int Nx, // input/output number of columns
- int Ny, // input/output number of rows
- int Nz // input/output depth
-)
-{
- uint gidx = get_global_id(0);
- uint gidy = get_global_id(1);
- uint gidz = get_global_id(2);
- if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
- int c, hL, hR;
- GET_CENTER_HL(Lx);
- float sum = 0.0f;
-
- for (int jz = 0; jz <= hR+hL; jz++) {
- CONV_IDX_Z; // Get index "z"
- for (int jy = 0; jy <= hR+hL; jy++) {
- CONV_IDX_Y; // Get index "y"
- for (int jx = 0; jx <= hR+hL; jx++) {
- CONV_IDX_X; // Get index "x"
- sum += READ_IMAGE_3D_XYZ * filter[((Lz-1-jz)*Ly + (Ly-1-jy))*Lx + (Lx-1 - jx)];
- }
- }
- }
- output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-
diff --git a/silx/resources/opencl/convolution_textures.cl b/silx/resources/opencl/convolution_textures.cl
deleted file mode 100644
index 517a67c..0000000
--- a/silx/resources/opencl/convolution_textures.cl
+++ /dev/null
@@ -1,374 +0,0 @@
-/******************************************************************************/
-/**************************** Macros ******************************************/
-/******************************************************************************/
-
-// Error handling
-#ifndef IMAGE_DIMS
- #error "IMAGE_DIMS must be defined"
-#endif
-#ifndef FILTER_DIMS
- #error "FILTER_DIMS must be defined"
-#endif
-#if FILTER_DIMS > IMAGE_DIMS
- #error "Filter cannot have more dimensions than image"
-#endif
-
-// Boundary handling modes
-#define CONV_MODE_REFLECT 0 // CLK_ADDRESS_MIRRORED_REPEAT : cba|abcd|dcb
-#define CONV_MODE_NEAREST 1 // CLK_ADDRESS_CLAMP_TO_EDGE : aaa|abcd|ddd
-#define CONV_MODE_WRAP 2 // CLK_ADDRESS_REPEAT : bcd|abcd|abc
-#define CONV_MODE_CONSTANT 3 // CLK_ADDRESS_CLAMP : 000|abcd|000
-#ifndef USED_CONV_MODE
- #define USED_CONV_MODE CONV_MODE_NEAREST
-#endif
-#if USED_CONV_MODE == CONV_MODE_REFLECT
- #define CLK_BOUNDARY CLK_ADDRESS_MIRRORED_REPEAT
- #define CLK_COORDS CLK_NORMALIZED_COORDS_TRUE
- #define USE_NORM_COORDS
-#elif USED_CONV_MODE == CONV_MODE_NEAREST
- #define CLK_BOUNDARY CLK_ADDRESS_CLAMP_TO_EDGE
- #define CLK_COORDS CLK_NORMALIZED_COORDS_FALSE
-#elif USED_CONV_MODE == CONV_MODE_WRAP
- #define CLK_BOUNDARY CLK_ADDRESS_REPEAT
- #define CLK_COORDS CLK_NORMALIZED_COORDS_TRUE
- #define USE_NORM_COORDS
-#elif USED_CONV_MODE == CONV_MODE_CONSTANT
- #define CLK_BOUNDARY CLK_ADDRESS_CLAMP
- #define CLK_COORDS CLK_NORMALIZED_COORDS_FALSE
-#else
- #error "Unknown convolution mode"
-#endif
-
-
-
-// Convolution index for filter
-#define FILTER_INDEX(j) (Lx - 1 - j)
-
-// Filter access patterns
-#define READ_FILTER_1D(j) read_imagef(filter, (int2) (FILTER_INDEX(j), 0)).x;
-#define READ_FILTER_2D(jx, jy) read_imagef(filter, (int2) (FILTER_INDEX(jx), FILTER_INDEX(jy))).x;
-#define READ_FILTER_3D(jx, jy, jz) read_imagef(filter, (int4) (FILTER_INDEX(jx), FILTER_INDEX(jy), FILTER_INDEX(jz), 0)).x;
-
-
-// Convolution index for image
-#ifdef USE_NORM_COORDS
- #define IMAGE_INDEX_X (gidx*1.0f +0.5f - c + jx)/Nx
- #define IMAGE_INDEX_Y (gidy*1.0f +0.5f - c + jy)/Ny
- #define IMAGE_INDEX_Z (gidz*1.0f +0.5f - c + jz)/Nz
- #define RET_TYPE_1 float
- #define RET_TYPE_2 float2
- #define RET_TYPE_4 float4
- #define C_ZERO 0.5f
- #define GIDX (gidx*1.0f + 0.5f)/Nx
- #define GIDY (gidy*1.0f + 0.5f)/Ny
- #define GIDZ (gidz*1.0f + 0.5f)/Nz
-#else
- #define IMAGE_INDEX_X (gidx - c + jx)
- #define IMAGE_INDEX_Y (gidy - c + jy)
- #define IMAGE_INDEX_Z (gidz - c + jz)
- #define RET_TYPE_1 int
- #define RET_TYPE_2 int2
- #define RET_TYPE_4 int4
- #define C_ZERO 0
- #define GIDX gidx
- #define GIDY gidy
- #define GIDZ gidz
-#endif
-
-static const sampler_t sampler = CLK_COORDS | CLK_BOUNDARY | CLK_FILTER_NEAREST;
-
-// Image access patterns
-#define READ_IMAGE_1D read_imagef(input, sampler, (RET_TYPE_2) (IMAGE_INDEX_X, C_ZERO)).x
-
-#define READ_IMAGE_2D_X read_imagef(input, sampler, (RET_TYPE_2) (IMAGE_INDEX_X , GIDY)).x
-#define READ_IMAGE_2D_Y read_imagef(input, sampler, (RET_TYPE_2) (GIDX, IMAGE_INDEX_Y)).x
-#define READ_IMAGE_2D_XY read_imagef(input, sampler, (RET_TYPE_2) (IMAGE_INDEX_X, IMAGE_INDEX_Y)).x
-
-#define READ_IMAGE_3D_X read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, GIDY, GIDZ, C_ZERO)).x
-#define READ_IMAGE_3D_Y read_imagef(input, sampler, (RET_TYPE_4) (GIDX, IMAGE_INDEX_Y, GIDZ, C_ZERO)).x
-#define READ_IMAGE_3D_Z read_imagef(input, sampler, (RET_TYPE_4) (GIDX, GIDY, IMAGE_INDEX_Z, C_ZERO)).x
-#define READ_IMAGE_3D_XY read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, IMAGE_INDEX_Y, GIDZ, C_ZERO)).x
-#define READ_IMAGE_3D_XZ read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, GIDY, IMAGE_INDEX_Z, C_ZERO)).x
-#define READ_IMAGE_3D_YZ read_imagef(input, sampler, (RET_TYPE_4) (GIDX, IMAGE_INDEX_Y, IMAGE_INDEX_Z, C_ZERO)).x
-#define READ_IMAGE_3D_XYZ read_imagef(input, sampler, (RET_TYPE_4) (IMAGE_INDEX_X, IMAGE_INDEX_Y, IMAGE_INDEX_Z, C_ZERO)).x
-
-
-// NOTE: pyopencl and OpenCL < 1.2 do not support image1d_t
-#if FILTER_DIMS == 1
- #define FILTER_TYPE image2d_t
- #define READ_FILTER_VAL(j) READ_FILTER_1D(j)
-#elif FILTER_DIMS == 2
- #define FILTER_TYPE image2d_t
- #define READ_FILTER_VAL(jx, jy) READ_FILTER_2D(jx, jy)
-#elif FILTER_DIMS == 3
- #define FILTER_TYPE image3d_t
- #define READ_FILTER_VAL(jx, jy, jz) READ_FILTER_3D(jx, jy, jz)
-#endif
-
-#if IMAGE_DIMS == 1
- #define IMAGE_TYPE image2d_t
- #define READ_IMAGE_X READ_IMAGE_1D
-#elif IMAGE_DIMS == 2
- #define IMAGE_TYPE image2d_t
- #define READ_IMAGE_X READ_IMAGE_2D_X
- #define READ_IMAGE_Y READ_IMAGE_2D_Y
- #define READ_IMAGE_XY READ_IMAGE_2D_XY
-#elif IMAGE_DIMS == 3
- #define IMAGE_TYPE image3d_t
- #define READ_IMAGE_X READ_IMAGE_3D_X
- #define READ_IMAGE_Y READ_IMAGE_3D_Y
- #define READ_IMAGE_Z READ_IMAGE_3D_Z
- #define READ_IMAGE_XY READ_IMAGE_3D_XY
- #define READ_IMAGE_XZ READ_IMAGE_3D_XZ
- #define READ_IMAGE_YZ READ_IMAGE_3D_YZ
- #define READ_IMAGE_XYZ READ_IMAGE_3D_XYZ
-#endif
-
-
-// Get the center index of the filter,
-// and the "half-Left" and "half-Right" lengths.
-// In the case of an even-sized filter, the center is shifted to the left.
-#define GET_CENTER_HL(hlen){\
- if (hlen & 1) {\
- c = hlen/2;\
- hL = c;\
- hR = c;\
- }\
- else {\
- c = hlen/2 - 1;\
- hL = c;\
- hR = c+1;\
- }\
-}\
-
-
-
-/******************************************************************************/
-/**************************** 1D Convolution **********************************/
-/******************************************************************************/
-
-#if FILTER_DIMS == 1
-// Convolution with 1D kernel along axis "X" (fast dimension)
-// Works for batched 1D on 2D and batched 2D on 3D, along axis "X".
-__kernel void convol_1D_X_tex(
- read_only IMAGE_TYPE input,
- __global float * output,
- read_only FILTER_TYPE filter,
- int Lx, // filter size
- int Nx, // input/output number of columns
- int Ny, // input/output number of rows
- int Nz // input/output depth
-)
-{
- uint gidx = get_global_id(0);
- uint gidy = get_global_id(1);
- uint gidz = get_global_id(2);
- if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
- int c, hL, hR;
- GET_CENTER_HL(Lx);
- float sum = 0.0f;
-
- for (int jx = 0; jx <= hR+hL; jx++) {
- sum += READ_IMAGE_X * READ_FILTER_VAL(jx);
- }
- output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-
-
-#if IMAGE_DIMS >= 2
-// Convolution with 1D kernel along axis "Y"
-// Works for batched 1D on 2D and batched 2D on 3D, along axis "Y".
-__kernel void convol_1D_Y_tex(
- read_only IMAGE_TYPE input,
- __global float * output,
- read_only FILTER_TYPE filter,
- int Lx, // filter size
- int Nx, // input/output number of columns
- int Ny, // input/output number of rows
- int Nz // input/output depth
-)
-{
- uint gidx = get_global_id(0);
- uint gidy = get_global_id(1);
- uint gidz = get_global_id(2);
- if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
- int c, hL, hR;
- GET_CENTER_HL(Lx);
- float sum = 0.0f;
-
- for (int jy = 0; jy <= hR+hL; jy++) {
- sum += READ_IMAGE_Y * READ_FILTER_VAL(jy);
- }
- output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-#endif
-
-#if IMAGE_DIMS == 3
-// Convolution with 1D kernel along axis "Z"
-// Works for batched 1D on 2D and batched 2D on 3D, along axis "Z".
-__kernel void convol_1D_Z_tex(
- read_only IMAGE_TYPE input,
- __global float * output,
- read_only FILTER_TYPE filter,
- int Lx, // filter size
- int Nx, // input/output number of columns
- int Ny, // input/output number of rows
- int Nz // input/output depth
-)
-{
- uint gidx = get_global_id(0);
- uint gidy = get_global_id(1);
- uint gidz = get_global_id(2);
- if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
- int c, hL, hR;
- GET_CENTER_HL(Lx);
- float sum = 0.0f;
-
- for (int jz = 0; jz <= hR+hL; jz++) {
- sum += READ_IMAGE_Z * READ_FILTER_VAL(jz);
- }
- output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-#endif
-#endif
-
-/******************************************************************************/
-/**************************** 2D Convolution **********************************/
-/******************************************************************************/
-
-#if IMAGE_DIMS >= 2 && FILTER_DIMS == 2
-// Convolution with 2D kernel
-// Works for batched 2D on 3D.
-__kernel void convol_2D_XY_tex(
- read_only IMAGE_TYPE input,
- __global float * output,
- read_only FILTER_TYPE filter,
- int Lx, // filter number of columns,
- int Ly, // filter number of rows,
- int Nx, // input/output number of columns
- int Ny, // input/output number of rows
- int Nz // input/output depth
-)
-{
- uint gidx = get_global_id(0);
- uint gidy = get_global_id(1);
- uint gidz = get_global_id(2);
- if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
- int c, hL, hR;
- GET_CENTER_HL(Lx);
- float sum = 0.0f;
-
- for (int jy = 0; jy <= hR+hL; jy++) {
- for (int jx = 0; jx <= hR+hL; jx++) {
- sum += READ_IMAGE_XY * READ_FILTER_VAL(jx, jy);
- }
- }
- output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-#endif
-
-#if IMAGE_DIMS == 3 && FILTER_DIMS == 2
-// Convolution with 2D kernel
-// Works for batched 2D on 3D.
-__kernel void convol_2D_XZ_tex(
- read_only IMAGE_TYPE input,
- __global float * output,
- read_only FILTER_TYPE filter,
- int Lx, // filter number of columns,
- int Lz, // filter number of rows,
- int Nx, // input/output number of columns
- int Ny, // input/output number of rows
- int Nz // input/output depth
-)
-{
- uint gidx = get_global_id(0);
- uint gidy = get_global_id(1);
- uint gidz = get_global_id(2);
- if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
- int c, hL, hR;
- GET_CENTER_HL(Lx);
- float sum = 0.0f;
-
- for (int jz = 0; jz <= hR+hL; jz++) {
- for (int jx = 0; jx <= hR+hL; jx++) {
- sum += READ_IMAGE_XZ * READ_FILTER_VAL(jx, jz);
- }
- }
- output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-
-
-// Convolution with 2D kernel
-// Works for batched 2D on 3D.
-__kernel void convol_2D_YZ_tex(
- read_only IMAGE_TYPE input,
- __global float * output,
- read_only FILTER_TYPE filter,
- int Lx, // filter number of columns,
- int Lz, // filter number of rows,
- int Nx, // input/output number of columns
- int Ny, // input/output number of rows
- int Nz // input/output depth
-)
-{
- uint gidx = get_global_id(0);
- uint gidy = get_global_id(1);
- uint gidz = get_global_id(2);
- if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
- int c, hL, hR;
- GET_CENTER_HL(Lx);
- float sum = 0.0f;
-
- for (int jz = 0; jz <= hR+hL; jz++) {
- for (int jy = 0; jy <= hR+hL; jy++) {
- sum += READ_IMAGE_YZ * READ_FILTER_VAL(jy, jz);
- }
- }
- output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-#endif
-
-
-/******************************************************************************/
-/**************************** 3D Convolution **********************************/
-/******************************************************************************/
-
-#if IMAGE_DIMS == 3 && FILTER_DIMS == 3
-// Convolution with 3D kernel
-__kernel void convol_3D_XYZ_tex(
- read_only IMAGE_TYPE input,
- __global float * output,
- read_only FILTER_TYPE filter,
- int Lx, // filter number of columns,
- int Ly, // filter number of rows,
- int Lz, // filter number of rows,
- int Nx, // input/output number of columns
- int Ny, // input/output number of rows
- int Nz // input/output depth
-)
-{
- uint gidx = get_global_id(0);
- uint gidy = get_global_id(1);
- uint gidz = get_global_id(2);
- if ((gidx >= Nx) || (gidy >= Ny) || (gidz >= Nz)) return;
-
- int c, hL, hR;
- GET_CENTER_HL(Lx);
- float sum = 0.0f;
-
- for (int jz = 0; jz <= hR+hL; jz++) {
- for (int jy = 0; jy <= hR+hL; jy++) {
- for (int jx = 0; jx <= hR+hL; jx++) {
- sum += READ_IMAGE_XYZ * READ_FILTER_VAL(jx, jy, jz);
- }
- }
- }
- output[(gidz*Ny + gidy)*Nx + gidx] = sum;
-}
-#endif
diff --git a/silx/resources/opencl/doubleword.cl b/silx/resources/opencl/doubleword.cl
deleted file mode 100644
index a0ebfda..0000000
--- a/silx/resources/opencl/doubleword.cl
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * OpenCL library for double word floating point calculation using compensated arithmetics
- *
- * The theoritical basis can be found in Valentina Popescu's PhD thesis:
- * Towards fast and certified multi-precision libraries
- * Reference LYSEN036
- * http://www.theses.fr/2017LYSEN036
- * All page number and equation number are refering to this document.
- *
- * The precision of the calculation (bounds) is provided in ULP (smallest possible mantissa)
- * and come from the table 2.2 (page 68 of the thesis).
- * The number of equivalent FLOP is taken from the table 2.3 (page 69 the thesis).
- * Note that FLOP are not all equal: a division is much more expensive than an addition.
- */
-
-//This library can be expanded to double-double by redefining fp, fp2 and one to double, double2 and 1.0.
-#ifdef DOUBLEDOUBLE
-#define fp double
-#define fp2 double2
-#define one 1.0
-#else
-#define fp float
-#define fp2 float2
-#define one 1.0f
-#endif
-
-/* Nota: i386 computer use x87 registers which are larger than the 32bits precision
- * which can invalidate the error compensation mechanism.
- *
- * We use the trick to declare some variable "volatile" to enforce the actual
- * precision reduction of those variables.
-*/
-
-#ifndef X87_VOLATILE
-# define X87_VOLATILE
-#endif
-
-//Algorithm 1, p23, theorem 1.1.12. Requires e_x > e_y, valid if |x| > |y|
-inline fp2 fast_fp_plus_fp(fp x, fp y){
- X87_VOLATILE fp s = x + y;
- X87_VOLATILE fp z = s - x;
- fp e = y - z;
- return (fp2)(s, e);
-}
-
-//Algorithm 2, p24, same as fast_fp_plus_fp without the condition on e_x and e_y
-inline fp2 fp_plus_fp(fp x, fp y){
- X87_VOLATILE fp s = x + y;
- X87_VOLATILE fp xp = s - y;
- X87_VOLATILE fp yp = s - xp;
- X87_VOLATILE fp dx = x - xp;
- X87_VOLATILE fp dy = y - yp;
- return (fp2)(s, dx+dy);
-}
-
-//Algorithm 3, p24: multiplication with a FMA
-inline fp2 fp_times_fp(fp x, fp y){
- fp p = x * y;
- fp e = fma(x, y, -p);
- return (fp2)(p, e);
-}
-
-//Algorithm 7, p38: Addition of a FP to a DW. 10flop bounds:2u²+5u³
-inline fp2 dw_plus_fp(fp2 x, fp y){
- fp2 s = fp_plus_fp(x.s0, y);
- X87_VOLATILE fp v = x.s1 + s.s1;
- return fast_fp_plus_fp(s.s0, v);
-}
-
-//Algorithm 9, p40: addition of two DW: 20flop bounds:3u²+13u³
-inline fp2 dw_plus_dw(fp2 x, fp2 y){
- fp2 s = fp_plus_fp(x.s0, y.s0);
- fp2 t = fp_plus_fp(x.s1, y.s1);
- fp2 v = fast_fp_plus_fp(s.s0, s.s1 + t.s0);
- return fast_fp_plus_fp(v.s0, t.s1 + v.s1);
-}
-
-//Algorithm 12, p49: Multiplication FP*DW: 6flops bounds:2u²
-inline fp2 dw_times_fp(fp2 x, fp y){
- fp2 c = fp_times_fp(x.s0, y);
- return fast_fp_plus_fp(c.s0, fma(x.s1, y, c.s1));
-}
-
-//Algorithm 14, p52: Multiplication DW*DW, 8 flops bounds:6u²
-inline fp2 dw_times_dw(fp2 x, fp2 y){
- fp2 c = fp_times_fp(x.s0, y.s0);
- X87_VOLATILE fp l = fma(x.s1, y.s0, x.s0 * y.s1);
- return fast_fp_plus_fp(c.s0, c.s1 + l);
-}
-
-//Algorithm 17, p55: Division DW / FP, 10flops bounds: 3.5u²
-inline fp2 dw_div_fp(fp2 x, fp y){
- X87_VOLATILE fp th = x.s0 / y;
- fp2 pi = fp_times_fp(th, y);
- fp2 d = x - pi;
- X87_VOLATILE fp delta = d.s0 + d.s1;
- X87_VOLATILE fp tl = delta/y;
- return fast_fp_plus_fp(th, tl);
-}
-
-//Derived from algorithm 20, p64: Inversion 1/ DW, 22 flops
-inline fp2 inv_dw(fp2 y){
- X87_VOLATILE fp th = one/y.s0;
- X87_VOLATILE fp rh = fma(-y.s0, th, one);
- X87_VOLATILE fp rl = -y.s1 * th;
- fp2 e = fast_fp_plus_fp(rh, rl);
- fp2 delta = dw_times_fp(e, th);
- return dw_plus_fp(delta, th);
-}
-
-//Algorithm 20, p64: Division DW / DW, 30 flops: bounds:9.8u²
-inline fp2 dw_div_dw(fp2 x, fp2 y){
- return dw_times_dw(x, inv_dw(y));
-}
-
diff --git a/silx/resources/opencl/image/cast.cl b/silx/resources/opencl/image/cast.cl
deleted file mode 100644
index 9e23a82..0000000
--- a/silx/resources/opencl/image/cast.cl
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Project: SILX: Alogorithms for image processing
- *
- * Copyright (C) 2013-2017 European Synchrotron Radiation Facility
- * Grenoble, France
- *
- * Principal authors: J. Kieffer (kieffer@esrf.fr)
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#ifndef NB_COLOR
- #define NB_COLOR 1
-#endif
-
-
-/**
- * \brief Cast values of an array of uint8 into a float output array.
- *
- * :param array_input: Pointer to global memory with the input data as unsigned8 array
- * :param array_float: Pointer to global memory with the output data as float array
- * :param width: Width of the image
- * :param height: Height of the image
- */
-kernel void u8_to_float( global unsigned char *array_input,
- global float *array_float,
- const int width,
- const int height)
-{
- //Global memory guard for padding
- if ((get_global_id(0) < width) && (get_global_id(1)<height))
- {
- int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1));
- for (int c=0; c<NB_COLOR; c++)
- {
- array_float[i + c] = (float) array_input[i + c];
- } //end loop over colors
- } //end test in image
-} //end kernel
-
-/**
- * \brief Cast values of an array of uint8 into a float output array.
- *
- * :param array_input: Pointer to global memory with the input data as signed8 array
- * :param array_float: Pointer to global memory with the output data as float array
- * :param width: Width of the image
- * :param height: Height of the image
- */
-kernel void s8_to_float( global char *array_input,
- global float *array_float,
- const int width,
- const int height)
-{
- //Global memory guard for padding
- if ((get_global_id(0) < width) && (get_global_id(1)<height))
- {
- int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1));
- for (int c=0; c<NB_COLOR; c++)
- {
- array_float[i + c] = (float) array_input[i + c];
- } //end loop over colors
- } //end test in image
-} //end kernel
-
-
-/**
- * \brief cast values of an array of uint16 into a float output array.
- *
- * :param array_input: Pointer to global memory with the input data as unsigned16 array
- * :param array_float: Pointer to global memory with the output data as float array
- * :param width: Width of the image
- * :param height: Height of the image
- */
-kernel void u16_to_float(global unsigned short *array_input,
- global float *array_float,
- const int width,
- const int height)
-{
- //Global memory guard for padding
- if ((get_global_id(0) < width) && (get_global_id(1)<height))
- {
- int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1));
- for (int c=0; c<NB_COLOR; c++)
- {
- array_float[i + c] = (float) array_input[i + c];
- } //end loop over colors
- } //end test in image
-} //end kernel
-
-/**
- * \brief cast values of an array of int16 into a float output array.
- *
- * :param array_input: Pointer to global memory with the input data as signed16 array
- * :param array_float: Pointer to global memory with the output data as float array
- * :param width: Width of the image
- * :param height: Height of the image
- */
-kernel void s16_to_float(global short *array_input,
- global float *array_float,
- const int width,
- const int height)
-{
- //Global memory guard for padding
- if ((get_global_id(0) < width) && (get_global_id(1)<height))
- {
- int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1));
- for (int c=0; c<NB_COLOR; c++)
- {
- array_float[i + c] = (float) array_input[i + c];
- } //end loop over colors
- } //end test in image
-}//end kernel
-
-/**
- * \brief cast values of an array of uint32 into a float output array.
- *
- * :param array_input: Pointer to global memory with the input data as unsigned32 array
- * :param array_float: Pointer to global memory with the output data as float array
- * :param width: Width of the image
- * :param height: Height of the image
- */
-kernel void u32_to_float(global unsigned int *array_input,
- global float *array_float,
- const int width,
- const int height)
-{
- //Global memory guard for padding
- if ((get_global_id(0) < width) && (get_global_id(1)<height))
- {
- int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1));
- for (int c=0; c<NB_COLOR; c++)
- {
- array_float[i + c] = (float) array_input[i + c];
- } //end loop over colors
- } //end test in image
-}//end kernel
-
-
-/**
- * \brief convert values of an array of int32 into a float output array.
- *
- * :param array_input: Pointer to global memory with the data in int
- * :param array_float: Pointer to global memory with the data in float
- * :param width: Width of the image
- * :param height: Height of the image
- */
-kernel void s32_to_float(global int *array_input,
- global float *array_float,
- const int width,
- const int height)
-{
- //Global memory guard for padding
- if ((get_global_id(0) < width) && (get_global_id(1)<height))
- {
- int i = NB_COLOR * (get_global_id(0) + width * get_global_id(1));
- for (int c=0; c<NB_COLOR; c++)
- {
- array_float[i + c] = (float) array_input[i + c];
- } //end loop over colors
- } //end test in image
-}//end kernel
-
diff --git a/silx/resources/opencl/image/histogram.cl b/silx/resources/opencl/image/histogram.cl
deleted file mode 100644
index 7fb1485..0000000
--- a/silx/resources/opencl/image/histogram.cl
+++ /dev/null
@@ -1,178 +0,0 @@
-//CL//
-
-/*
- * Project: SILX: Alogorithms for image processing
- *
- * Copyright (C) 2013-2018 European Synchrotron Radiation Facility
- * Grenoble, France
- *
- * Principal authors: J. Kieffer (kieffer@esrf.fr)
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/* Single kernel histogram for float array.
- *
- * Can perform histograming in log-scale (using the arcsinh)
-
-Parameters:
- - data: buffer with the image content in float (input)
- - data_size: input
- - mini: Lower bound of the first bin
- - maxi: upper bouns of the last bin
- - map_operation: Type of pre-processor to use. if if set to !=0, use log scale
- - hist: resulting histogram (ouptut)
- - hist_size: number of bins
- - tmp_hist: temporary storage of size hist_size*num_groups
- - processed: temporary storage of size 1 initialially set to 0
- - local_hist: local storage of size hist_size
-
-
-Grid information:
- * use the largest WG size. If it is larger than the number of bins (hist_size),
- take the power of 2 immediately above
- *Schedule as many WG as the device has compute engines. No need to schuedule more,
- it is just a waist of memory
- * The histogram should fit in shared (local) memory and tmp_hist can be large!
-
-Assumes:
- hist and local_hist have size hist_size
- edges has size hist_size+2
- tmp_hist has size hist_size*num_groups
- processed is of size one and initially set to 0
-
-*/
-
-
-static float preprocess(float value, int code)
-{
- //This function can be modified to have more scales
- if (code!=0)
- {
- return asinh(value);
- }
- else
- {
- return value;
- }
-}
-
-kernel void histogram(global float *data,
- int data_size,
- float mini,
- float maxi,
- int map_operation,
- global int *hist,
- global float *edges,
- int hist_size,
- global int *tmp_hist,
- global int *processed,
- local int *local_hist)
-{
- // each thread
- int lid = get_local_id(0);
- int ws = get_local_size(0);
- int nb_engine = get_num_groups(0);
- int engine_id = get_group_id(0);
-
- // memset the local_hist array
- for (int i=0; i<hist_size; i+=ws)
- {
- int j = i + lid;
- if (j<hist_size)
- {
- local_hist[j] = 0;
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // Process the local data and accumulate in shared memory
- int bloc_size = (int) ceil((float)data_size/(float)nb_engine);
- int start = bloc_size * engine_id;
- int stop = min(start + bloc_size, data_size);
- float vmini = preprocess(mini, map_operation);
- float vscale = (float)hist_size/(preprocess(maxi, map_operation) -vmini);
- for (int i = start; i<stop; i+=ws)
- {
- int address = i + lid;
- if (address < stop)
- {
- float value = data[address];
- if ((!isnan(value)) && (value>=mini) && (value<=maxi))
- {
- float vvalue = (preprocess(value, map_operation)-vmini)*vscale;
- int idx = clamp((int) vvalue, 0, hist_size - 1);
- atomic_inc(&local_hist[idx]);
- }
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- //Now copy result into the right place and reset the first value of the shared array
- for (int i=0; i<hist_size; i+=ws)
- {
- int j = i + lid;
- if (j<hist_size)
- {
- tmp_hist[hist_size * engine_id + j] = local_hist[j];
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- local_hist[0] = 0;
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- //Increment the system wide shared variable processed and share the result with the workgroup
- if (lid == 0)
- {
- local_hist[0] = atomic_inc(processed);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // If we are the engine last to work, perform the concatenation of all results
-
- if ((local_hist[0] + 1) == nb_engine)
- {
- for (int i=0; i<hist_size; i+=ws)
- {
- int j = i + lid;
- int lsum = 0;
- if (j<hist_size)
- {
- for (int k=0; k<nb_engine; k++)
- {
- lsum += tmp_hist[hist_size * k + j];
- }
- hist[j] = lsum;
- edges[j] = vmini + j/vscale;
- }
- }
- // Finally reset the counter
- if (lid == 0)
- {
- processed[0] = 0;
- edges[hist_size] = vmini + hist_size/vscale;;
- }
-
- }
-}
diff --git a/silx/resources/opencl/image/map.cl b/silx/resources/opencl/image/map.cl
deleted file mode 100644
index 804a5a1..0000000
--- a/silx/resources/opencl/image/map.cl
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Project: SILX: Data analysis library
- * kernel for maximum and minimum calculation
- *
- *
- * Copyright (C) 2013-2017 European Synchrotron Radiation Facility
- * Grenoble, France
- *
- * Principal authors: J. Kieffer (kieffer@esrf.fr)
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- *
- */
-
-#ifndef NB_COLOR
- #define NB_COLOR 1
-#endif
-
-
-/**
- * \brief Linear scale of signal in an image (maybe multi-color)
- *
- * :param image: contains the input image
- * :param output: contains the output image after scaling
- * :param max_min_input: 2-floats containing the maximum and the minimum value of image
- * :param minimum: float scalar with the minimum of the output
- * :param maximum: float scalar with the maximum of the output
- * :param width: integer, number of columns the matrices
- * :param height: integer, number of lines of the matrices
- *
- *
- */
-
-
-kernel void normalize_image(global float* image,
- global float* output,
- int width,
- int height,
- global float* max_min_input,
- float minimum,
- float maximum
- )
-{
- //Global memory guard for padding
- if((get_global_id(0) < width) && (get_global_id(1)<height))
- {
- int idx = NB_COLOR* (get_global_id(0) + width * get_global_id(1));
- float mini_in, maxi_in, scale;
- maxi_in = max_min_input[0];
- mini_in = max_min_input[1];
- if (maxi_in == mini_in)
- {
- scale = NAN;
- }
- else
- {
- scale = (maximum - minimum) / (maxi_in - mini_in);
- }
-
- for (int c=0; c<NB_COLOR; c++)
- {
- output[idx+c] = fma(scale, image[idx+c]-mini_in, minimum);
- } // end color loop
- }//end if in IMAGE
-}//end kernel
diff --git a/silx/resources/opencl/image/max_min.cl b/silx/resources/opencl/image/max_min.cl
deleted file mode 100644
index 246cd48..0000000
--- a/silx/resources/opencl/image/max_min.cl
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- * Project: SILX: Data analysis library
- * kernel for maximum and minimum calculation
- *
- *
- * Copyright (C) 2013-2017 European Synchrotron Radiation Facility
- * Grenoble, France
- *
- * Principal authors: J. Kieffer (kieffer@esrf.fr)
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- *
- */
-
-#ifndef NB_COLOR
- #define NB_COLOR 1
-#endif
-
-
-
-#define REDUCE(a, b) ((float2) (fmax(a.x, b.x), fmin(a.y, b.y)))
-
-static float2 read_and_map(int idx,
- global float* data)
-{
- idx *= NB_COLOR;
- float tmp = data[idx];
- float2 res = (float2) (tmp, tmp);
- if (NB_COLOR > 1)
- {
- for (int c=1; c<NB_COLOR; c++)
- {
- tmp = data[idx + c];
- res = (float2) (fmax(res.x, tmp), fmin(res.y, tmp));
- }
- }
- return res;
-}
-
-
-/**
- * \brief max_min_global_stage1: Look for the maximum an the minimum of an array. stage1
- *
- * optimal workgroup size: 2^n greater than sqrt(size), limited to 512
- * optimal total item size: (workgroup size)^2
- * if size >total item size: adjust seq_count.
- *
- * :param data: Float pointer to global memory storing the vector of data.
- * :param out: Float2 pointer to global memory storing the temporary results (workgroup size)
- * :param seq_count: how many blocksize each thread should read
- * :param size: size of the problem (number of element in the image
- * :param l_data Shared memory: 2 float per thread in workgroup
- *
-**/
-
-
-kernel void max_min_reduction_stage1( global const float *data,
- global float2 *out,
- int size,
- local float2 *l_data)// local storage 2 float per thread
-{
- int group_size = get_local_size(0);
- int lid = get_local_id(0);
- float2 acc;
- int big_block = group_size * get_num_groups(0);
- int i = lid + group_size * get_group_id(0);
-
- if (lid<size)
- acc = read_and_map(lid, data);
- else
- acc = read_and_map(0, data);
-
- // Linear pre-reduction stage 0
-
- while (i<size){
- acc = REDUCE(acc, read_and_map(i, data));
- i += big_block;
- }
-
- // parallel reduction stage 1
-
- l_data[lid] = acc;
- barrier(CLK_LOCAL_MEM_FENCE);
- for (int block=group_size/2; block>1; block/=2)
- {
- if ((lid < block) && ((lid + block)<group_size)){
- l_data[lid] = REDUCE(l_data[lid], l_data[lid + block]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if (lid == 0)
- {
- if (group_size > 1)
- {
- acc = REDUCE(l_data[0], l_data[1]);
- }
- else
- {
- acc = l_data[0];
- }
- out[get_group_id(0)] = acc;
- }
-}
-
-
-/**
- * \brief global_max_min: Look for the maximum an the minimum of an array.
- *
- *
- *
- * :param data2: Float2 pointer to global memory storing the vector of pre-reduced data (workgroup size).
- * :param maximum: Float pointer to global memory storing the maximum value
- * :param minumum: Float pointer to global memory storing the minimum value
- * :param l_data Shared memory: 2 float per thread in workgroup
- *
-**/
-
-kernel void max_min_reduction_stage2(
- global const float2 *data2,
- global float2 *maxmin,
- local float2 *l_data)// local storage 2 float per thread
-{
- int lid = get_local_id(0);
- int group_size = get_local_size(0);
- float2 acc = (float2)(-1.0f, -1.0f);
- if (lid<=group_size)
- {
- l_data[lid] = data2[lid];
- }
- else
- {
- l_data[lid] = acc;
- }
-
- // parallel reduction stage 2
-
-
- barrier(CLK_LOCAL_MEM_FENCE);
- for (int block=group_size/2; block>1; block/=2)
- {
- if ((lid < block) && ((lid + block)<group_size))
- {
- l_data[lid] = REDUCE(l_data[lid], l_data[lid + block]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- }
-
- if (lid == 0 )
- {
- if ( group_size > 1)
- {
- acc = REDUCE(l_data[0], l_data[1]);
- }
- else
- {
- acc = l_data[0];
- }
- maxmin[0] = acc;
- }
-}
-
-/*This is the serial version of the min_max kernel.
- *
- * It has to be launched with WG=1 and only 1 WG has to be launched !
- *
- * :param data: Float pointer to global memory storing the vector of data.
- * :param size: size of the
- * :param maximum: Float pointer to global memory storing the maximum value
- * :param minumum: Float pointer to global memory storing the minimum value
- *
- *
- */
-kernel void max_min_serial(
- global const float *data,
- unsigned int size,
- global float2 *maxmin
- )
-{
- float2 acc = read_and_map(0, data);
- for (int i=1; i<size; i++)
- {
- acc = REDUCE(acc, read_and_map(i, data));
- }
-
- maxmin[0] = acc;
-}
diff --git a/silx/resources/opencl/kahan.cl b/silx/resources/opencl/kahan.cl
deleted file mode 100644
index c23d84d..0000000
--- a/silx/resources/opencl/kahan.cl
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * OpenCL library for 32-bits floating point calculation using compensated arithmetics
- */
-
-/* Nota: i386 computer use x87 registers which are larger than the 32bits precision
- * which can invalidate the error compensation mechanism.
- *
- * We use the trick to declare some variable "volatile" to enforce the actual
- * precision reduction of those variables.
-*/
-
-#ifndef X87_VOLATILE
-# define X87_VOLATILE
-#endif
-
-// calculate acc.s0+value with error compensation
-// see https://en.wikipedia.org/wiki/Kahan_summation_algorithm
-// unlike wikipedia, here the exact value = acc.s0 + acc.s1 (performed in higher precision)
-
-static inline float2 kahan_sum(float2 acc, float value)
-{
- if (value)
- {
- float sum = acc.s0;
- float err = acc.s1;
- if (fabs(value) > fabs(sum))
- {
- float tmp = sum;
- sum = value;
- value = tmp;
- }
-
- float cor = value + err;
- X87_VOLATILE float target = sum + cor;
- err = cor - (target - sum);
- return (float2)(target, err);
- }
- else
- {
- return acc;
- }
-}
-
-// calculate a*b with error compensation
-// see https://hal.archives-ouvertes.fr/hal-01367769/document
-static inline float2 comp_prod(float a, float b)
-{
- X87_VOLATILE float x = a * b;
- float y = fma(a, b, -x);
- return (float2)(x, y);
-}
-
-// calculate a + b with error compensation
-static inline float2 compensated_sum(float2 a, float2 b)
-{
- float err = a.s1 + b.s1;
- float first = a.s0;
- float second = b.s0;
- if (fabs(second) > fabs(first))
- {//swap first and second
- float tmp = first;
- first = second;
- second = tmp;
- }
- float cor = second + err;
- X87_VOLATILE float target = first + cor;
- err = cor - (target - first);
- return (float2)(target, err);
-}
-
-// calculate a * b with error compensation
-static inline float2 compensated_mul(float2 a, float2 b)
-{
- float2 tmp;
- tmp = kahan_sum((float2)(a.s0*b.s0, a.s0*b.s1), a.s1*b.s0);
- tmp = kahan_sum(tmp, a.s1*b.s1);
- return tmp;
-}
-
-// calculate 1/a with error compensation (Needs validation)
-static inline float2 compensated_inv(float2 a)
-{
- float main = a.s0;
- float err = a.s1;
- return (float2)(1.0f/main, -err/(main*main));
-}
-
-// calculate a/b with error compensation (Needs validation)
-static inline float2 compensated_div(float2 a, float2 b)
-{
- float ah = a.s0;
- float al = a.s1;
- float bh = b.s0;
- float bl = b.s1;
- float bl_over_bh = bl/bh;
- return kahan_sum(kahan_sum(a, -a.s1 * bl_over_bh), -a.s0 * bl_over_bh) / bh;
-}
-
-
-// calculate a.b where a and b are float2
-static inline float2 comp_dot2(float2 a, float2 b)
-{
- return compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1));
-}
-
-// calculate a.b where a and b are float3
-static inline float2 comp_dot3(float3 a, float3 b)
-{
- return compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)), comp_prod(a.s2, b.s2));
-}
-
-// calculate a.b where a and b are float4
-static inline float2 comp_dot4(float4 a, float4 b)
-{
- return compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)),
- compensated_sum(comp_prod(a.s2, b.s2), comp_prod(a.s3, b.s3)));
-}
-
-// calculate a.b where a and b are float8
-static inline float2 comp_dot8(float8 a, float8 b)
-{
- return compensated_sum(
- compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)),
- compensated_sum(comp_prod(a.s2, b.s2), comp_prod(a.s3, b.s3))),
- compensated_sum(compensated_sum(comp_prod(a.s4, b.s4), comp_prod(a.s5, b.s5)),
- compensated_sum(comp_prod(a.s6, b.s6), comp_prod(a.s7, b.s7))));
-}
-
-// calculate a.b where a and b are float8
-static inline float2 comp_dot16(float16 a, float16 b)
-{
- return compensated_sum(
- compensated_sum(
- compensated_sum(compensated_sum(comp_prod(a.s0, b.s0), comp_prod(a.s1, b.s1)),
- compensated_sum(comp_prod(a.s2, b.s2), comp_prod(a.s3, b.s3))),
- compensated_sum(compensated_sum(comp_prod(a.s4, b.s4), comp_prod(a.s5, b.s5)),
- compensated_sum(comp_prod(a.s6, b.s6), comp_prod(a.s7, b.s7)))),
- compensated_sum(
- compensated_sum(compensated_sum(comp_prod(a.s8, b.s8), comp_prod(a.s9, b.s9)),
- compensated_sum(comp_prod(a.sa, b.sa), comp_prod(a.sb, b.sb))),
- compensated_sum(compensated_sum(comp_prod(a.sc, b.sc), comp_prod(a.sd, b.sd)),
- compensated_sum(comp_prod(a.se, b.se), comp_prod(a.sf, b.sf)))));
-}
diff --git a/silx/resources/opencl/linalg.cl b/silx/resources/opencl/linalg.cl
deleted file mode 100644
index 8710528..0000000
--- a/silx/resources/opencl/linalg.cl
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (C) 2017 European Synchrotron Radiation Facility
- * Grenoble, France
- *
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- */
-
-/**
- *
- * Compute the spatial gradient of an image.
- *
- * slice: input image
- * slice_grad: output gradient
- * sizeX: number of columns of the image
- * sizeY: number of rows of the image
- *
- **/
-kernel void kern_gradient2D(
- global float* slice,
- global float2* slice_grad,
- int sizeX,
- int sizeY)
-{
-
- int gidx = (int) get_global_id(0);
- int gidy = (int) get_global_id(1);
-
- if ((gidx < sizeX) && (gidy < sizeY))
- {
- // Note the direction inconstancy ! (JK 07/2018)
-
- float val_y = (gidx == (sizeX-1))? 0: slice[gidy*sizeX+gidx+1] - slice[gidy*sizeX+gidx];
- float val_x = (gidy == (sizeY-1))? 0: slice[(gidy+1)*sizeX+gidx] - slice[(gidy)*sizeX+gidx];
-
- slice_grad[gidy*sizeX+gidx].x = val_x;
- slice_grad[gidy*sizeX+gidx].y = val_y;
- }
-}
-
-/**
- *
- * Compute the spatial divergence of an image gradient.
- *
- * slice_grad: input gradient-like image
- * slice: output image
- * sizeX: number of columns of the input
- * sizeY: number of rows of the input
- *
- **/
-kernel void kern_divergence2D(
- global float2* slice_grad,
- global float* slice,
- int sizeX,
- int sizeY)
-{
- int gidx = (int) get_global_id(0);
- int gidy = (int) get_global_id(1);
-
- if (gidx < sizeX && gidy < sizeY)
- {
- float val_x, val_y;
- val_y = (gidx == 0)?
- slice_grad[(gidy)*sizeX+gidx].y :
- slice_grad[(gidy)*sizeX+gidx].y - slice_grad[(gidy)*sizeX+gidx-1].y;
- val_x = (gidy == 0)?
- slice_grad[(gidy)*sizeX+gidx].x:
- slice_grad[(gidy)*sizeX+gidx].x - slice_grad[(gidy-1)*sizeX+gidx].x;
- slice[gidy*sizeX+gidx] = val_x + val_y;
- }
-}
diff --git a/silx/resources/opencl/medfilt.cl b/silx/resources/opencl/medfilt.cl
deleted file mode 100644
index 0680230..0000000
--- a/silx/resources/opencl/medfilt.cl
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Project: Azimuthal regroupping OpenCL kernel for PyFAI.
- * Median filter for 1D, 2D and 3D datasets, only 2D for now
- *
- *
- * Copyright (C) 2017-2017 European Synchrotron Radiation Facility
- * Grenoble, France
- *
- * Principal authors: J. Kieffer (kieffer@esrf.fr)
- * Last revision: 07/02/2017
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- */
-
-/*
- * Needs to be concatenated with bitonic.cl prior to compilation
-*/
-
-/*
- * Perform the 2D median filtering of an image 2D image
- *
- * dim0 => wg=number_of_element in the tile /8
- * dim1 = x: wg=1
-
- *
- * Actually the workgoup size is a bit more complicated:
- * if window size = 1,3,5,7: WG1=8
- * if window size = 9,11,13,15: WG1=32
- * if window size = 17, ...,21: WG1=64
- *
- * More Generally the workgroup size must be: at least: kfs2*(kfs1+7)/8
- * Each thread treats 8 values aligned vertically, this allows (almost)
- * coalesced reading between threads in one line of the tile.
- *
- * Later on it will be more efficient to re-use the same tile
- * and slide it vertically by one line.
- * The additionnal need for shared memory will be kfs2 floats and a float8 as register.
- *
- * Theoritically, it should be possible to handle up to windows-size 83x83
- */
-__kernel void medfilt2d(__global float *image, // input image
- __global float *result, // output array
- __local float4 *l_data,// local storage 4x the number of threads
- int khs1, // Kernel half-size along dim1 (nb lines)
- int khs2, // Kernel half-size along dim2 (nb columns)
- int height, // Image size along dim1 (nb lines)
- int width) // Image size along dim2 (nb columns)
-{
- int threadid = get_local_id(0);
- //int wg = get_local_size(0);
- int x = get_global_id(1);
-
- if (x < width)
- {
- union
- {
- float ary[8];
- float8 vec;
- } output, input;
- input.vec = (float8)(MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT);
- int kfs1 = 2 * khs1 + 1; //definition of kernel full size
- int kfs2 = 2 * khs2 + 1;
- int nbands = (kfs1 + 7) / 8; // 8 elements per thread, aligned vertically in 1 column
- for (int y=0; y<height; y++)
- {
- //Select only the active threads, some may remain inactive
- int nb_threads = (nbands * kfs2);
- int band_nr = threadid / kfs2;
- int band_id = threadid % kfs2;
- int pos_x = clamp((int)(x + band_id - khs2), (int) 0, (int) width-1);
- int max_vec = clamp(kfs1 - 8 * band_nr, 0, 8);
- if (y == 0)
- {
- for (int i=0; i<max_vec; i++)
- {
- if (threadid<nb_threads)
- {
- int pos_y = clamp((int)(y + 8 * band_nr + i - khs1), (int) 0, (int) height-1);
- input.ary[i] = image[pos_x + width * pos_y];
- }
- }
- }
- else
- {
- //store storage.s0 to some shared memory to retrieve it from another thread.
- l_data[threadid].s0 = input.vec.s0;
-
- //Offset to the bottom
- input.vec = (float8)(input.vec.s1,
- input.vec.s2,
- input.vec.s3,
- input.vec.s4,
- input.vec.s5,
- input.vec.s6,
- input.vec.s7,
- MAXFLOAT);
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int read_from = threadid + kfs2;
- if (read_from < nb_threads)
- input.vec.s7 = l_data[read_from].s0;
- else if (threadid < nb_threads) //we are on the last band
- {
- int pos_y = clamp((int)(y + 8 * band_nr + max_vec - 1 - khs1), (int) 0, (int) height-1);
- input.ary[max_vec - 1] = image[pos_x + width * pos_y];
- }
-
- }
-
- //This function is defined in bitonic.cl
- output.vec = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0),
- input.vec, l_data);
-
- size_t target = (kfs1 * kfs2) / 2;
- if (threadid == (target / 8))
- {
- result[y * width + x] = output.ary[target % 8];
- }
-
- }
- }
-}
-
diff --git a/silx/resources/opencl/preprocess.cl b/silx/resources/opencl/preprocess.cl
deleted file mode 100644
index de35c48..0000000
--- a/silx/resources/opencl/preprocess.cl
+++ /dev/null
@@ -1,567 +0,0 @@
-/*
- * Project: Azimuthal regroupping OpenCL kernel for PyFAI.
- * Preprocessing program
- *
- *
- * Copyright (C) 2012-2017 European Synchrotron Radiation Facility
- * Grenoble, France
- *
- * Principal authors: J. Kieffer (kieffer@esrf.fr)
- * Last revision: 19/01/2017
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-/**
- * \file
- *
- * \brief OpenCL kernels for image array casting, array mem-setting and normalizing
- *
- * Constant to be provided at build time:
- * NIMAGE: size of the image
- *
- */
-
-#include "for_eclipse.h"
-
-/**
- * \brief cast values of an array of int8 into a float output array.
- *
- * - array_s8: Pointer to global memory with the input data as signed8 array
- * - array_float: Pointer to global memory with the output data as float array
- */
-__kernel void
-s8_to_float(__global char *array_s8,
- __global float *array_float
-)
-{
- int i = get_global_id(0);
- //Global memory guard for padding
- if (i < NIMAGE)
- array_float[i] = (float)array_s8[i];
-}
-
-
-/**
- * \brief cast values of an array of uint8 into a float output array.
- *
- * - array_u8: Pointer to global memory with the input data as unsigned8 array
- * - array_float: Pointer to global memory with the output data as float array
- */
-__kernel void
-u8_to_float(__global unsigned char *array_u8,
- __global float *array_float
-)
-{
- int i = get_global_id(0);
- //Global memory guard for padding
- if (i < NIMAGE)
- array_float[i] = (float)array_u8[i];
-}
-
-
-/**
- * \brief cast values of an array of int16 into a float output array.
- *
- * - array_s16: Pointer to global memory with the input data as signed16 array
- * - array_float: Pointer to global memory with the output data as float array
- */
-__kernel void
-s16_to_float(__global short *array_s16,
- __global float *array_float
-)
-{
- int i = get_global_id(0);
- //Global memory guard for padding
- if (i < NIMAGE)
- array_float[i] = (float)array_s16[i];
-}
-
-
-/**
- * \brief cast values of an array of uint16 into a float output array.
- *
- * - array_u16: Pointer to global memory with the input data as unsigned16 array
- * - array_float: Pointer to global memory with the output data as float array
- */
-__kernel void
-u16_to_float(__global unsigned short *array_u16,
- __global float *array_float
-)
-{
- int i = get_global_id(0);
- //Global memory guard for padding
- if (i < NIMAGE)
- array_float[i] = (float)array_u16[i];
-}
-
-/**
- * \brief cast values of an array of uint32 into a float output array.
- *
- * - array_u32: Pointer to global memory with the input data as unsigned32 array
- * - array_float: Pointer to global memory with the output data as float array
- */
-__kernel void
-u32_to_float(__global unsigned int *array_u32,
- __global float *array_float
-)
-{
- int i = get_global_id(0);
- //Global memory guard for padding
- if (i < NIMAGE)
- array_float[i] = (float)array_u32[i];
-}
-
-/**
- * \brief convert values of an array of int32 into a float output array.
- *
- * - array_int: Pointer to global memory with the data as unsigned32 array
- * - array_float: Pointer to global memory with the data float array
- */
-__kernel void
-s32_to_float(__global int *array_int,
- __global float *array_float
-)
-{
- int i = get_global_id(0);
- //Global memory guard for padding
- if (i < NIMAGE)
- array_float[i] = (float)(array_int[i]);
-}
-
-
-/**
- * Functions to be called from an actual kernel.
- * \brief Performs the normalization of input image by dark subtraction,
- * variance is propagated to second member of the float3
- * flatfield, solid angle, polarization and absorption are stored in
- * third member of the float3 returned.
- *
- * Invalid/Dummy pixels will all have the third-member set to 0, i.e. no weight
- *
- * - image Float pointer to global memory storing the input image.
- * - do_dark Bool/int: shall dark-current correction be applied ?
- * - dark Float pointer to global memory storing the dark image.
- * - do_flat Bool/int: shall flat-field correction be applied ?
- * - flat Float pointer to global memory storing the flat image.
- * - do_solidangle Bool/int: shall flat-field correction be applied ?
- * - solidangle Float pointer to global memory storing the solid angle of each pixel.
- * - do_polarization Bool/int: shall polarization correction be applied ?
- * - polarization Float pointer to global memory storing the polarization of each pixel.
- * - do_absorption Bool/int: shall absorption correction be applied ?
- * - absorption Float pointer to global memory storing the effective absoption of each pixel.
- * - do_mask perform mask correction ?
- * - mask Bool/char pointer to mask array
- * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
- * - dummy Float: value for bad pixels
- * - delta_dummy Float: precision for bad pixel value
- * - normalization_factor : divide the input by this value
- *
-**/
-
-static float3 _preproc3(const __global float *image,
- const __global float *variance,
- const char do_dark,
- const __global float *dark,
- const char do_dark_variance,
- const __global float *dark_variance,
- const char do_flat,
- const __global float *flat,
- const char do_solidangle,
- const __global float *solidangle,
- const char do_polarization,
- const __global float *polarization,
- const char do_absorption,
- const __global float *absorption,
- const char do_mask,
- const __global char *mask,
- const char do_dummy,
- const float dummy,
- const float delta_dummy,
- const float normalization_factor)
-{
- size_t i= get_global_id(0);
- float3 result = (float3)(0.0, 0.0, 0.0);
- if (i < NIMAGE)
- {
- if ((!do_mask) || (!mask[i]))
- {
- result.s0 = image[i];
- if (variance != 0)
- result.s1 = variance[i];
- result.s2 = normalization_factor;
- if ( (!do_dummy)
- ||((delta_dummy != 0.0f) && (fabs(result.s0-dummy) > delta_dummy))
- ||((delta_dummy == 0.0f) && (result.s0 != dummy)))
- {
- if (do_dark)
- result.s0 -= dark[i];
- if (do_dark_variance)
- result.s1 += dark_variance[i];
- if (do_flat)
- {
- float one_flat = flat[i];
- if ( (!do_dummy)
- ||((delta_dummy != 0.0f) && (fabs(one_flat-dummy) > delta_dummy))
- ||((delta_dummy == 0.0f) && (one_flat != dummy)))
- result.s2 *= one_flat;
- else
- result.s2 = 0.0f;
- }
- if (do_solidangle)
- result.s2 *= solidangle[i];
- if (do_polarization)
- result.s2 *= polarization[i];
- if (do_absorption)
- result.s2 *= absorption[i];
- if (isnan(result.s0) || isnan(result.s1) || isnan(result.s2) || (result.s2 == 0.0f))
- result = (float3)(0.0, 0.0, 0.0);
- }
- else
- {
- result = (float3)(0.0, 0.0, 0.0);
- }//end if do_dummy
- } // end if mask
- };//end if NIMAGE
- return result;
-};//end function
-
-
-/**
- * \brief Performs the normalization of input image by dark subtraction,
- * flatfield, solid angle, polarization and absorption division.
- *
- * Intensities of images are corrected by:
- * - dark (read-out) noise subtraction
- * - Solid angle correction (division)
- * - polarization correction (division)
- * - flat fiels correction (division)
- * Corrections are made in place unless the pixel is dummy.
- * Dummy pixels are left untouched so that they remain dummy
- *
- * - image Float pointer to global memory storing the input image.
- * - do_dark Bool/int: shall dark-current correction be applied ?
- * - dark Float pointer to global memory storing the dark image.
- * - do_flat Bool/int: shall flat-field correction be applied ?
- * - flat Float pointer to global memory storing the flat image.
- * - do_solidangle Bool/int: shall flat-field correction be applied ?
- * - solidangle Float pointer to global memory storing the solid angle of each pixel.
- * - do_polarization Bool/int: shall polarization correction be applied ?
- * - polarization Float pointer to global memory storing the polarization of each pixel.
- * - do_absorption Bool/int: shall absorption correction be applied ?
- * - absorption Float pointer to global memory storing the effective absoption of each pixel.
- * - do_mask perform mask correction ?
- * - mask Bool/char pointer to mask array
- * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
- * - dummy Float: value for bad pixels
- * - delta_dummy Float: precision for bad pixel value
- * - normalization_factor : divide the input by this value
- *
-**/
-
-__kernel void
-corrections(const __global float *image,
- const char do_dark,
- const __global float *dark,
- const char do_flat,
- const __global float *flat,
- const char do_solidangle,
- const __global float *solidangle,
- const char do_polarization,
- const __global float *polarization,
- const char do_absorption,
- const __global float *absorption,
- const char do_mask,
- const __global char *mask,
- const char do_dummy,
- const float dummy,
- const float delta_dummy,
- const float normalization_factor,
- __global float *output
- )
-{
- size_t i= get_global_id(0);
- float3 result = (float3)(0.0, 0.0, 0.0);
- if (i < NIMAGE)
- {
- result = _preproc3(image,
- 0,
- do_dark,
- dark,
- 0,
- 0,
- do_flat,
- flat,
- do_solidangle,
- solidangle,
- do_polarization,
- polarization,
- do_absorption,
- absorption,
- do_mask,
- mask,
- do_dummy,
- dummy,
- delta_dummy,
- normalization_factor);
- if (result.s2 != 0.0f)
- output[i] = result.s0 / result.s2;
- else
- output[i] = dummy;
- };//end if NIMAGE
-
-};//end kernel
-
-
-/**
- * \brief Performs Normalization of input image with float2 output (num,denom)
- *
- * Intensities of images are corrected by:
- * - dark (read-out) noise subtraction for the data
- * - Solid angle correction (denominator)
- * - polarization correction (denominator)
- * - flat fiels correction (denominator)
- *
- * Corrections are made out of place.
- * Dummy pixels set both the numerator and denominator to 0
- *
- * - image Float pointer to global memory storing the input image.
- * - do_dark Bool/int: shall dark-current correction be applied ?
- * - dark Float pointer to global memory storing the dark image.
- * - do_flat Bool/int: shall flat-field correction be applied ?
- * - flat Float pointer to global memory storing the flat image.
- * - do_solidangle Bool/int: shall flat-field correction be applied ?
- * - solidangle Float pointer to global memory storing the solid angle of each pixel.
- * - do_polarization Bool/int: shall flat-field correction be applied ?
- * - polarization Float pointer to global memory storing the polarization of each pixel.
- * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
- * - dummy Float: value for bad pixels
- * - delta_dummy Float: precision for bad pixel value
- * - normalization_factor : divide the input by this value
- *
- *
-**/
-__kernel void
-corrections2(const __global float *image,
- const char do_dark,
- const __global float *dark,
- const char do_flat,
- const __global float *flat,
- const char do_solidangle,
- const __global float *solidangle,
- const char do_polarization,
- const __global float *polarization,
- const char do_absorption,
- const __global float *absorption,
- const char do_mask,
- const __global char *mask,
- const char do_dummy,
- const float dummy,
- const float delta_dummy,
- const float normalization_factor,
- __global float2 *output
- )
-{
- size_t i= get_global_id(0);
- float3 result = (float3)(0.0, 0.0, 0.0);
- if (i < NIMAGE)
- {
- result = _preproc3(image,
- 0,
- do_dark,
- dark,
- 0,
- 0,
- do_flat,
- flat,
- do_solidangle,
- solidangle,
- do_polarization,
- polarization,
- do_absorption,
- absorption,
- do_mask,
- mask,
- do_dummy,
- dummy,
- delta_dummy,
- normalization_factor);
- output[i] = (float2)(result.s0, result.s2);
- };//end if NIMAGE
-};//end kernel
-
-/**
- * \brief Performs Normalization of input image with float3 output (signal, variance, normalization) assuming poissonian signal
- *
- * Intensities of images are corrected by:
- * - dark (read-out) noise subtraction for the data
- * - Solid angle correction (denominator)
- * - polarization correction (denominator)
- * - flat fiels correction (denominator)
- *
- * Corrections are made out of place.
- * Dummy pixels set both the numerator and denominator to 0
- *
- * - image Float pointer to global memory storing the input image.
- * - do_dark Bool/int: shall dark-current correction be applied ?
- * - dark Float pointer to global memory storing the dark image.
- * - do_flat Bool/int: shall flat-field correction be applied ?
- * - flat Float pointer to global memory storing the flat image.
- * - do_solidangle Bool/int: shall flat-field correction be applied ?
- * - solidangle Float pointer to global memory storing the solid angle of each pixel.
- * - do_polarization Bool/int: shall flat-field correction be applied ?
- * - polarization Float pointer to global memory storing the polarization of each pixel.
- * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
- * - dummy Float: value for bad pixels
- * - delta_dummy Float: precision for bad pixel value
- * - normalization_factor : divide the input by this value
- *
- *
-**/
-__kernel void
-corrections3Poisson( const __global float *image,
- const char do_dark,
- const __global float *dark,
- const char do_flat,
- const __global float *flat,
- const char do_solidangle,
- const __global float *solidangle,
- const char do_polarization,
- const __global float *polarization,
- const char do_absorption,
- const __global float *absorption,
- const char do_mask,
- const __global char *mask,
- const char do_dummy,
- const float dummy,
- const float delta_dummy,
- const float normalization_factor,
- __global float3 *output
- )
-{
- size_t i= get_global_id(0);
- float3 result = (float3)(0.0, 0.0, 0.0);
- if (i < NIMAGE)
- {
- result = _preproc3(image,
- image,
- do_dark,
- dark,
- do_dark,
- dark,
- do_flat,
- flat,
- do_solidangle,
- solidangle,
- do_polarization,
- polarization,
- do_absorption,
- absorption,
- do_mask,
- mask,
- do_dummy,
- dummy,
- delta_dummy,
- normalization_factor);
- output[i] = result;
- };//end if NIMAGE
-};//end kernel
-
-
-/**
- * \brief Performs Normalization of input image with float3 output (signal, variance, normalization)
- *
- * Intensities of images are corrected by:
- * - dark (read-out) noise subtraction for the data
- * - Solid angle correction (division)
- * - polarization correction (division)
- * - flat fiels correction (division)
- * Corrections are made in place unless the pixel is dummy.
- * Dummy pixels are left untouched so that they remain dummy
- *
- * - image Float pointer to global memory storing the input image.
- * - do_dark Bool/int: shall dark-current correction be applied ?
- * - dark Float pointer to global memory storing the dark image.
- * - do_flat Bool/int: shall flat-field correction be applied ?
- * - flat Float pointer to global memory storing the flat image.
- * - do_solidangle Bool/int: shall flat-field correction be applied ?
- * - solidangle Float pointer to global memory storing the solid angle of each pixel.
- * - do_polarization Bool/int: shall flat-field correction be applied ?
- * - polarization Float pointer to global memory storing the polarization of each pixel.
- * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
- * - dummy Float: value for bad pixels
- * - delta_dummy Float: precision for bad pixel value
- * - normalization_factor : divide the input by this value
- *
- *
-**/
-
-__kernel void
-corrections3(const __global float *image,
- const __global float *variance,
- const char do_dark,
- const __global float *dark,
- const char do_dark_variance,
- const __global float *dark_variance,
- const char do_flat,
- const __global float *flat,
- const char do_solidangle,
- const __global float *solidangle,
- const char do_polarization,
- const __global float *polarization,
- const char do_absorption,
- const __global float *absorption,
- const char do_mask,
- const __global char *mask,
- const char do_dummy,
- const float dummy,
- const float delta_dummy,
- const float normalization_factor,
- __global float3 *output
- )
-{
- size_t i= get_global_id(0);
- float3 result = (float3)(0.0, 0.0, 0.0);
- if (i < NIMAGE)
- {
- result = _preproc3( image,
- variance,
- do_dark,
- dark,
- do_dark_variance,
- dark_variance,
- do_flat,
- flat,
- do_solidangle,
- solidangle,
- do_polarization,
- polarization,
- do_absorption,
- absorption,
- do_mask,
- mask,
- do_dummy,
- dummy,
- delta_dummy,
- normalization_factor);
- output[i] = result;
- };//end if NIMAGE
-};//end kernel
-
-
diff --git a/silx/resources/opencl/proj.cl b/silx/resources/opencl/proj.cl
deleted file mode 100644
index 2a6d870..0000000
--- a/silx/resources/opencl/proj.cl
+++ /dev/null
@@ -1,345 +0,0 @@
-/*
- * Copyright (C) 2017-2017 European Synchrotron Radiation Facility
- * Grenoble, France
- *
- * Based on the projector of PyHST2 - https://forge.epn-campus.eu/projects/pyhst2
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- */
-
-/*******************************************************************************/
-/************************ GPU VERSION (with textures) **************************/
-/*******************************************************************************/
-
-#ifndef DONT_USE_TEXTURES
-kernel void forward_kernel(
- global float *d_Sino,
- read_only image2d_t d_slice,
- int dimslice,
- int num_bins,
- global float* angles_per_project ,
- float axis_position,
- global float *d_axis_corrections,
- global int* d_beginPos ,
- global int* d_strideJoseph,
- global int* d_strideLine ,
- int num_projections,
- int dimrecx,
- int dimrecy,
- float cpu_offset_x,
- float cpu_offset_y,
- int josephnoclip,
- int normalize)
-{
- const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
- const int tidx = get_local_id(0);
- const int bidx = get_group_id(0);
- const int tidy = get_local_id(1);
- const int bidy = get_group_id(1);
- float angle;
- float cos_angle,sin_angle ;
-
- local float corrections[16];
- local int beginPos[16*2];
- local int strideJoseph[16*2];
- local int strideLine[16*2];
-
- // thread will use corrections[tidy]
- // All are read by first warp
- int offset, OFFSET;
- switch(tidy) {
- case 0:
- corrections[ tidx ]= d_axis_corrections[ bidy*16+tidx];
- break;
- case 1:
- case 2:
- offset = 16*(tidy-1);
- OFFSET = dimrecy*(tidy-1);
- beginPos [offset + tidx ]= d_beginPos[ OFFSET+ bidy*16+tidx] ;
- break;
- case 3:
- case 4:
- offset = 16*(tidy-3);
- OFFSET = dimrecy*(tidy-3);
- strideJoseph[offset + tidx ]= d_strideJoseph[OFFSET + bidy*16+tidx] ;
- break;
- case 5:
- case 6:
- offset = 16*(tidy-5);
- OFFSET = dimrecy*(tidy-5);
- strideLine[offset + tidx ]= d_strideLine[OFFSET + bidy*16+tidx] ;
- break;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- angle = angles_per_project[ bidy*16+tidy ] ;
- cos_angle = cos(angle);
- sin_angle = sin(angle);
-
- if(fabs(cos_angle) > 0.70710678f ) {
- if( cos_angle>0) {
- cos_angle = cos(angle);
- sin_angle = sin(angle);
- }
- else {
- cos_angle = -cos(angle);
- sin_angle = -sin(angle);
- }
- }
- else {
- if( sin_angle>0) {
- cos_angle = sin(angle);
- sin_angle = -cos(angle);
- }
- else {
- cos_angle = -sin(angle);
- sin_angle = cos(angle);
- }
- }
- float res=0.0f;
- float axis_corr = axis_position + corrections[ tidy ];
- float axis = axis_position ;
- float xpix = ( bidx*16+tidx )-cpu_offset_x;
- float posx = axis*(1.0f-sin_angle/cos_angle ) +(xpix-(axis_corr) )/cos_angle ;
-
- float shiftJ = sin_angle/cos_angle;
- float x1 = min(-sin_angle/cos_angle ,0.f);
- float x2 = max(-sin_angle/cos_angle ,0.f);
-
- float Area;
- Area=1.0f/cos_angle;
- int stlA, stlB , stlAJ, stlBJ ;
- stlA=strideLine[16+tidy];
- stlB=strideLine[tidy];
- stlAJ=strideJoseph[16+tidy];
- stlBJ=strideJoseph[tidy];
-
- int beginA = beginPos[16+tidy ];
- int beginB = beginPos[tidy ];
- float add;
- int l;
-
- if(josephnoclip) {
- for(int j=0; j<dimslice; j++) { // j: Joseph
- x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f;
- x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f;
- add = read_imagef(d_slice, sampler, (float2) (x1, x2)).x; // add = tex2D(texSlice, x1,x2);
- res += add;
- posx += shiftJ;
- }
- }
- else {
- for(int j=0; j<dimslice; j++) { // j: Joseph
- x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f;
- x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f;
- l = (x1>=0.0f )*(x1<(dimslice+2))*( x2>=0.0f)*( x2<(dimslice+2) ) ;
- add = read_imagef(d_slice, sampler, (float2) (x1, x2)).x; // add = tex2D(texSlice, x1,x2);
- res += add*l;
- posx += shiftJ;
- }
- }
-
- if((bidy*16 + tidy) < num_projections && (bidx*16 + tidx) < num_bins) {
- res *= Area;
- if (normalize)
- res *= M_PI_F * 0.5f / num_projections;
- d_Sino[dimrecx*(bidy*16 + tidy) + (bidx*16 + tidx)] = res;
- }
-}
-#endif
-
-
-/*******************************************************************************/
-/********************* CPU VERSION (without textures) **************************/
-/*******************************************************************************/
-
-
-kernel void forward_kernel_cpu(
- global float *d_Sino,
- global float* d_slice,
- int dimslice,
- int num_bins,
- global float* angles_per_project ,
- float axis_position,
- global float *d_axis_corrections,
- global int* d_beginPos ,
- global int* d_strideJoseph,
- global int* d_strideLine ,
- int num_projections,
- int dimrecx,
- int dimrecy,
- float cpu_offset_x,
- float cpu_offset_y,
- int josephnoclip,
- int normalize)
-{
-
- const int tidx = get_local_id(0);
- const int bidx = get_group_id(0);
- const int tidy = get_local_id(1);
- const int bidy = get_group_id(1);
- float angle;
- float cos_angle,sin_angle ;
-
- local float corrections[16];
- local int beginPos[16*2];
- local int strideJoseph[16*2];
- local int strideLine[16*2];
-
- // thread will use corrections[tidy]
- // All are read by first warp
- int offset, OFFSET;
- switch(tidy) {
- case 0:
- corrections[ tidx ]= d_axis_corrections[ bidy*16+tidx];
- break;
- case 1:
- case 2:
- offset = 16*(tidy-1);
- OFFSET = dimrecy*(tidy-1);
- beginPos [offset + tidx ]= d_beginPos[ OFFSET+ bidy*16+tidx] ;
- break;
- case 3:
- case 4:
- offset = 16*(tidy-3);
- OFFSET = dimrecy*(tidy-3);
- strideJoseph[offset + tidx ]= d_strideJoseph[OFFSET + bidy*16+tidx] ;
- break;
- case 5:
- case 6:
- offset = 16*(tidy-5);
- OFFSET = dimrecy*(tidy-5);
- strideLine[offset + tidx ]= d_strideLine[OFFSET + bidy*16+tidx] ;
- break;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- angle = angles_per_project[ bidy*16+tidy ] ;
- cos_angle = cos(angle);
- sin_angle = sin(angle);
-
- if(fabs(cos_angle) > 0.70710678f ) {
- if( cos_angle>0) {
- cos_angle = cos(angle);
- sin_angle = sin(angle);
- }
- else {
- cos_angle = -cos(angle);
- sin_angle = -sin(angle);
- }
- }
- else {
- if( sin_angle>0) {
- cos_angle = sin(angle);
- sin_angle = -cos(angle);
- }
- else {
- cos_angle = -sin(angle);
- sin_angle = cos(angle);
- }
- }
- float res=0.0f;
- float axis_corr = axis_position + corrections[ tidy ];
- float axis = axis_position ;
- float xpix = ( bidx*16+tidx )-cpu_offset_x;
- float posx = axis*(1.0f-sin_angle/cos_angle ) +(xpix-(axis_corr) )/cos_angle ;
-
- float shiftJ = sin_angle/cos_angle;
- float x1 = min(-sin_angle/cos_angle ,0.f);
- float x2 = max(-sin_angle/cos_angle ,0.f);
-
- float Area;
- Area=1.0f/cos_angle;
- int stlA, stlB , stlAJ, stlBJ ;
- stlA=strideLine[16+tidy];
- stlB=strideLine[tidy];
- stlAJ=strideJoseph[16+tidy];
- stlBJ=strideJoseph[tidy];
-
- int beginA = beginPos[16+tidy ];
- int beginB = beginPos[tidy ];
- int l;
-
- int ym, yp, xm, xp;
- float yc, xc;
- float val;
- if(josephnoclip) {
- for(int j=0; j<dimslice; j++) { // j: Joseph
- x1 = beginA +(posx)*stlA + (j)*stlAJ+1.0f;
- x2 = beginB +(posx)*stlB + (j)*stlBJ+1.0f;
- /*
- Bilinear interpolation
- */
- yc = fmin(fmax(x2, 0.0f), ((dimslice+2) - 1.0f)); // y_clipped
- ym = (int) floor(yc); // y_minus
- yp = (int) ceil(yc); // y_plus
-
- xc = fmin(fmax(x1, 0.0f), ((dimslice+2) - 1.0f)); // x_clipped
- xm = (int) floor(xc); // x_minus
- xp = (int) ceil(xc); // x_plus
-
- if ((ym == yp) && (xm == xp)) val = d_slice[ym*(dimslice+2) + xm];
- else if (ym == yp) val = (d_slice[ym*(dimslice+2) + xm] * (xp - xc)) + (d_slice[ym*(dimslice+2) + xp] * (xc - xm));
- else if (xm == xp) val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc)) + (d_slice[yp*(dimslice+2) + xm] * (yc - ym));
- else val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc) * (xp - xc))
- + (d_slice[yp*(dimslice+2) + xm] * (yc - ym) * (xp - xc))
- + (d_slice[ym*(dimslice+2) + xp] * (yp - yc) * (xc - xm))
- + (d_slice[yp*(dimslice+2) + xp] * (yc - ym) * (xc - xm));
- // ----------
- res += val;
- posx += shiftJ;
- }
- }
- else {
- for(int j=0; j<dimslice; j++) { // j: Joseph
- x1 = beginA +(posx)*stlA + (j)*stlAJ+1.5f;
- x2 = beginB +(posx)*stlB + (j)*stlBJ+1.5f;
- l = (x1>=0.0f )*(x1<(dimslice+2))*( x2>=0.0f)*( x2<(dimslice+2) ) ;
- /*
- Bilinear interpolation
- */
- yc = fmin(fmax(x2, 0.0f), ((dimslice+2) - 1.0f)); // y_clipped
- ym = (int) floor(yc); // y_minus
- yp = (int) ceil(yc); // y_plus
-
- xc = fmin(fmax(x1, 0.0f), ((dimslice+2) - 1.0f)); // x_clipped
- xm = (int) floor(xc); // x_minus
- xp = (int) ceil(xc); // x_plus
-
- if ((ym == yp) && (xm == xp)) val = d_slice[ym*(dimslice+2) + xm];
- else if (ym == yp) val = (d_slice[ym*(dimslice+2) + xm] * (xp - xc)) + (d_slice[ym*(dimslice+2) + xp] * (xc - xm));
- else if (xm == xp) val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc)) + (d_slice[yp*(dimslice+2) + xm] * (yc - ym));
- else val = (d_slice[ym*(dimslice+2) + xm] * (yp - yc) * (xp - xc))
- + (d_slice[yp*(dimslice+2) + xm] * (yc - ym) * (xp - xc))
- + (d_slice[ym*(dimslice+2) + xp] * (yp - yc) * (xc - xm))
- + (d_slice[yp*(dimslice+2) + xp] * (yc - ym) * (xc - xm));
- // ----------
- res += val*l;
- posx += shiftJ;
- }
- }
-
- if((bidy*16 + tidy) < num_projections && (bidx*16 + tidx) < num_bins) {
- res *= Area;
- if (normalize)
- res *= M_PI_F * 0.5f / num_projections;
- d_Sino[dimrecx*(bidy*16 + tidy) + (bidx*16 + tidx)] = res;
- }
-}
diff --git a/silx/resources/opencl/sparse.cl b/silx/resources/opencl/sparse.cl
deleted file mode 100644
index c99a0e9..0000000
--- a/silx/resources/opencl/sparse.cl
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (C) 2016-2019 European Synchrotron Radiation Facility
- * Grenoble, France
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#ifndef IMAGE_WIDTH
- #error "Please define IMAGE_WIDTH parameter"
-#endif
-
-#ifndef DTYPE
- #error "Please define DTYPE parameter"
-#endif
-
-#ifndef IDX_DTYPE
- #error "Please define IDX_DTYPE parameter"
-#endif
-
-
-
-/**
- * Densify a matric from CSR format to "dense" 2D format.
- * The input CSR data consists in 3 arrays: (data, ind, iptr).
- * The output array is a 2D array of dimensions IMAGE_WIDTH * image_height.
- *
- * data: 1D array containing the nonzero data items.
- * ind: 1D array containing the column indices of the nonzero data items.
- * iptr: 1D array containing indirection indices, such that range
- * [iptr[i], iptr[i+1]-1] of "data" and "ind" contain the relevant data
- * of output row "i".
- * output: 2D array containing the densified data.
- * image_height: height (number of rows) of the output data.
-**/
-
-kernel void densify_csr(
- const global DTYPE* data,
- const global IDX_DTYPE* ind,
- const global IDX_DTYPE* iptr,
- global DTYPE* output,
- int image_height
-)
-{
- uint tid = get_local_id(0);
- uint row_idx = get_global_id(1);
- if ((tid >= IMAGE_WIDTH) || (row_idx >= image_height)) return;
-
- local DTYPE line[IMAGE_WIDTH];
-
- // Memset
- //~ #pragma unroll
- for (int k = 0; tid+k < IMAGE_WIDTH; k += get_local_size(0)) {
- if (tid+k >= IMAGE_WIDTH) break;
- line[tid+k] = 0.0f;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
-
- uint start = iptr[row_idx], end = iptr[row_idx+1];
- //~ #pragma unroll
- for (int k = start; k < end; k += get_local_size(0)) {
- // Current work group handles one line of the final array
- // on the current line, write data[start+tid] at column index ind[start+tid]
- if (k+tid < end)
- line[ind[k+tid]] = data[k+tid];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // write the current line (shared mem) into the output array (global mem)
- //~ #pragma unroll
- for (int k = 0; tid+k < IMAGE_WIDTH; k += get_local_size(0)) {
- output[row_idx*IMAGE_WIDTH + tid+k] = line[tid+k];
- if (k+tid >= IMAGE_WIDTH) return;
- }
-}
diff --git a/silx/resources/opencl/statistics.cl b/silx/resources/opencl/statistics.cl
deleted file mode 100644
index 47d925b..0000000
--- a/silx/resources/opencl/statistics.cl
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- * Project: Silx statics calculation
- *
- *
- *
- * Copyright (C) 2012-2021 European Synchrotron Radiation Facility
- * Grenoble, France
- *
- * Principal authors: J. Kieffer (kieffer@esrf.fr)
- * Last revision: 17/05/2021
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-/**
- * \file
- *
- * \brief OpenCL kernels for min, max, mean and std calculation
- *
- * This module provides two functions to perform the `map` and the `reduce`
- * to be used with pyopencl reduction to calculate in a single pass the minimum,
- * maximum, sum, count, mean and standart deviation for an array.
- *
- * So beside the reduction mechanisme from pyopencl, this algorithm implementes equations from
- * https://dbs.ifi.uni-heidelberg.de/files/Team/eschubert/publications/SSDBM18-covariance-authorcopy.pdf
- *
- * let A and B be 2 disjoint partition of all elements
- *
- * Omega_A = sum_{i \in A}(omaga_i) The sum of all weights
- * V_A is the weighted sum of the signal over the partition
- * VV_A is the weighted sum of deviation squarred
- *
- * With this the mean is V / Omega and the variance equals VV / omega.
- *
- * Redction operator performs:
- * Omega_{AB} = Omega_A + Omega_B
- * V_{AB} = V_A + V_B
- * VV{AB} = VV_A + VV_B + (Omega_A*V_B-Omega_B*V_A)² / (Omega_A * Omega_B * Omega_{AB})
- *
- * To avoid any numerical degradation, the doubleword library is used to perform all floating point operations.
- *
- */
-#include "for_eclipse.h"
-
-/* \brief read a value at given position and initialize the float8 for the reduction
- *
- * The float8 returned contains:
- * s0: minimum value
- * s1: maximum value
- * s2: Omega_h count number of valid pixels
- * s3: Omega_l error associated to the count
- * s4: V_h sum of signal
- * s5: V_l error associated to the sum of signal
- * s6: VVh variance*count
- * s7: VVl error associated to variance*count
- *
- */
-static inline float8 map_statistics(global float* data, int position)
-{
- float value = data[position];
- float8 result;
-
- if (isfinite(value))
- {
- result = (float8)(value, value, 1.0f, 0.0f, value, 0.0f, 0.0f, 0.0f);
- // min max cnt_h cnt_l sum_h sum_l M2_h M2_l
- }
- else
- {
- result = (float8)(FLT_MAX, -FLT_MAX, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
- // min max cnt_h cnt_l sum_h sum_l M2_h M2_l
- }
- return result;
-}
-
-/* \brief reduction function associated to the statistics.
- *
- * this is described in:
- * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
- *
- * The float8 used here contain contains:
- * s0: minimum value
- * s1: maximum value
- * s2: count number of valid pixels
- * s3: count (error associated to)
- * s4: sum of valid pixels
- * s5: sum (error associated to)
- * s6: M=variance*(count-1)
- * s7: M=variance*(count-1) (error associated to)
- *
- */
-
-static inline float8 reduce_statistics(float8 a, float8 b)
-{
- float2 sum_a, sum_b, M_a, M_b, count_a, count_b;
- float2 count, sum;
- float2 M, delta, delta2, omega3;
- //test on count
- if (a.s2 == 0.0f)
- {
- return b;
- }
- else
- {
- count_a = (float2)(a.s2, a.s3);
- sum_a = (float2)(a.s4, a.s5);
- M_a = (float2)(a.s6, a.s7);
- }
- //test on count
- if (b.s2 == 0.0f)
- {
- return a;
- }
- else
- {
- count_b = (float2)(b.s2, b.s3);
- sum_b = (float2)(b.s4, b.s5);
- M_b = (float2)(b.s6, b.s7);
- }
- // count = count_a + count_b
- count = dw_plus_dw(count_a, count_b);
- // sum = sum_a + sum_b
- sum = dw_plus_dw(sum_a, sum_b);
-
- // M2 = M_a + M_b + (Omega_A*V_B-Omega_B*V_A)² / (Omega_A * Omega_B * Omega_{AB})
- M = dw_plus_dw(M_a, M_b);
- delta = dw_plus_dw(dw_times_dw(count_b, sum_a),
- -dw_times_dw(count_a, sum_b));
- delta2 = dw_times_dw(delta, delta);
- omega3 = dw_times_dw(count, dw_times_dw(count_a, count_b));
- M = dw_plus_dw(M, dw_div_dw(delta2, omega3));
-
- float8 result = (float8)(min(a.s0, b.s0), max(a.s1, b.s1),
- count.s0, count.s1,
- sum.s0, sum.s1,
- M.s0, M.s1);
- return result;
-}
-
-/* \brief reduction function associated to the statistics without compensated arithmetics.
- *
- * this is described in:
- * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
- *
- * The float8 used here contain contains:
- * s0: minimum value
- * s1: maximum value
- * s2: count number of valid pixels (major)
- * s3: count number of valid pixels (minor)
- * s4: sum of valid pixels (major)
- * s5: sum of valid pixels (minor)
- * s6: variance*count (major)
- * s7: variance*count (minor)
- *
- */
-
-static inline float8 reduce_statistics_simple(float8 a, float8 b)
-{
- float sum_a, sum_b, M_a, M_b, count_a, count_b;
-
- //test on count
- if (a.s2 == 0.0f)
- {
- return b;
- }
- else
- {
- count_a = a.s2;
- sum_a = a.s4;
- M_a = a.s6;
- }
- //test on count
- if (b.s2 == 0.0f)
- {
- return a;
- }
- else
- {
- count_b = b.s2;
- sum_b = b.s4;
- M_b = b.s6;
- }
- float count = count_a + count_b;
- float sum = sum_a + sum_b;
- float delta = sum_a*count_b - sum_b*count_a;
- float delta2 = delta * delta;
- float M2 = M_a + M_b + delta2/(count*count_a*count_b);
- float8 result = (float8)(min(a.s0, b.s0), max(a.s1, b.s1),
- count, 0.0f,
- sum, 0.0f,
- M2, 0.0f);
- return result;
-}
-
-
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-
-// unpack a double in two floats such as the sum of the two is the double number
-static inline float2 unpack_double(double inp){
- float major = (float) inp;
- float minor = (float) (inp - major);
- return (float2)(major, minor);
-}
-
-// pack two floats into a double
-static inline double pack_double(float major, float minor){
- return (double)major + (double)minor;
-}
-
-/* \brief reduction function associated to the statistics using double precision arithmetics
- *
- * this is described in:
- * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
- *
- * The float8 used here contain contains:
- * s0: minimum value
- * s1: maximum value
- * s2: count number of valid pixels (major)
- * s3: count number of valid pixels (minor)
- * s4: sum of valid pixels (major)
- * s5: sum of valid pixels (minor)
- * s6: variance*count (major)
- * s7: variance*count (minor)
- *
- */
-
-static inline float8 reduce_statistics_double(float8 a, float8 b)
-{
- double sum_a, sum_b, M_a, M_b, count_a, count_b;
-
- //test on count
- if (a.s2 == 0.0)
- {
- return b;
- }
- else
- {
- count_a = pack_double(a.s2, a.s3);
- sum_a = pack_double(a.s4,a.s5);
- M_a = pack_double(a.s6, a.s7);
- }
- //test on count
- if (b.s2 == 0.0)
- {
- return a;
- }
- else
- {
- count_b = pack_double(b.s2, b.s3);
- sum_b = pack_double(b.s4, b.s5);
- M_b = pack_double(b.s6, b.s7);
- }
- double count = count_a + count_b;
- double sum = sum_a + sum_b;
- double delta = sum_a*count_b - sum_b*count_a;
- double delta2 = delta * delta;
- double M2 = M_a + M_b + delta2/(count*count_a*count_b);
- float8 result = (float8)((float2)(min(a.s0, b.s0), max(a.s1, b.s1)),
- unpack_double(count),
- unpack_double( sum),
- unpack_double( M2));
- return result;
-}
-
-#endif \ No newline at end of file