diff options
author | Picca Frédéric-Emmanuel <picca@synchrotron-soleil.fr> | 2017-08-18 14:48:52 +0200 |
---|---|---|
committer | Picca Frédéric-Emmanuel <picca@synchrotron-soleil.fr> | 2017-08-18 14:48:52 +0200 |
commit | f7bdc2acff3c13a6d632c28c4569690ab106eed7 (patch) | |
tree | 9d67cdb7152ee4e711379e03fe0546c7c3b97303 /silx/resources/opencl |
Import Upstream version 0.5.0+dfsg
Diffstat (limited to 'silx/resources/opencl')
-rw-r--r-- | silx/resources/opencl/addition.cl | 36 | ||||
-rw-r--r-- | silx/resources/opencl/bitonic.cl | 557 | ||||
-rw-r--r-- | silx/resources/opencl/medfilt.cl | 141 | ||||
-rw-r--r-- | silx/resources/opencl/preprocess.cl | 567 |
4 files changed, 1301 insertions, 0 deletions
diff --git a/silx/resources/opencl/addition.cl b/silx/resources/opencl/addition.cl new file mode 100644 index 0000000..8ecfd4e --- /dev/null +++ b/silx/resources/opencl/addition.cl @@ -0,0 +1,36 @@ +/* + * Project: SIFT: An algorithm for image alignement + * + * Copyright (C) 2013-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. Kieffer (kieffer@esrf.fr) + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +__kernel void addition(__global float* a, __global float* b, __global float* res, int N) +{ + unsigned int i = get_global_id(0); + if( i<N ){ + res[i] = a[i] + b[i]; + } +}
\ No newline at end of file diff --git a/silx/resources/opencl/bitonic.cl b/silx/resources/opencl/bitonic.cl new file mode 100644 index 0000000..f1cb55c --- /dev/null +++ b/silx/resources/opencl/bitonic.cl @@ -0,0 +1,557 @@ +/*############################################################################ +# Sort elements within a vector by Matthew Scarpino, +# Taken from his book "OpenCL in Action", +# November 2011 ISBN 9781617290176 +# Original license for the code: "public domain" +# +# Originally this code is public domain. The MIT license has been added +# by J. Kieffer (jerome.kieffer@esrf.eu) to provide a disclaimer. +# J. Kieffer does not claim authorship of this code developed by . +# +# Copyright (c) 2011 Matthew Scarpino +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +#############################################################################*/ + +#define VECTOR_SORT_BOOK(input, dir) \ + comp = abs(input > shuffle(input, mask2)) ^ dir; \ + input = shuffle(input, comp * 2 + add2); \ + comp = abs(input > shuffle(input, mask1)) ^ dir; \ + input = shuffle(input, comp + add1); \ + + +#define VECTOR_SWAP_BOOK(in1, in2, dir) \ + input1 = in1; input2 = in2; \ + comp = (abs(input1 > input2) ^ dir) * 4 + add3; \ + in1 = shuffle2(input1, input2, comp); \ + in2 = shuffle2(input2, input1, comp); \ + + +// The _FILE extension correspond to the formula found in the "OpenCL in Action" supplementary files +#define VECTOR_SORT_FILE(input, dir) \ + comp = (input < shuffle(input, mask2)) ^ dir; \ + input = shuffle(input, as_uint4(comp * 2 + add2)); \ + comp = (input < shuffle(input, mask1)) ^ dir; \ + input = shuffle(input, as_uint4(comp + add1)); \ + + +#define VECTOR_SWAP_FILE(input1, input2, dir) \ + temp = input1; \ + comp = ((input1 < input2) ^ dir) * 4 + add3; \ + input1 = shuffle2(input1, input2, as_uint4(comp)); \ + input2 = shuffle2(input2, temp, as_uint4(comp)); \ + + + +// Functions to be called from an actual kernel. + +static float8 my_sort_file(uint local_id, uint group_id, uint local_size, + float8 input, __local float4 *l_data){ + float4 input1, input2, temp; + float8 output; + + int dir; + uint id, size, stride; + int4 comp; + + uint4 mask1 = (uint4)(1, 0, 3, 2); + uint4 mask2 = (uint4)(2, 3, 0, 1); + uint4 mask3 = (uint4)(3, 2, 1, 0); + + int4 add1 = (int4)(1, 1, 3, 3); + int4 add2 = (int4)(2, 3, 2, 3); + int4 add3 = (int4)(1, 2, 2, 3); + + // retrieve input data + input1 = (float4)(input.s0, input.s1, input.s2, input.s3); + input2 = (float4)(input.s4, input.s5, input.s6, input.s7); + + // Find global address + id = local_id * 2; + + /* Sort input 1 - ascending */ + comp = input1 < shuffle(input1, mask1); + input1 = shuffle(input1, as_uint4(comp + add1)); + comp = input1 < shuffle(input1, mask2); + input1 = shuffle(input1, as_uint4(comp * 2 + add2)); + comp = input1 < shuffle(input1, mask3); + input1 = shuffle(input1, as_uint4(comp + add3)); + + /* Sort input 2 - descending */ + comp = input2 > shuffle(input2, mask1); + input2 = shuffle(input2, as_uint4(comp + add1)); + comp = input2 > shuffle(input2, mask2); + input2 = shuffle(input2, as_uint4(comp * 2 + add2)); + comp = input2 > shuffle(input2, mask3); + input2 = shuffle(input2, as_uint4(comp + add3)); + + /* Swap corresponding elements of input 1 and 2 */ + add3 = (int4)(4, 5, 6, 7); + dir = - (int) (local_id % 2); + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + + /* Sort data and store in local memory */ + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + + /* Create bitonic set */ + for(size = 2; size < local_size; size <<= 1) { + dir = - (int) (local_id/size & 1) ; + + for(stride = size; stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = local_id + (local_id/stride)*stride; + VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) + } + + barrier(CLK_LOCAL_MEM_FENCE); + id = local_id * 2; + input1 = l_data[id]; + input2 = l_data[id+1]; + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + } + + /* Perform bitonic merge */ + dir = - (int) (group_id % 2); + for(stride = local_size; stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = local_id + (local_id/stride)*stride; + VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + /* Perform final sort */ + id = local_id * 2; + input1 = l_data[id]; input2 = l_data[id+1]; + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + + // setup output and return it + output = (float8)(input1, input2); + return output; +} + +static float8 my_sort_book(uint local_id, uint group_id, uint local_size, + float8 input, __local float4 *l_data){ + float4 input1, input2, temp; + float8 output; + uint4 comp, swap, mask1, mask2, add1, add2, add3; + uint id, dir, size, stride; + mask1 = (uint4)(1, 0, 3, 2); + swap = (uint4)(0, 0, 1, 1); + add1 = (uint4)(0, 0, 2, 2); + mask2 = (uint4)(2, 3, 0, 1); + add2 = (uint4)(0, 1, 0, 1); + add3 = (uint4)(0, 1, 2, 3); + + // retrieve input data + input1 = (float4)(input.s0, input.s1, input.s2, input.s3); + input2 = (float4)(input.s4, input.s5, input.s6, input.s7); + + // Find global address + id = local_id * 2; + + //Sort first vector + + comp = abs(input1 > shuffle(input1, mask1)); + input1 = shuffle(input1, comp ^ swap + add1); + comp = abs(input1 > shuffle(input1, mask2)); + input1 = shuffle(input1, comp * 2 + add2); + comp = abs(input1 > shuffle(input1, mask1)); + input1 = shuffle(input1, comp + add1); + + //Sort second vector + comp = abs(input2 < shuffle(input2, mask1)); + input2 = shuffle(input2, comp ^ swap + add1); + comp = abs(input2 < shuffle(input2, mask2)); + input2 = shuffle(input2, comp * 2 + add2); + comp = abs(input2 < shuffle(input2, mask1)); + input2 = shuffle(input2, comp + add1); + + // Swap elements + dir = local_id % 2; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + + // Perform upper stages + for(size = 2; size < local_size; size <<= 1) { + dir = local_id/size & 1; + + //Perform lower stages + for(stride = size; stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = local_id + (local_id/stride)*stride; + VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + id = local_id * 2; + input1 = l_data[id]; + input2 = l_data[id+1]; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + } + dir = group_id % 2; + + // Perform bitonic merge + for(stride = local_size; stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = local_id + (local_id/stride)*stride; + VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + id = local_id * 2; + input1 = l_data[id]; input2 = l_data[id+1]; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + + // setup output and return it + output = (float8)(input1, input2); + return output; +} + + + +////////////// +// Kernels +////////////// + +// Perform the sort on the whole array +// dim0: wg=number_of_element/8 + +__kernel void bsort_all(__global float4 *g_data, + __local float4 *l_data) { + float4 input1, input2; + float8 input, output; + uint id, global_start; + // Find global address + id = get_local_id(0) * 2; + global_start = get_group_id(0) * get_local_size(0) * 2 + id; + + input1 = g_data[global_start]; + input2 = g_data[global_start+1]; + input = (float8)(input1, input2); + output = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0), + input, l_data); + input1 = (float4) (output.s0, output.s1, output.s2, output.s3); + input2 = (float4) (output.s4, output.s5, output.s6, output.s7); + g_data[global_start] = input1; + g_data[global_start+1] = input2; +} + + +// Perform the sort along the horizontal axis of a 2D image +// dim0 = y: wg=1 +// dim1 = x: wg=number_of_element/8 +__kernel void bsort_horizontal(__global float *g_data, + __local float4 *l_data) { + float8 input, output; + uint id, global_start, offset; + + // Find global address + offset = get_global_size(1)*get_global_id(0)*8; + id = get_local_id(1) * 8; + global_start = offset + get_group_id(1) * get_local_size(1) * 8 + id; + + input = (float8)(g_data[global_start ], + g_data[global_start + 1], + g_data[global_start + 2], + g_data[global_start + 3], + g_data[global_start + 4], + g_data[global_start + 5], + g_data[global_start + 6], + g_data[global_start + 7]); + + output = my_sort_file(get_local_id(1), get_group_id(1), get_local_size(1), + input, l_data); + + g_data[global_start ] = output.s0; + g_data[global_start + 1] = output.s1; + g_data[global_start + 2] = output.s2; + g_data[global_start + 3] = output.s3; + g_data[global_start + 4] = output.s4; + g_data[global_start + 5] = output.s5; + g_data[global_start + 6] = output.s6; + g_data[global_start + 7] = output.s7; +} + + +// Perform the sort along the vertical axis +// dim0 = y: wg=number_of_element/8 +// dim1 = x: wg=1 +// check if transposing +bsort_horizontal is not more efficient ? + +__kernel void bsort_vertical(__global float *g_data, + __local float4 *l_data) { + // we need to read 8 float position along the vertical axis + float8 input, output; + uint id, global_start, padding; + + // Find global address + padding = get_global_size(1); + id = get_local_id(0) * 8 * padding + get_global_id(1); + global_start = get_group_id(0) * get_local_size(0) * 8 * padding + id; + + input = (float8)(g_data[global_start ], + g_data[global_start + padding ], + g_data[global_start + 2*padding], + g_data[global_start + 3*padding], + g_data[global_start + 4*padding], + g_data[global_start + 5*padding], + g_data[global_start + 6*padding], + g_data[global_start + 7*padding]); + + output = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0), + input, l_data); + g_data[global_start ] = output.s0; + g_data[global_start + padding ] = output.s1; + g_data[global_start + 2*padding ] = output.s2; + g_data[global_start + 3*padding ] = output.s3; + g_data[global_start + 4*padding ] = output.s4; + g_data[global_start + 5*padding ] = output.s5; + g_data[global_start + 6*padding ] = output.s6; + g_data[global_start + 7*padding ] = output.s7; +} + + +//Tested working reference kernel frm the book. This only works under Linux +__kernel void bsort_book(__global float4 *g_data, + __local float4 *l_data) { + float4 input1, input2, temp; + uint4 comp, swap, mask1, mask2, add1, add2, add3; + uint id, dir, global_start, size, stride; + mask1 = (uint4)(1, 0, 3, 2); + swap = (uint4)(0, 0, 1, 1); + add1 = (uint4)(0, 0, 2, 2); + mask2 = (uint4)(2, 3, 0, 1); + add2 = (uint4)(0, 1, 0, 1); + add3 = (uint4)(0, 1, 2, 3); + + // Find global address + id = get_local_id(0) * 2; + global_start = get_group_id(0) * get_local_size(0) * 2 + id; + + //Sort first vector + input1 = g_data[global_start]; + input2 = g_data[global_start+1]; + comp = abs(input1 > shuffle(input1, mask1)); + input1 = shuffle(input1, comp ^ swap + add1); + comp = abs(input1 > shuffle(input1, mask2)); + input1 = shuffle(input1, comp * 2 + add2); + comp = abs(input1 > shuffle(input1, mask1)); + input1 = shuffle(input1, comp + add1); + + //Sort second vector + comp = abs(input2 < shuffle(input2, mask1)); + input2 = shuffle(input2, comp ^ swap + add1); + comp = abs(input2 < shuffle(input2, mask2)); + input2 = shuffle(input2, comp * 2 + add2); + comp = abs(input2 < shuffle(input2, mask1)); + input2 = shuffle(input2, comp + add1); + + // Swap elements + dir = get_local_id(0) % 2; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + + // Perform upper stages + for(size = 2; size < get_local_size(0); size <<= 1) { + dir = get_local_id(0)/size & 1; + + //Perform lower stages + for(stride = size; stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = get_local_id(0) + + (get_local_id(0)/stride)*stride; + VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + id = get_local_id(0) * 2; + input1 = l_data[id]; + input2 = l_data[id+1]; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + } + dir = get_group_id(0) % 2; + // Perform bitonic merge + for(stride = get_local_size(0); stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = get_local_id(0) + + (get_local_id(0)/stride)*stride; + VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + id = get_local_id(0) * 2; + input1 = l_data[id]; input2 = l_data[id+1]; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + g_data[global_start] = input1; + g_data[global_start+1] = input2; + } + +//Tested working reference kernel from the addition files. This only works under any operating system +/* Perform initial sort */ +__kernel void bsort_file(__global float4 *g_data, __local float4 *l_data) { + + int dir; + uint id, global_start, size, stride; + float4 input1, input2, temp; + int4 comp; + + uint4 mask1 = (uint4)(1, 0, 3, 2); + uint4 mask2 = (uint4)(2, 3, 0, 1); + uint4 mask3 = (uint4)(3, 2, 1, 0); + + int4 add1 = (int4)(1, 1, 3, 3); + int4 add2 = (int4)(2, 3, 2, 3); + int4 add3 = (int4)(1, 2, 2, 3); + + id = get_local_id(0) * 2; + global_start = get_group_id(0) * get_local_size(0) * 2 + id; + + input1 = g_data[global_start]; + input2 = g_data[global_start+1]; + + /* Sort input 1 - ascending */ + comp = input1 < shuffle(input1, mask1); + input1 = shuffle(input1, as_uint4(comp + add1)); + comp = input1 < shuffle(input1, mask2); + input1 = shuffle(input1, as_uint4(comp * 2 + add2)); + comp = input1 < shuffle(input1, mask3); + input1 = shuffle(input1, as_uint4(comp + add3)); + + /* Sort input 2 - descending */ + comp = input2 > shuffle(input2, mask1); + input2 = shuffle(input2, as_uint4(comp + add1)); + comp = input2 > shuffle(input2, mask2); + input2 = shuffle(input2, as_uint4(comp * 2 + add2)); + comp = input2 > shuffle(input2, mask3); + input2 = shuffle(input2, as_uint4(comp + add3)); + + /* Swap corresponding elements of input 1 and 2 */ + add3 = (int4)(4, 5, 6, 7); + dir = - (int)(get_local_id(0) % 2); + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + + /* Sort data and store in local memory */ + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + + /* Create bitonic set */ + for(size = 2; size < get_local_size(0); size <<= 1) { + dir = - (int)(get_local_id(0)/size & 1); + + for(stride = size; stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = get_local_id(0) + (get_local_id(0)/stride)*stride; + VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) + } + + barrier(CLK_LOCAL_MEM_FENCE); + id = get_local_id(0) * 2; + input1 = l_data[id]; input2 = l_data[id+1]; + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + } + + /* Perform bitonic merge */ + dir = - (int)(get_group_id(0) % 2); + for(stride = get_local_size(0); stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = get_local_id(0) + (get_local_id(0)/stride)*stride; + VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + /* Perform final sort */ + id = get_local_id(0) * 2; + input1 = l_data[id]; input2 = l_data[id+1]; + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + g_data[global_start] = input1; + g_data[global_start+1] = input2; +} + diff --git a/silx/resources/opencl/medfilt.cl b/silx/resources/opencl/medfilt.cl new file mode 100644 index 0000000..f1e342b --- /dev/null +++ b/silx/resources/opencl/medfilt.cl @@ -0,0 +1,141 @@ +/* + * Project: Azimuthal regroupping OpenCL kernel for PyFAI. + * Median filter for 1D, 2D and 3D datasets, only 2D for now + * + * + * Copyright (C) 2017-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. Kieffer (kieffer@esrf.fr) + * Last revision: 07/02/2017 + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +/* + * Needs to be concatenated with bitonic.cl prior to compilation +*/ + +/* + * Perform the 2D median filtering of an image 2D image + * + * dim0 => wg=number_of_element in the tile /8 + * dim1 = x: wg=1 + + * + * Actually the workgoup size is a bit more complicated: + * if window size = 1,3,5,7: WG1=8 + * if window size = 9,11,13,15: WG1=32 + * if window size = 17, ...,21: WG1=64 + * + * More Generally the workgroup size must be: at least: kfs2*(kfs1+7)/8 + * Each thread treats 8 values aligned vertically, this allows (almost) + * coalesced reading between threads in one line of the tile. + * + * Later on it will be more efficient to re-use the same tile + * and slide it vertically by one line. + * The additionnal need for shared memory will be kfs2 floats and a float8 as register. + * + * Theoritically, it should be possible to handle up to windows-size 83x83 + */ +__kernel void medfilt2d(__global float *image, // input image + __global float *result, // output array + __local float4 *l_data,// local storage 4x the number of threads + int khs1, // Kernel half-size along dim1 (nb lines) + int khs2, // Kernel half-size along dim2 (nb columns) + int height, // Image size along dim1 (nb lines) + int width) // Image size along dim2 (nb columns) +{ + int threadid = get_local_id(0); + int wg = get_local_size(0); + int x = get_global_id(1); + + if (x < width) + { + union + { + float ary[8]; + float8 vec; + } output, input; + input.vec = (float8)(MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT); + int kfs1 = 2 * khs1 + 1; //definition of kernel full size + int kfs2 = 2 * khs2 + 1; + int nbands = (kfs1 + 7) / 8; // 8 elements per thread, aligned vertically in 1 column + for (int y=0; y<height; y++) + { + //Select only the active threads, some may remain inactive + int nb_threads = (nbands * kfs2); + int band_nr = threadid / kfs2; + int band_id = threadid % kfs2; + int pos_x = clamp((int)(x + band_id - khs2), (int) 0, (int) width-1); + int max_vec = clamp(kfs1 - 8 * band_nr, 0, 8); + if (y == 0) + { + for (int i=0; i<max_vec; i++) + { + if (threadid<nb_threads) + { + int pos_y = clamp((int)(y + 8 * band_nr + i - khs1), (int) 0, (int) height-1); + input.ary[i] = image[pos_x + width * pos_y]; + } + } + } + else + { + //store storage.s0 to some shared memory to retrieve it from another thread. + l_data[threadid].s0 = input.vec.s0; + + //Offset to the bottom + input.vec = (float8)(input.vec.s1, + input.vec.s2, + input.vec.s3, + input.vec.s4, + input.vec.s5, + input.vec.s6, + input.vec.s7, + MAXFLOAT); + + barrier(CLK_LOCAL_MEM_FENCE); + + int read_from = threadid + kfs2; + if (read_from < nb_threads) + input.vec.s7 = l_data[read_from].s0; + else if (threadid < nb_threads) //we are on the last band + { + int pos_y = clamp((int)(y + 8 * band_nr + max_vec - 1 - khs1), (int) 0, (int) height-1); + input.ary[max_vec - 1] = image[pos_x + width * pos_y]; + } + + } + + //This function is defined in bitonic.cl + output.vec = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0), + input.vec, l_data); + + size_t target = (kfs1 * kfs2) / 2; + if (threadid == (target / 8)) + { + result[y * width + x] = output.ary[target % 8]; + } + + } + } +} + diff --git a/silx/resources/opencl/preprocess.cl b/silx/resources/opencl/preprocess.cl new file mode 100644 index 0000000..de35c48 --- /dev/null +++ b/silx/resources/opencl/preprocess.cl @@ -0,0 +1,567 @@ +/* + * Project: Azimuthal regroupping OpenCL kernel for PyFAI. + * Preprocessing program + * + * + * Copyright (C) 2012-2017 European Synchrotron Radiation Facility + * Grenoble, France + * + * Principal authors: J. Kieffer (kieffer@esrf.fr) + * Last revision: 19/01/2017 + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +/** + * \file + * + * \brief OpenCL kernels for image array casting, array mem-setting and normalizing + * + * Constant to be provided at build time: + * NIMAGE: size of the image + * + */ + +#include "for_eclipse.h" + +/** + * \brief cast values of an array of int8 into a float output array. + * + * - array_s8: Pointer to global memory with the input data as signed8 array + * - array_float: Pointer to global memory with the output data as float array + */ +__kernel void +s8_to_float(__global char *array_s8, + __global float *array_float +) +{ + int i = get_global_id(0); + //Global memory guard for padding + if (i < NIMAGE) + array_float[i] = (float)array_s8[i]; +} + + +/** + * \brief cast values of an array of uint8 into a float output array. + * + * - array_u8: Pointer to global memory with the input data as unsigned8 array + * - array_float: Pointer to global memory with the output data as float array + */ +__kernel void +u8_to_float(__global unsigned char *array_u8, + __global float *array_float +) +{ + int i = get_global_id(0); + //Global memory guard for padding + if (i < NIMAGE) + array_float[i] = (float)array_u8[i]; +} + + +/** + * \brief cast values of an array of int16 into a float output array. + * + * - array_s16: Pointer to global memory with the input data as signed16 array + * - array_float: Pointer to global memory with the output data as float array + */ +__kernel void +s16_to_float(__global short *array_s16, + __global float *array_float +) +{ + int i = get_global_id(0); + //Global memory guard for padding + if (i < NIMAGE) + array_float[i] = (float)array_s16[i]; +} + + +/** + * \brief cast values of an array of uint16 into a float output array. + * + * - array_u16: Pointer to global memory with the input data as unsigned16 array + * - array_float: Pointer to global memory with the output data as float array + */ +__kernel void +u16_to_float(__global unsigned short *array_u16, + __global float *array_float +) +{ + int i = get_global_id(0); + //Global memory guard for padding + if (i < NIMAGE) + array_float[i] = (float)array_u16[i]; +} + +/** + * \brief cast values of an array of uint32 into a float output array. + * + * - array_u32: Pointer to global memory with the input data as unsigned32 array + * - array_float: Pointer to global memory with the output data as float array + */ +__kernel void +u32_to_float(__global unsigned int *array_u32, + __global float *array_float +) +{ + int i = get_global_id(0); + //Global memory guard for padding + if (i < NIMAGE) + array_float[i] = (float)array_u32[i]; +} + +/** + * \brief convert values of an array of int32 into a float output array. + * + * - array_int: Pointer to global memory with the data as unsigned32 array + * - array_float: Pointer to global memory with the data float array + */ +__kernel void +s32_to_float(__global int *array_int, + __global float *array_float +) +{ + int i = get_global_id(0); + //Global memory guard for padding + if (i < NIMAGE) + array_float[i] = (float)(array_int[i]); +} + + +/** + * Functions to be called from an actual kernel. + * \brief Performs the normalization of input image by dark subtraction, + * variance is propagated to second member of the float3 + * flatfield, solid angle, polarization and absorption are stored in + * third member of the float3 returned. + * + * Invalid/Dummy pixels will all have the third-member set to 0, i.e. no weight + * + * - image Float pointer to global memory storing the input image. + * - do_dark Bool/int: shall dark-current correction be applied ? + * - dark Float pointer to global memory storing the dark image. + * - do_flat Bool/int: shall flat-field correction be applied ? + * - flat Float pointer to global memory storing the flat image. + * - do_solidangle Bool/int: shall flat-field correction be applied ? + * - solidangle Float pointer to global memory storing the solid angle of each pixel. + * - do_polarization Bool/int: shall polarization correction be applied ? + * - polarization Float pointer to global memory storing the polarization of each pixel. + * - do_absorption Bool/int: shall absorption correction be applied ? + * - absorption Float pointer to global memory storing the effective absoption of each pixel. + * - do_mask perform mask correction ? + * - mask Bool/char pointer to mask array + * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored + * - dummy Float: value for bad pixels + * - delta_dummy Float: precision for bad pixel value + * - normalization_factor : divide the input by this value + * +**/ + +static float3 _preproc3(const __global float *image, + const __global float *variance, + const char do_dark, + const __global float *dark, + const char do_dark_variance, + const __global float *dark_variance, + const char do_flat, + const __global float *flat, + const char do_solidangle, + const __global float *solidangle, + const char do_polarization, + const __global float *polarization, + const char do_absorption, + const __global float *absorption, + const char do_mask, + const __global char *mask, + const char do_dummy, + const float dummy, + const float delta_dummy, + const float normalization_factor) +{ + size_t i= get_global_id(0); + float3 result = (float3)(0.0, 0.0, 0.0); + if (i < NIMAGE) + { + if ((!do_mask) || (!mask[i])) + { + result.s0 = image[i]; + if (variance != 0) + result.s1 = variance[i]; + result.s2 = normalization_factor; + if ( (!do_dummy) + ||((delta_dummy != 0.0f) && (fabs(result.s0-dummy) > delta_dummy)) + ||((delta_dummy == 0.0f) && (result.s0 != dummy))) + { + if (do_dark) + result.s0 -= dark[i]; + if (do_dark_variance) + result.s1 += dark_variance[i]; + if (do_flat) + { + float one_flat = flat[i]; + if ( (!do_dummy) + ||((delta_dummy != 0.0f) && (fabs(one_flat-dummy) > delta_dummy)) + ||((delta_dummy == 0.0f) && (one_flat != dummy))) + result.s2 *= one_flat; + else + result.s2 = 0.0f; + } + if (do_solidangle) + result.s2 *= solidangle[i]; + if (do_polarization) + result.s2 *= polarization[i]; + if (do_absorption) + result.s2 *= absorption[i]; + if (isnan(result.s0) || isnan(result.s1) || isnan(result.s2) || (result.s2 == 0.0f)) + result = (float3)(0.0, 0.0, 0.0); + } + else + { + result = (float3)(0.0, 0.0, 0.0); + }//end if do_dummy + } // end if mask + };//end if NIMAGE + return result; +};//end function + + +/** + * \brief Performs the normalization of input image by dark subtraction, + * flatfield, solid angle, polarization and absorption division. + * + * Intensities of images are corrected by: + * - dark (read-out) noise subtraction + * - Solid angle correction (division) + * - polarization correction (division) + * - flat fiels correction (division) + * Corrections are made in place unless the pixel is dummy. + * Dummy pixels are left untouched so that they remain dummy + * + * - image Float pointer to global memory storing the input image. + * - do_dark Bool/int: shall dark-current correction be applied ? + * - dark Float pointer to global memory storing the dark image. + * - do_flat Bool/int: shall flat-field correction be applied ? + * - flat Float pointer to global memory storing the flat image. + * - do_solidangle Bool/int: shall flat-field correction be applied ? + * - solidangle Float pointer to global memory storing the solid angle of each pixel. + * - do_polarization Bool/int: shall polarization correction be applied ? + * - polarization Float pointer to global memory storing the polarization of each pixel. + * - do_absorption Bool/int: shall absorption correction be applied ? + * - absorption Float pointer to global memory storing the effective absoption of each pixel. + * - do_mask perform mask correction ? + * - mask Bool/char pointer to mask array + * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored + * - dummy Float: value for bad pixels + * - delta_dummy Float: precision for bad pixel value + * - normalization_factor : divide the input by this value + * +**/ + +__kernel void +corrections(const __global float *image, + const char do_dark, + const __global float *dark, + const char do_flat, + const __global float *flat, + const char do_solidangle, + const __global float *solidangle, + const char do_polarization, + const __global float *polarization, + const char do_absorption, + const __global float *absorption, + const char do_mask, + const __global char *mask, + const char do_dummy, + const float dummy, + const float delta_dummy, + const float normalization_factor, + __global float *output + ) +{ + size_t i= get_global_id(0); + float3 result = (float3)(0.0, 0.0, 0.0); + if (i < NIMAGE) + { + result = _preproc3(image, + 0, + do_dark, + dark, + 0, + 0, + do_flat, + flat, + do_solidangle, + solidangle, + do_polarization, + polarization, + do_absorption, + absorption, + do_mask, + mask, + do_dummy, + dummy, + delta_dummy, + normalization_factor); + if (result.s2 != 0.0f) + output[i] = result.s0 / result.s2; + else + output[i] = dummy; + };//end if NIMAGE + +};//end kernel + + +/** + * \brief Performs Normalization of input image with float2 output (num,denom) + * + * Intensities of images are corrected by: + * - dark (read-out) noise subtraction for the data + * - Solid angle correction (denominator) + * - polarization correction (denominator) + * - flat fiels correction (denominator) + * + * Corrections are made out of place. + * Dummy pixels set both the numerator and denominator to 0 + * + * - image Float pointer to global memory storing the input image. + * - do_dark Bool/int: shall dark-current correction be applied ? + * - dark Float pointer to global memory storing the dark image. + * - do_flat Bool/int: shall flat-field correction be applied ? + * - flat Float pointer to global memory storing the flat image. + * - do_solidangle Bool/int: shall flat-field correction be applied ? + * - solidangle Float pointer to global memory storing the solid angle of each pixel. + * - do_polarization Bool/int: shall flat-field correction be applied ? + * - polarization Float pointer to global memory storing the polarization of each pixel. + * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored + * - dummy Float: value for bad pixels + * - delta_dummy Float: precision for bad pixel value + * - normalization_factor : divide the input by this value + * + * +**/ +__kernel void +corrections2(const __global float *image, + const char do_dark, + const __global float *dark, + const char do_flat, + const __global float *flat, + const char do_solidangle, + const __global float *solidangle, + const char do_polarization, + const __global float *polarization, + const char do_absorption, + const __global float *absorption, + const char do_mask, + const __global char *mask, + const char do_dummy, + const float dummy, + const float delta_dummy, + const float normalization_factor, + __global float2 *output + ) +{ + size_t i= get_global_id(0); + float3 result = (float3)(0.0, 0.0, 0.0); + if (i < NIMAGE) + { + result = _preproc3(image, + 0, + do_dark, + dark, + 0, + 0, + do_flat, + flat, + do_solidangle, + solidangle, + do_polarization, + polarization, + do_absorption, + absorption, + do_mask, + mask, + do_dummy, + dummy, + delta_dummy, + normalization_factor); + output[i] = (float2)(result.s0, result.s2); + };//end if NIMAGE +};//end kernel + +/** + * \brief Performs Normalization of input image with float3 output (signal, variance, normalization) assuming poissonian signal + * + * Intensities of images are corrected by: + * - dark (read-out) noise subtraction for the data + * - Solid angle correction (denominator) + * - polarization correction (denominator) + * - flat fiels correction (denominator) + * + * Corrections are made out of place. + * Dummy pixels set both the numerator and denominator to 0 + * + * - image Float pointer to global memory storing the input image. + * - do_dark Bool/int: shall dark-current correction be applied ? + * - dark Float pointer to global memory storing the dark image. + * - do_flat Bool/int: shall flat-field correction be applied ? + * - flat Float pointer to global memory storing the flat image. + * - do_solidangle Bool/int: shall flat-field correction be applied ? + * - solidangle Float pointer to global memory storing the solid angle of each pixel. + * - do_polarization Bool/int: shall flat-field correction be applied ? + * - polarization Float pointer to global memory storing the polarization of each pixel. + * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored + * - dummy Float: value for bad pixels + * - delta_dummy Float: precision for bad pixel value + * - normalization_factor : divide the input by this value + * + * +**/ +__kernel void +corrections3Poisson( const __global float *image, + const char do_dark, + const __global float *dark, + const char do_flat, + const __global float *flat, + const char do_solidangle, + const __global float *solidangle, + const char do_polarization, + const __global float *polarization, + const char do_absorption, + const __global float *absorption, + const char do_mask, + const __global char *mask, + const char do_dummy, + const float dummy, + const float delta_dummy, + const float normalization_factor, + __global float3 *output + ) +{ + size_t i= get_global_id(0); + float3 result = (float3)(0.0, 0.0, 0.0); + if (i < NIMAGE) + { + result = _preproc3(image, + image, + do_dark, + dark, + do_dark, + dark, + do_flat, + flat, + do_solidangle, + solidangle, + do_polarization, + polarization, + do_absorption, + absorption, + do_mask, + mask, + do_dummy, + dummy, + delta_dummy, + normalization_factor); + output[i] = result; + };//end if NIMAGE +};//end kernel + + +/** + * \brief Performs Normalization of input image with float3 output (signal, variance, normalization) + * + * Intensities of images are corrected by: + * - dark (read-out) noise subtraction for the data + * - Solid angle correction (division) + * - polarization correction (division) + * - flat fiels correction (division) + * Corrections are made in place unless the pixel is dummy. + * Dummy pixels are left untouched so that they remain dummy + * + * - image Float pointer to global memory storing the input image. + * - do_dark Bool/int: shall dark-current correction be applied ? + * - dark Float pointer to global memory storing the dark image. + * - do_flat Bool/int: shall flat-field correction be applied ? + * - flat Float pointer to global memory storing the flat image. + * - do_solidangle Bool/int: shall flat-field correction be applied ? + * - solidangle Float pointer to global memory storing the solid angle of each pixel. + * - do_polarization Bool/int: shall flat-field correction be applied ? + * - polarization Float pointer to global memory storing the polarization of each pixel. + * - do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored + * - dummy Float: value for bad pixels + * - delta_dummy Float: precision for bad pixel value + * - normalization_factor : divide the input by this value + * + * +**/ + +__kernel void +corrections3(const __global float *image, + const __global float *variance, + const char do_dark, + const __global float *dark, + const char do_dark_variance, + const __global float *dark_variance, + const char do_flat, + const __global float *flat, + const char do_solidangle, + const __global float *solidangle, + const char do_polarization, + const __global float *polarization, + const char do_absorption, + const __global float *absorption, + const char do_mask, + const __global char *mask, + const char do_dummy, + const float dummy, + const float delta_dummy, + const float normalization_factor, + __global float3 *output + ) +{ + size_t i= get_global_id(0); + float3 result = (float3)(0.0, 0.0, 0.0); + if (i < NIMAGE) + { + result = _preproc3( image, + variance, + do_dark, + dark, + do_dark_variance, + dark_variance, + do_flat, + flat, + do_solidangle, + solidangle, + do_polarization, + polarization, + do_absorption, + absorption, + do_mask, + mask, + do_dummy, + dummy, + delta_dummy, + normalization_factor); + output[i] = result; + };//end if NIMAGE +};//end kernel + + |