diff options
Diffstat (limited to 'src/silx/resources/opencl/bitonic.cl')
-rw-r--r-- | src/silx/resources/opencl/bitonic.cl | 569 |
1 files changed, 569 insertions, 0 deletions
diff --git a/src/silx/resources/opencl/bitonic.cl b/src/silx/resources/opencl/bitonic.cl new file mode 100644 index 0000000..4096ce8 --- /dev/null +++ b/src/silx/resources/opencl/bitonic.cl @@ -0,0 +1,569 @@ +/*############################################################################ +# Sort elements within a vector by Matthew Scarpino, +# Taken from his book "OpenCL in Action", +# November 2011 ISBN 9781617290176 +# Original license for the code: "public domain" +# +# Originally this code is public domain. The MIT license has been added +# by J. Kieffer (jerome.kieffer@esrf.eu) to provide a disclaimer. +# J. Kieffer does not claim authorship of this code developed by . +# +# Copyright (c) 2011 Matthew Scarpino +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +#############################################################################*/ + +#define VECTOR_SORT_BOOK(input, dir) \ + comp = abs(input > shuffle(input, mask2)) ^ dir; \ + input = shuffle(input, comp * 2 + add2); \ + comp = abs(input > shuffle(input, mask1)) ^ dir; \ + input = shuffle(input, comp + add1); \ + + +#define VECTOR_SWAP_BOOK(in1, in2, dir) \ + input1 = in1; input2 = in2; \ + comp = (abs(input1 > input2) ^ dir) * 4 + add3; \ + in1 = shuffle2(input1, input2, comp); \ + in2 = shuffle2(input2, input1, comp); \ + + +// The _FILE extension correspond to the formula found in the "OpenCL in Action" supplementary files +#define VECTOR_SORT_FILE(input, dir)\ + comp = (input < shuffle(input, mask2)) ^ dir;\ + input = shuffle(input, as_uint4(comp * 2 + add2));\ + comp = (input < shuffle(input, mask1)) ^ dir;\ + input = shuffle(input, as_uint4(comp + add1));\ + + +#define VECTOR_SWAP_FILE(input1, input2, dir)\ + temp = input1;\ + comp = ((input1 < input2) ^ dir) * 4 + add3;\ + input1 = shuffle2(input1, input2, as_uint4(comp));\ + input2 = shuffle2(input2, temp, as_uint4(comp));\ + + + +// Functions to be called from an actual kernel. + +static float8 my_sort_file(uint local_id, uint group_id, uint local_size, + float8 input, local float4 *l_data) +{ + float4 input1, input2, temp; + float8 output; + + int dir; + uint id, size, stride; + int4 comp; + + uint4 mask1 = (uint4)(1, 0, 3, 2); + uint4 mask2 = (uint4)(2, 3, 0, 1); + uint4 mask3 = (uint4)(3, 2, 1, 0); + + int4 add1 = (int4)(1, 1, 3, 3); + int4 add2 = (int4)(2, 3, 2, 3); + int4 add3 = (int4)(1, 2, 2, 3); + + // retrieve input data + input1 = (float4)(input.s0, input.s1, input.s2, input.s3); + input2 = (float4)(input.s4, input.s5, input.s6, input.s7); + + // Find global address + id = local_id * 2; + + /* Sort input 1 - ascending */ + comp = input1 < shuffle(input1, mask1); + input1 = shuffle(input1, as_uint4(comp + add1)); + comp = input1 < shuffle(input1, mask2); + input1 = shuffle(input1, as_uint4(comp * 2 + add2)); + comp = input1 < shuffle(input1, mask3); + input1 = shuffle(input1, as_uint4(comp + add3)); + + /* Sort input 2 - descending */ + comp = input2 > shuffle(input2, mask1); + input2 = shuffle(input2, as_uint4(comp + add1)); + comp = input2 > shuffle(input2, mask2); + input2 = shuffle(input2, as_uint4(comp * 2 + add2)); + comp = input2 > shuffle(input2, mask3); + input2 = shuffle(input2, as_uint4(comp + add3)); + + /* Swap corresponding elements of input 1 and 2 */ + add3 = (int4)(4, 5, 6, 7); + dir = - (int) (local_id % 2); + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + + /* Sort data and store in local memory */ + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + barrier(CLK_LOCAL_MEM_FENCE); + + /* Create bitonic set */ + for(size = 2; size < local_size; size <<= 1) { + dir = - (int) (local_id/size & 1) ; + + for(stride = size; stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = local_id + (local_id/stride)*stride; + VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) + } + + barrier(CLK_LOCAL_MEM_FENCE); + id = local_id * 2; + input1 = l_data[id]; + input2 = l_data[id+1]; + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + barrier(CLK_LOCAL_MEM_FENCE); + } + + /* Perform bitonic merge */ + dir = - (int) (group_id % 2); + for(stride = local_size; stride > 1; stride >>= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + id = local_id + (local_id/stride)*stride; + VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + /* Perform final sort */ + id = local_id * 2; + input1 = l_data[id]; + input2 = l_data[id+1]; + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + + // setup output and return it + output = (float8)(input1, input2); + return output; +} + +static float8 my_sort_book(uint local_id, uint group_id, uint local_size, + float8 input, local float4 *l_data) +{ + float4 input1, input2, temp; + float8 output; + uint4 comp, swap, mask1, mask2, add1, add2, add3; + uint id, dir, size, stride; + mask1 = (uint4)(1, 0, 3, 2); + swap = (uint4)(0, 0, 1, 1); + add1 = (uint4)(0, 0, 2, 2); + mask2 = (uint4)(2, 3, 0, 1); + add2 = (uint4)(0, 1, 0, 1); + add3 = (uint4)(0, 1, 2, 3); + + // retrieve input data + input1 = (float4)(input.s0, input.s1, input.s2, input.s3); + input2 = (float4)(input.s4, input.s5, input.s6, input.s7); + + // Find global address + id = local_id * 2; + + //Sort first vector + + comp = abs(input1 > shuffle(input1, mask1)); + input1 = shuffle(input1, comp ^ swap + add1); + comp = abs(input1 > shuffle(input1, mask2)); + input1 = shuffle(input1, comp * 2 + add2); + comp = abs(input1 > shuffle(input1, mask1)); + input1 = shuffle(input1, comp + add1); + + //Sort second vector + comp = abs(input2 < shuffle(input2, mask1)); + input2 = shuffle(input2, comp ^ swap + add1); + comp = abs(input2 < shuffle(input2, mask2)); + input2 = shuffle(input2, comp * 2 + add2); + comp = abs(input2 < shuffle(input2, mask1)); + input2 = shuffle(input2, comp + add1); + + // Swap elements + dir = local_id % 2; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + + // Perform upper stages + for(size = 2; size < local_size; size <<= 1) + { + dir = local_id/size & 1; + + //Perform lower stages + for(stride = size; stride > 1; stride >>= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + id = local_id + (local_id/stride)*stride; + VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + id = local_id * 2; + input1 = l_data[id]; + input2 = l_data[id+1]; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + } + dir = group_id % 2; + + // Perform bitonic merge + for(stride = local_size; stride > 1; stride >>= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + id = local_id + (local_id/stride)*stride; + VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + id = local_id * 2; + input1 = l_data[id]; input2 = l_data[id+1]; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + + // setup output and return it + output = (float8)(input1, input2); + return output; +} + + + +////////////// +// Kernels +////////////// + +// Perform the sort on the whole array +// dim0: wg=number_of_element/8 + +kernel void bsort_all(global float4 *g_data, + local float4 *l_data) +{ + float4 input1, input2; + float8 input, output; + uint id, global_start; + // Find global address + id = get_local_id(0) * 2; + global_start = get_group_id(0) * get_local_size(0) * 2 + id; + + input1 = g_data[global_start]; + input2 = g_data[global_start+1]; + input = (float8)(input1, input2); + output = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0), + input, l_data); + input1 = (float4) (output.s0, output.s1, output.s2, output.s3); + input2 = (float4) (output.s4, output.s5, output.s6, output.s7); + g_data[global_start] = input1; + g_data[global_start+1] = input2; +} + + +// Perform the sort along the horizontal axis of a 2D image +// dim0 = y: wg=1 +// dim1 = x: wg=number_of_element/8 +kernel void bsort_horizontal(global float *g_data, + local float4 *l_data) +{ + float8 input, output; + uint id, global_start, offset; + + // Find global address + offset = get_global_size(1)*get_global_id(0)*8; + id = get_local_id(1) * 8; + global_start = offset + get_group_id(1) * get_local_size(1) * 8 + id; + + input = (float8)(g_data[global_start ], + g_data[global_start + 1], + g_data[global_start + 2], + g_data[global_start + 3], + g_data[global_start + 4], + g_data[global_start + 5], + g_data[global_start + 6], + g_data[global_start + 7]); + + output = my_sort_file(get_local_id(1), get_group_id(1), get_local_size(1), + input, l_data); + + g_data[global_start ] = output.s0; + g_data[global_start + 1] = output.s1; + g_data[global_start + 2] = output.s2; + g_data[global_start + 3] = output.s3; + g_data[global_start + 4] = output.s4; + g_data[global_start + 5] = output.s5; + g_data[global_start + 6] = output.s6; + g_data[global_start + 7] = output.s7; +} + + +// Perform the sort along the vertical axis +// dim0 = y: wg=number_of_element/8 +// dim1 = x: wg=1 +// check if transposing +bsort_horizontal is not more efficient ? + +kernel void bsort_vertical(global float *g_data, + local float4 *l_data) +{ + // we need to read 8 float position along the vertical axis + float8 input, output; + uint id, global_start, padding; + + // Find global address + padding = get_global_size(1); + id = get_local_id(0) * 8 * padding + get_global_id(1); + global_start = get_group_id(0) * get_local_size(0) * 8 * padding + id; + + input = (float8)(g_data[global_start ], + g_data[global_start + padding ], + g_data[global_start + 2*padding], + g_data[global_start + 3*padding], + g_data[global_start + 4*padding], + g_data[global_start + 5*padding], + g_data[global_start + 6*padding], + g_data[global_start + 7*padding]); + + output = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0), + input, l_data); + g_data[global_start ] = output.s0; + g_data[global_start + padding ] = output.s1; + g_data[global_start + 2*padding ] = output.s2; + g_data[global_start + 3*padding ] = output.s3; + g_data[global_start + 4*padding ] = output.s4; + g_data[global_start + 5*padding ] = output.s5; + g_data[global_start + 6*padding ] = output.s6; + g_data[global_start + 7*padding ] = output.s7; +} + + +//Tested working reference kernel frm the book. This only works under Linux +kernel void bsort_book(global float4 *g_data, + local float4 *l_data) { + float4 input1, input2, temp; + uint4 comp, swap, mask1, mask2, add1, add2, add3; + uint id, dir, global_start, size, stride; + mask1 = (uint4)(1, 0, 3, 2); + swap = (uint4)(0, 0, 1, 1); + add1 = (uint4)(0, 0, 2, 2); + mask2 = (uint4)(2, 3, 0, 1); + add2 = (uint4)(0, 1, 0, 1); + add3 = (uint4)(0, 1, 2, 3); + + // Find global address + id = get_local_id(0) * 2; + global_start = get_group_id(0) * get_local_size(0) * 2 + id; + + //Sort first vector + input1 = g_data[global_start]; + input2 = g_data[global_start+1]; + comp = abs(input1 > shuffle(input1, mask1)); + input1 = shuffle(input1, comp ^ swap + add1); + comp = abs(input1 > shuffle(input1, mask2)); + input1 = shuffle(input1, comp * 2 + add2); + comp = abs(input1 > shuffle(input1, mask1)); + input1 = shuffle(input1, comp + add1); + + //Sort second vector + comp = abs(input2 < shuffle(input2, mask1)); + input2 = shuffle(input2, comp ^ swap + add1); + comp = abs(input2 < shuffle(input2, mask2)); + input2 = shuffle(input2, comp * 2 + add2); + comp = abs(input2 < shuffle(input2, mask1)); + input2 = shuffle(input2, comp + add1); + + // Swap elements + dir = get_local_id(0) % 2; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + + // Perform upper stages + for(size = 2; size < get_local_size(0); size <<= 1) { + dir = get_local_id(0)/size & 1; + + //Perform lower stages + for(stride = size; stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = get_local_id(0) + + (get_local_id(0)/stride)*stride; + VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + id = get_local_id(0) * 2; + input1 = l_data[id]; + input2 = l_data[id+1]; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + } + dir = get_group_id(0) % 2; + // Perform bitonic merge + for(stride = get_local_size(0); stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = get_local_id(0) + + (get_local_id(0)/stride)*stride; + VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + id = get_local_id(0) * 2; + input1 = l_data[id]; input2 = l_data[id+1]; + temp = input1; + comp = (abs(input1 > input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, comp); + input2 = shuffle2(input2, temp, comp); + VECTOR_SORT_BOOK(input1, dir); + VECTOR_SORT_BOOK(input2, dir); + g_data[global_start] = input1; + g_data[global_start+1] = input2; + } + +//Tested working reference kernel from the addition files. This only works under any operating system +/* Perform initial sort */ +kernel void bsort_file(global float4 *g_data, local float4 *l_data) { + + int dir; + uint id, global_start, size, stride; + float4 input1, input2, temp; + int4 comp; + + uint4 mask1 = (uint4)(1, 0, 3, 2); + uint4 mask2 = (uint4)(2, 3, 0, 1); + uint4 mask3 = (uint4)(3, 2, 1, 0); + + int4 add1 = (int4)(1, 1, 3, 3); + int4 add2 = (int4)(2, 3, 2, 3); + int4 add3 = (int4)(1, 2, 2, 3); + + id = get_local_id(0) * 2; + global_start = get_group_id(0) * get_local_size(0) * 2 + id; + + input1 = g_data[global_start]; + input2 = g_data[global_start+1]; + + /* Sort input 1 - ascending */ + comp = input1 < shuffle(input1, mask1); + input1 = shuffle(input1, as_uint4(comp + add1)); + comp = input1 < shuffle(input1, mask2); + input1 = shuffle(input1, as_uint4(comp * 2 + add2)); + comp = input1 < shuffle(input1, mask3); + input1 = shuffle(input1, as_uint4(comp + add3)); + + /* Sort input 2 - descending */ + comp = input2 > shuffle(input2, mask1); + input2 = shuffle(input2, as_uint4(comp + add1)); + comp = input2 > shuffle(input2, mask2); + input2 = shuffle(input2, as_uint4(comp * 2 + add2)); + comp = input2 > shuffle(input2, mask3); + input2 = shuffle(input2, as_uint4(comp + add3)); + + /* Swap corresponding elements of input 1 and 2 */ + add3 = (int4)(4, 5, 6, 7); + dir = - (int)(get_local_id(0) % 2); + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + + /* Sort data and store in local memory */ + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + + /* Create bitonic set */ + for(size = 2; size < get_local_size(0); size <<= 1) { + dir = - (int)(get_local_id(0)/size & 1); + + for(stride = size; stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = get_local_id(0) + (get_local_id(0)/stride)*stride; + VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) + } + + barrier(CLK_LOCAL_MEM_FENCE); + id = get_local_id(0) * 2; + input1 = l_data[id]; input2 = l_data[id+1]; + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + l_data[id] = input1; + l_data[id+1] = input2; + } + + /* Perform bitonic merge */ + dir = - (int)(get_group_id(0) % 2); + for(stride = get_local_size(0); stride > 1; stride >>= 1) { + barrier(CLK_LOCAL_MEM_FENCE); + id = get_local_id(0) + (get_local_id(0)/stride)*stride; + VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) + } + barrier(CLK_LOCAL_MEM_FENCE); + + /* Perform final sort */ + id = get_local_id(0) * 2; + input1 = l_data[id]; input2 = l_data[id+1]; + temp = input1; + comp = ((input1 < input2) ^ dir) * 4 + add3; + input1 = shuffle2(input1, input2, as_uint4(comp)); + input2 = shuffle2(input2, temp, as_uint4(comp)); + VECTOR_SORT_FILE(input1, dir); + VECTOR_SORT_FILE(input2, dir); + g_data[global_start] = input1; + g_data[global_start+1] = input2; +} + |