/*############################################################################ # Sort elements within a vector by Matthew Scarpino, # Taken from his book "OpenCL in Action", # November 2011 ISBN 9781617290176 # Original license for the code: "public domain" # # Originally this code is public domain. The MIT license has been added # by J. Kieffer (jerome.kieffer@esrf.eu) to provide a disclaimer. # J. Kieffer does not claim authorship of this code developed by . # # Copyright (c) 2011 Matthew Scarpino # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # #############################################################################*/ #define VECTOR_SORT_BOOK(input, dir) \ comp = abs(input > shuffle(input, mask2)) ^ dir; \ input = shuffle(input, comp * 2 + add2); \ comp = abs(input > shuffle(input, mask1)) ^ dir; \ input = shuffle(input, comp + add1); \ #define VECTOR_SWAP_BOOK(in1, in2, dir) \ input1 = in1; input2 = in2; \ comp = (abs(input1 > input2) ^ dir) * 4 + add3; \ in1 = shuffle2(input1, input2, comp); \ in2 = shuffle2(input2, input1, comp); \ // The _FILE extension correspond to the formula found in the "OpenCL in Action" supplementary files #define VECTOR_SORT_FILE(input, dir) \ comp = (input < shuffle(input, mask2)) ^ dir; \ input = shuffle(input, as_uint4(comp * 2 + add2)); \ comp = (input < shuffle(input, mask1)) ^ dir; \ input = shuffle(input, as_uint4(comp + add1)); \ #define VECTOR_SWAP_FILE(input1, input2, dir) \ temp = input1; \ comp = ((input1 < input2) ^ dir) * 4 + add3; \ input1 = shuffle2(input1, input2, as_uint4(comp)); \ input2 = shuffle2(input2, temp, as_uint4(comp)); \ // Functions to be called from an actual kernel. static float8 my_sort_file(uint local_id, uint group_id, uint local_size, float8 input, __local float4 *l_data){ float4 input1, input2, temp; float8 output; int dir; uint id, size, stride; int4 comp; uint4 mask1 = (uint4)(1, 0, 3, 2); uint4 mask2 = (uint4)(2, 3, 0, 1); uint4 mask3 = (uint4)(3, 2, 1, 0); int4 add1 = (int4)(1, 1, 3, 3); int4 add2 = (int4)(2, 3, 2, 3); int4 add3 = (int4)(1, 2, 2, 3); // retrieve input data input1 = (float4)(input.s0, input.s1, input.s2, input.s3); input2 = (float4)(input.s4, input.s5, input.s6, input.s7); // Find global address id = local_id * 2; /* Sort input 1 - ascending */ comp = input1 < shuffle(input1, mask1); input1 = shuffle(input1, as_uint4(comp + add1)); comp = input1 < shuffle(input1, mask2); input1 = shuffle(input1, as_uint4(comp * 2 + add2)); comp = input1 < shuffle(input1, mask3); input1 = shuffle(input1, as_uint4(comp + add3)); /* Sort input 2 - descending */ comp = input2 > shuffle(input2, mask1); input2 = shuffle(input2, as_uint4(comp + add1)); comp = input2 > shuffle(input2, mask2); input2 = shuffle(input2, as_uint4(comp * 2 + add2)); comp = input2 > shuffle(input2, mask3); input2 = shuffle(input2, as_uint4(comp + add3)); /* Swap corresponding elements of input 1 and 2 */ add3 = (int4)(4, 5, 6, 7); dir = - (int) (local_id % 2); temp = input1; comp = ((input1 < input2) ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, as_uint4(comp)); input2 = shuffle2(input2, temp, as_uint4(comp)); /* Sort data and store in local memory */ VECTOR_SORT_FILE(input1, dir); VECTOR_SORT_FILE(input2, dir); l_data[id] = input1; l_data[id+1] = input2; /* Create bitonic set */ for(size = 2; size < local_size; size <<= 1) { dir = - (int) (local_id/size & 1) ; for(stride = size; stride > 1; stride >>= 1) { barrier(CLK_LOCAL_MEM_FENCE); id = local_id + (local_id/stride)*stride; VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) } barrier(CLK_LOCAL_MEM_FENCE); id = local_id * 2; input1 = l_data[id]; input2 = l_data[id+1]; temp = input1; comp = ((input1 < input2) ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, as_uint4(comp)); input2 = shuffle2(input2, temp, as_uint4(comp)); VECTOR_SORT_FILE(input1, dir); VECTOR_SORT_FILE(input2, dir); l_data[id] = input1; l_data[id+1] = input2; } /* Perform bitonic merge */ dir = - (int) (group_id % 2); for(stride = local_size; stride > 1; stride >>= 1) { barrier(CLK_LOCAL_MEM_FENCE); id = local_id + (local_id/stride)*stride; VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) } barrier(CLK_LOCAL_MEM_FENCE); /* Perform final sort */ id = local_id * 2; input1 = l_data[id]; input2 = l_data[id+1]; temp = input1; comp = ((input1 < input2) ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, as_uint4(comp)); input2 = shuffle2(input2, temp, as_uint4(comp)); VECTOR_SORT_FILE(input1, dir); VECTOR_SORT_FILE(input2, dir); // setup output and return it output = (float8)(input1, input2); return output; } static float8 my_sort_book(uint local_id, uint group_id, uint local_size, float8 input, __local float4 *l_data){ float4 input1, input2, temp; float8 output; uint4 comp, swap, mask1, mask2, add1, add2, add3; uint id, dir, size, stride; mask1 = (uint4)(1, 0, 3, 2); swap = (uint4)(0, 0, 1, 1); add1 = (uint4)(0, 0, 2, 2); mask2 = (uint4)(2, 3, 0, 1); add2 = (uint4)(0, 1, 0, 1); add3 = (uint4)(0, 1, 2, 3); // retrieve input data input1 = (float4)(input.s0, input.s1, input.s2, input.s3); input2 = (float4)(input.s4, input.s5, input.s6, input.s7); // Find global address id = local_id * 2; //Sort first vector comp = abs(input1 > shuffle(input1, mask1)); input1 = shuffle(input1, comp ^ swap + add1); comp = abs(input1 > shuffle(input1, mask2)); input1 = shuffle(input1, comp * 2 + add2); comp = abs(input1 > shuffle(input1, mask1)); input1 = shuffle(input1, comp + add1); //Sort second vector comp = abs(input2 < shuffle(input2, mask1)); input2 = shuffle(input2, comp ^ swap + add1); comp = abs(input2 < shuffle(input2, mask2)); input2 = shuffle(input2, comp * 2 + add2); comp = abs(input2 < shuffle(input2, mask1)); input2 = shuffle(input2, comp + add1); // Swap elements dir = local_id % 2; temp = input1; comp = (abs(input1 > input2) ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, comp); input2 = shuffle2(input2, temp, comp); VECTOR_SORT_BOOK(input1, dir); VECTOR_SORT_BOOK(input2, dir); l_data[id] = input1; l_data[id+1] = input2; // Perform upper stages for(size = 2; size < local_size; size <<= 1) { dir = local_id/size & 1; //Perform lower stages for(stride = size; stride > 1; stride >>= 1) { barrier(CLK_LOCAL_MEM_FENCE); id = local_id + (local_id/stride)*stride; VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) } barrier(CLK_LOCAL_MEM_FENCE); id = local_id * 2; input1 = l_data[id]; input2 = l_data[id+1]; temp = input1; comp = (abs(input1 > input2) ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, comp); input2 = shuffle2(input2, temp, comp); VECTOR_SORT_BOOK(input1, dir); VECTOR_SORT_BOOK(input2, dir); l_data[id] = input1; l_data[id+1] = input2; } dir = group_id % 2; // Perform bitonic merge for(stride = local_size; stride > 1; stride >>= 1) { barrier(CLK_LOCAL_MEM_FENCE); id = local_id + (local_id/stride)*stride; VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) } barrier(CLK_LOCAL_MEM_FENCE); id = local_id * 2; input1 = l_data[id]; input2 = l_data[id+1]; temp = input1; comp = (abs(input1 > input2) ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, comp); input2 = shuffle2(input2, temp, comp); VECTOR_SORT_BOOK(input1, dir); VECTOR_SORT_BOOK(input2, dir); // setup output and return it output = (float8)(input1, input2); return output; } ////////////// // Kernels ////////////// // Perform the sort on the whole array // dim0: wg=number_of_element/8 __kernel void bsort_all(__global float4 *g_data, __local float4 *l_data) { float4 input1, input2; float8 input, output; uint id, global_start; // Find global address id = get_local_id(0) * 2; global_start = get_group_id(0) * get_local_size(0) * 2 + id; input1 = g_data[global_start]; input2 = g_data[global_start+1]; input = (float8)(input1, input2); output = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0), input, l_data); input1 = (float4) (output.s0, output.s1, output.s2, output.s3); input2 = (float4) (output.s4, output.s5, output.s6, output.s7); g_data[global_start] = input1; g_data[global_start+1] = input2; } // Perform the sort along the horizontal axis of a 2D image // dim0 = y: wg=1 // dim1 = x: wg=number_of_element/8 __kernel void bsort_horizontal(__global float *g_data, __local float4 *l_data) { float8 input, output; uint id, global_start, offset; // Find global address offset = get_global_size(1)*get_global_id(0)*8; id = get_local_id(1) * 8; global_start = offset + get_group_id(1) * get_local_size(1) * 8 + id; input = (float8)(g_data[global_start ], g_data[global_start + 1], g_data[global_start + 2], g_data[global_start + 3], g_data[global_start + 4], g_data[global_start + 5], g_data[global_start + 6], g_data[global_start + 7]); output = my_sort_file(get_local_id(1), get_group_id(1), get_local_size(1), input, l_data); g_data[global_start ] = output.s0; g_data[global_start + 1] = output.s1; g_data[global_start + 2] = output.s2; g_data[global_start + 3] = output.s3; g_data[global_start + 4] = output.s4; g_data[global_start + 5] = output.s5; g_data[global_start + 6] = output.s6; g_data[global_start + 7] = output.s7; } // Perform the sort along the vertical axis // dim0 = y: wg=number_of_element/8 // dim1 = x: wg=1 // check if transposing +bsort_horizontal is not more efficient ? __kernel void bsort_vertical(__global float *g_data, __local float4 *l_data) { // we need to read 8 float position along the vertical axis float8 input, output; uint id, global_start, padding; // Find global address padding = get_global_size(1); id = get_local_id(0) * 8 * padding + get_global_id(1); global_start = get_group_id(0) * get_local_size(0) * 8 * padding + id; input = (float8)(g_data[global_start ], g_data[global_start + padding ], g_data[global_start + 2*padding], g_data[global_start + 3*padding], g_data[global_start + 4*padding], g_data[global_start + 5*padding], g_data[global_start + 6*padding], g_data[global_start + 7*padding]); output = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0), input, l_data); g_data[global_start ] = output.s0; g_data[global_start + padding ] = output.s1; g_data[global_start + 2*padding ] = output.s2; g_data[global_start + 3*padding ] = output.s3; g_data[global_start + 4*padding ] = output.s4; g_data[global_start + 5*padding ] = output.s5; g_data[global_start + 6*padding ] = output.s6; g_data[global_start + 7*padding ] = output.s7; } //Tested working reference kernel frm the book. This only works under Linux __kernel void bsort_book(__global float4 *g_data, __local float4 *l_data) { float4 input1, input2, temp; uint4 comp, swap, mask1, mask2, add1, add2, add3; uint id, dir, global_start, size, stride; mask1 = (uint4)(1, 0, 3, 2); swap = (uint4)(0, 0, 1, 1); add1 = (uint4)(0, 0, 2, 2); mask2 = (uint4)(2, 3, 0, 1); add2 = (uint4)(0, 1, 0, 1); add3 = (uint4)(0, 1, 2, 3); // Find global address id = get_local_id(0) * 2; global_start = get_group_id(0) * get_local_size(0) * 2 + id; //Sort first vector input1 = g_data[global_start]; input2 = g_data[global_start+1]; comp = abs(input1 > shuffle(input1, mask1)); input1 = shuffle(input1, comp ^ swap + add1); comp = abs(input1 > shuffle(input1, mask2)); input1 = shuffle(input1, comp * 2 + add2); comp = abs(input1 > shuffle(input1, mask1)); input1 = shuffle(input1, comp + add1); //Sort second vector comp = abs(input2 < shuffle(input2, mask1)); input2 = shuffle(input2, comp ^ swap + add1); comp = abs(input2 < shuffle(input2, mask2)); input2 = shuffle(input2, comp * 2 + add2); comp = abs(input2 < shuffle(input2, mask1)); input2 = shuffle(input2, comp + add1); // Swap elements dir = get_local_id(0) % 2; temp = input1; comp = (abs(input1 > input2) ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, comp); input2 = shuffle2(input2, temp, comp); VECTOR_SORT_BOOK(input1, dir); VECTOR_SORT_BOOK(input2, dir); l_data[id] = input1; l_data[id+1] = input2; // Perform upper stages for(size = 2; size < get_local_size(0); size <<= 1) { dir = get_local_id(0)/size & 1; //Perform lower stages for(stride = size; stride > 1; stride >>= 1) { barrier(CLK_LOCAL_MEM_FENCE); id = get_local_id(0) + (get_local_id(0)/stride)*stride; VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) } barrier(CLK_LOCAL_MEM_FENCE); id = get_local_id(0) * 2; input1 = l_data[id]; input2 = l_data[id+1]; temp = input1; comp = (abs(input1 > input2) ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, comp); input2 = shuffle2(input2, temp, comp); VECTOR_SORT_BOOK(input1, dir); VECTOR_SORT_BOOK(input2, dir); l_data[id] = input1; l_data[id+1] = input2; } dir = get_group_id(0) % 2; // Perform bitonic merge for(stride = get_local_size(0); stride > 1; stride >>= 1) { barrier(CLK_LOCAL_MEM_FENCE); id = get_local_id(0) + (get_local_id(0)/stride)*stride; VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir) } barrier(CLK_LOCAL_MEM_FENCE); id = get_local_id(0) * 2; input1 = l_data[id]; input2 = l_data[id+1]; temp = input1; comp = (abs(input1 > input2) ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, comp); input2 = shuffle2(input2, temp, comp); VECTOR_SORT_BOOK(input1, dir); VECTOR_SORT_BOOK(input2, dir); g_data[global_start] = input1; g_data[global_start+1] = input2; } //Tested working reference kernel from the addition files. This only works under any operating system /* Perform initial sort */ __kernel void bsort_file(__global float4 *g_data, __local float4 *l_data) { int dir; uint id, global_start, size, stride; float4 input1, input2, temp; int4 comp; uint4 mask1 = (uint4)(1, 0, 3, 2); uint4 mask2 = (uint4)(2, 3, 0, 1); uint4 mask3 = (uint4)(3, 2, 1, 0); int4 add1 = (int4)(1, 1, 3, 3); int4 add2 = (int4)(2, 3, 2, 3); int4 add3 = (int4)(1, 2, 2, 3); id = get_local_id(0) * 2; global_start = get_group_id(0) * get_local_size(0) * 2 + id; input1 = g_data[global_start]; input2 = g_data[global_start+1]; /* Sort input 1 - ascending */ comp = input1 < shuffle(input1, mask1); input1 = shuffle(input1, as_uint4(comp + add1)); comp = input1 < shuffle(input1, mask2); input1 = shuffle(input1, as_uint4(comp * 2 + add2)); comp = input1 < shuffle(input1, mask3); input1 = shuffle(input1, as_uint4(comp + add3)); /* Sort input 2 - descending */ comp = input2 > shuffle(input2, mask1); input2 = shuffle(input2, as_uint4(comp + add1)); comp = input2 > shuffle(input2, mask2); input2 = shuffle(input2, as_uint4(comp * 2 + add2)); comp = input2 > shuffle(input2, mask3); input2 = shuffle(input2, as_uint4(comp + add3)); /* Swap corresponding elements of input 1 and 2 */ add3 = (int4)(4, 5, 6, 7); dir = - (int)(get_local_id(0) % 2); temp = input1; comp = ((input1 < input2) ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, as_uint4(comp)); input2 = shuffle2(input2, temp, as_uint4(comp)); /* Sort data and store in local memory */ VECTOR_SORT_FILE(input1, dir); VECTOR_SORT_FILE(input2, dir); l_data[id] = input1; l_data[id+1] = input2; /* Create bitonic set */ for(size = 2; size < get_local_size(0); size <<= 1) { dir = - (int)(get_local_id(0)/size & 1); for(stride = size; stride > 1; stride >>= 1) { barrier(CLK_LOCAL_MEM_FENCE); id = get_local_id(0) + (get_local_id(0)/stride)*stride; VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) } barrier(CLK_LOCAL_MEM_FENCE); id = get_local_id(0) * 2; input1 = l_data[id]; input2 = l_data[id+1]; temp = input1; comp = ((input1 < input2) ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, as_uint4(comp)); input2 = shuffle2(input2, temp, as_uint4(comp)); VECTOR_SORT_FILE(input1, dir); VECTOR_SORT_FILE(input2, dir); l_data[id] = input1; l_data[id+1] = input2; } /* Perform bitonic merge */ dir = - (int)(get_group_id(0) % 2); for(stride = get_local_size(0); stride > 1; stride >>= 1) { barrier(CLK_LOCAL_MEM_FENCE); id = get_local_id(0) + (get_local_id(0)/stride)*stride; VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir) } barrier(CLK_LOCAL_MEM_FENCE); /* Perform final sort */ id = get_local_id(0) * 2; input1 = l_data[id]; input2 = l_data[id+1]; temp = input1; comp = ((input1 < input2) ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, as_uint4(comp)); input2 = shuffle2(input2, temp, as_uint4(comp)); VECTOR_SORT_FILE(input1, dir); VECTOR_SORT_FILE(input2, dir); g_data[global_start] = input1; g_data[global_start+1] = input2; }