Import Upstream version 0.5.0+dfsg

author: Picca Frédéric-Emmanuel <picca@synchrotron-soleil.fr> 2017-08-18 14:48:52 +0200
committer: Picca Frédéric-Emmanuel <picca@synchrotron-soleil.fr> 2017-08-18 14:48:52 +0200
commit: f7bdc2acff3c13a6d632c28c4569690ab106eed7 (patch)
tree: 9d67cdb7152ee4e711379e03fe0546c7c3b97303 /silx/resources/opencl
4 files changed, 1301 insertions, 0 deletions
diff --git a/silx/resources/opencl/addition.cl b/silx/resources/opencl/addition.cl
new file mode 100644
index 0000000..8ecfd4e
--- /dev/null
+++ b/silx/resources/opencl/addition.cl
@@ -0,0 +1,36 @@
+/*
+ *   Project: SIFT: An algorithm for image alignement
+ *
+ *   Copyright (C) 2013-2017 European Synchrotron Radiation Facility
+ *                           Grenoble, France
+ *
+ *   Principal authors: J. Kieffer (kieffer@esrf.fr)
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+__kernel void addition(__global float* a, __global float* b, __global float* res, int N)
+{
+    unsigned int i = get_global_id(0);
+    if( i<N ){
+        res[i] = a[i] + b[i];
+    }
+}
+\ No newline at end of file
diff --git a/silx/resources/opencl/bitonic.cl b/silx/resources/opencl/bitonic.cl
new file mode 100644
index 0000000..f1cb55c
--- /dev/null
+++ b/silx/resources/opencl/bitonic.cl
@@ -0,0 +1,557 @@
+/*############################################################################
+# Sort elements within a vector by Matthew Scarpino,
+# Taken from his book "OpenCL in Action",
+# November 2011 ISBN 9781617290176
+# Original license for the code: "public domain"
+#
+# Originally this code is public domain. The MIT license has been added
+# by J. Kieffer (jerome.kieffer@esrf.eu) to provide a disclaimer.
+# J. Kieffer does not claim authorship of this code developed by .
+#
+# Copyright (c) 2011 Matthew Scarpino
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+#############################################################################*/
+
+#define VECTOR_SORT_BOOK(input, dir) \
+        comp = abs(input > shuffle(input, mask2)) ^ dir; \
+        input = shuffle(input, comp * 2 + add2); \
+        comp = abs(input > shuffle(input, mask1)) ^ dir; \
+        input = shuffle(input, comp + add1); \
+
+
+#define VECTOR_SWAP_BOOK(in1, in2, dir) \
+        input1 = in1; input2 = in2; \
+        comp = (abs(input1 > input2) ^ dir) * 4 + add3; \
+        in1 = shuffle2(input1, input2, comp); \
+        in2 = shuffle2(input2, input1, comp); \
+
+
+// The _FILE extension correspond to the formula found in the "OpenCL in Action" supplementary files
+#define VECTOR_SORT_FILE(input, dir)                              \
+        comp = (input < shuffle(input, mask2)) ^ dir;             \
+        input = shuffle(input, as_uint4(comp * 2 + add2));        \
+        comp = (input < shuffle(input, mask1)) ^ dir;             \
+        input = shuffle(input, as_uint4(comp + add1));            \
+
+
+#define VECTOR_SWAP_FILE(input1, input2, dir)                     \
+        temp = input1;                                            \
+        comp = ((input1 < input2) ^ dir) * 4 + add3;              \
+        input1 = shuffle2(input1, input2, as_uint4(comp));        \
+        input2 = shuffle2(input2, temp, as_uint4(comp));          \
+
+
+
+// Functions to be called from an actual kernel.
+
+static float8 my_sort_file(uint local_id, uint group_id, uint local_size,
+                           float8 input, __local float4 *l_data){
+    float4 input1, input2, temp;
+    float8 output;
+
+	int dir;
+	uint id, size, stride;
+	int4 comp;
+
+	uint4 mask1 = (uint4)(1, 0, 3, 2);
+	uint4 mask2 = (uint4)(2, 3, 0, 1);
+	uint4 mask3 = (uint4)(3, 2, 1, 0);
+
+	int4 add1 = (int4)(1, 1, 3, 3);
+	int4 add2 = (int4)(2, 3, 2, 3);
+	int4 add3 = (int4)(1, 2, 2, 3);
+
+    // retrieve input data
+    input1 = (float4)(input.s0, input.s1, input.s2, input.s3);
+    input2 = (float4)(input.s4, input.s5, input.s6, input.s7);
+
+    // Find global address
+    id = local_id * 2;
+
+	/* Sort input 1 - ascending */
+	comp = input1 < shuffle(input1, mask1);
+	input1 = shuffle(input1, as_uint4(comp + add1));
+	comp = input1 < shuffle(input1, mask2);
+	input1 = shuffle(input1, as_uint4(comp * 2 + add2));
+	comp = input1 < shuffle(input1, mask3);
+	input1 = shuffle(input1, as_uint4(comp + add3));
+
+	/* Sort input 2 - descending */
+	comp = input2 > shuffle(input2, mask1);
+	input2 = shuffle(input2, as_uint4(comp + add1));
+	comp = input2 > shuffle(input2, mask2);
+	input2 = shuffle(input2, as_uint4(comp * 2 + add2));
+	comp = input2 > shuffle(input2, mask3);
+	input2 = shuffle(input2, as_uint4(comp + add3));
+
+	/* Swap corresponding elements of input 1 and 2 */
+	add3 = (int4)(4, 5, 6, 7);
+	dir = - (int) (local_id % 2);
+	temp = input1;
+	comp = ((input1 < input2) ^ dir) * 4 + add3;
+	input1 = shuffle2(input1, input2, as_uint4(comp));
+	input2 = shuffle2(input2, temp, as_uint4(comp));
+
+	/* Sort data and store in local memory */
+	VECTOR_SORT_FILE(input1, dir);
+	VECTOR_SORT_FILE(input2, dir);
+	l_data[id] = input1;
+	l_data[id+1] = input2;
+
+	/* Create bitonic set */
+	for(size = 2; size < local_size; size <<= 1) {
+	  dir = - (int) (local_id/size & 1) ;
+
+	  for(stride = size; stride > 1; stride >>= 1) {
+		 barrier(CLK_LOCAL_MEM_FENCE);
+		 id = local_id + (local_id/stride)*stride;
+		 VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir)
+	  }
+
+	  barrier(CLK_LOCAL_MEM_FENCE);
+	  id = local_id * 2;
+	  input1 = l_data[id];
+	  input2 = l_data[id+1];
+	  temp = input1;
+	  comp = ((input1 < input2) ^ dir) * 4 + add3;
+	  input1 = shuffle2(input1, input2, as_uint4(comp));
+	  input2 = shuffle2(input2, temp, as_uint4(comp));
+	  VECTOR_SORT_FILE(input1, dir);
+	  VECTOR_SORT_FILE(input2, dir);
+	  l_data[id] = input1;
+	  l_data[id+1] = input2;
+	}
+
+	/* Perform bitonic merge */
+	dir = - (int) (group_id % 2);
+	for(stride = local_size; stride > 1; stride >>= 1) {
+	  barrier(CLK_LOCAL_MEM_FENCE);
+	  id = local_id + (local_id/stride)*stride;
+	  VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir)
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* Perform final sort */
+	id = local_id * 2;
+	input1 = l_data[id]; input2 = l_data[id+1];
+	temp = input1;
+	comp = ((input1 < input2) ^ dir) * 4 + add3;
+	input1 = shuffle2(input1, input2, as_uint4(comp));
+	input2 = shuffle2(input2, temp, as_uint4(comp));
+	VECTOR_SORT_FILE(input1, dir);
+	VECTOR_SORT_FILE(input2, dir);
+
+	// setup output and return it
+	output = (float8)(input1, input2);
+	return  output;
+}
+
+static float8 my_sort_book(uint local_id, uint group_id, uint local_size,
+                           float8 input, __local float4 *l_data){
+    float4 input1, input2, temp;
+    float8 output;
+    uint4 comp, swap, mask1, mask2, add1, add2, add3;
+    uint id, dir, size, stride;
+    mask1 = (uint4)(1, 0, 3, 2);
+    swap = (uint4)(0, 0, 1, 1);
+    add1 = (uint4)(0, 0, 2, 2);
+    mask2 = (uint4)(2, 3, 0, 1);
+    add2 = (uint4)(0, 1, 0, 1);
+    add3 = (uint4)(0, 1, 2, 3);
+
+    // retrieve input data
+    input1 = (float4)(input.s0, input.s1, input.s2, input.s3);
+    input2 = (float4)(input.s4, input.s5, input.s6, input.s7);
+
+    // Find global address
+    id = local_id * 2;
+
+    //Sort first vector
+
+    comp = abs(input1 > shuffle(input1, mask1));
+    input1 = shuffle(input1, comp ^ swap + add1);
+    comp = abs(input1 > shuffle(input1, mask2));
+    input1 = shuffle(input1, comp * 2 + add2);
+    comp = abs(input1 > shuffle(input1, mask1));
+    input1 = shuffle(input1, comp + add1);
+
+    //Sort second vector
+    comp = abs(input2 < shuffle(input2, mask1));
+    input2 = shuffle(input2, comp ^ swap + add1);
+    comp = abs(input2 < shuffle(input2, mask2));
+    input2 = shuffle(input2, comp * 2 + add2);
+    comp = abs(input2 < shuffle(input2, mask1));
+    input2 = shuffle(input2, comp + add1);
+
+    // Swap elements
+    dir = local_id % 2;
+    temp = input1;
+    comp = (abs(input1 > input2) ^ dir) * 4 + add3;
+    input1 = shuffle2(input1, input2, comp);
+    input2 = shuffle2(input2, temp, comp);
+    VECTOR_SORT_BOOK(input1, dir);
+    VECTOR_SORT_BOOK(input2, dir);
+    l_data[id] = input1;
+    l_data[id+1] = input2;
+
+    // Perform upper stages
+    for(size = 2; size < local_size;    size <<= 1) {
+        dir = local_id/size & 1;
+
+        //Perform    lower stages
+        for(stride = size; stride > 1; stride >>= 1) {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            id = local_id +    (local_id/stride)*stride;
+            VECTOR_SWAP_BOOK(l_data[id],    l_data[id + stride], dir)
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        id = local_id * 2;
+        input1 = l_data[id];
+        input2 = l_data[id+1];
+        temp = input1;
+        comp = (abs(input1 > input2) ^ dir) * 4 + add3;
+        input1 = shuffle2(input1, input2, comp);
+        input2 = shuffle2(input2, temp, comp);
+        VECTOR_SORT_BOOK(input1, dir);
+        VECTOR_SORT_BOOK(input2, dir);
+        l_data[id] = input1;
+        l_data[id+1] = input2;
+    }
+    dir = group_id % 2;
+
+    // Perform bitonic merge
+    for(stride = local_size; stride > 1; stride >>= 1) {
+        barrier(CLK_LOCAL_MEM_FENCE);
+        id = local_id +    (local_id/stride)*stride;
+        VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir)
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    id = local_id * 2;
+    input1 = l_data[id]; input2 = l_data[id+1];
+    temp = input1;
+    comp = (abs(input1 > input2) ^ dir) * 4 + add3;
+    input1 = shuffle2(input1, input2, comp);
+    input2 = shuffle2(input2, temp, comp);
+    VECTOR_SORT_BOOK(input1, dir);
+    VECTOR_SORT_BOOK(input2, dir);
+
+    // setup output and return it
+    output = (float8)(input1, input2);
+    return  output;
+}
+
+
+
+//////////////
+// Kernels
+//////////////
+
+// Perform the sort on the whole array
+// dim0: wg=number_of_element/8
+
+__kernel void bsort_all(__global float4 *g_data,
+                        __local float4 *l_data) {
+    float4 input1, input2;
+    float8 input, output;
+    uint id, global_start;
+    // Find global address
+    id = get_local_id(0) * 2;
+    global_start = get_group_id(0) * get_local_size(0) * 2 + id;
+
+    input1 = g_data[global_start];
+    input2 = g_data[global_start+1];
+    input = (float8)(input1, input2);
+    output = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0),
+                          input, l_data);
+    input1 = (float4) (output.s0, output.s1, output.s2, output.s3);
+    input2 = (float4) (output.s4, output.s5, output.s6, output.s7);
+    g_data[global_start] = input1;
+    g_data[global_start+1] = input2;
+}
+
+
+// Perform the sort along the horizontal axis of a 2D image
+// dim0 = y: wg=1
+// dim1 = x: wg=number_of_element/8
+__kernel void bsort_horizontal(__global float *g_data,
+                               __local  float4 *l_data) {
+    float8 input, output;
+    uint id, global_start, offset;
+
+    // Find global address
+    offset = get_global_size(1)*get_global_id(0)*8;
+    id = get_local_id(1) * 8;
+    global_start = offset + get_group_id(1) * get_local_size(1) * 8 + id;
+
+    input = (float8)(g_data[global_start    ],
+                     g_data[global_start + 1],
+                     g_data[global_start + 2],
+                     g_data[global_start + 3],
+                     g_data[global_start + 4],
+                     g_data[global_start + 5],
+                     g_data[global_start + 6],
+                     g_data[global_start + 7]);
+
+    output = my_sort_file(get_local_id(1), get_group_id(1), get_local_size(1),
+                          input, l_data);
+
+    g_data[global_start    ] = output.s0;
+    g_data[global_start + 1] = output.s1;
+    g_data[global_start + 2] = output.s2;
+    g_data[global_start + 3] = output.s3;
+    g_data[global_start + 4] = output.s4;
+    g_data[global_start + 5] = output.s5;
+    g_data[global_start + 6] = output.s6;
+    g_data[global_start + 7] = output.s7;
+}
+
+
+// Perform the sort along the vertical axis
+// dim0 = y: wg=number_of_element/8
+// dim1 = x: wg=1
+// check if transposing +bsort_horizontal is not more efficient ?
+
+__kernel void bsort_vertical(__global float *g_data,
+                             __local float4 *l_data) {
+    // we need to read 8 float position along the vertical axis
+    float8 input, output;
+    uint id, global_start, padding;
+
+    // Find global address
+    padding = get_global_size(1);
+    id = get_local_id(0) * 8 * padding + get_global_id(1);
+    global_start = get_group_id(0) * get_local_size(0) * 8 * padding + id;
+
+	input = (float8)(g_data[global_start            ],
+                     g_data[global_start + padding  ],
+                     g_data[global_start + 2*padding],
+                     g_data[global_start + 3*padding],
+                     g_data[global_start + 4*padding],
+                     g_data[global_start + 5*padding],
+                     g_data[global_start + 6*padding],
+                     g_data[global_start + 7*padding]);
+
+      output = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0),
+                            input, l_data);
+      g_data[global_start             ] = output.s0;
+      g_data[global_start + padding   ] = output.s1;
+      g_data[global_start + 2*padding ] = output.s2;
+      g_data[global_start + 3*padding ] = output.s3;
+      g_data[global_start + 4*padding ] = output.s4;
+      g_data[global_start + 5*padding ] = output.s5;
+      g_data[global_start + 6*padding ] = output.s6;
+      g_data[global_start + 7*padding ] = output.s7;
+}
+
+
+//Tested working reference kernel frm the book. This only works under Linux
+__kernel void bsort_book(__global float4 *g_data,
+                         __local float4 *l_data) {
+    float4 input1, input2, temp;
+    uint4 comp, swap, mask1, mask2, add1, add2, add3;
+    uint id, dir, global_start, size, stride;
+    mask1 = (uint4)(1, 0, 3, 2);
+    swap = (uint4)(0, 0, 1, 1);
+    add1 = (uint4)(0, 0, 2, 2);
+    mask2 = (uint4)(2, 3, 0, 1);
+    add2 = (uint4)(0, 1, 0, 1);
+    add3 = (uint4)(0, 1, 2, 3);
+
+    // Find global address
+    id = get_local_id(0) * 2;
+    global_start = get_group_id(0) * get_local_size(0) * 2 + id;
+
+    //Sort first vector
+    input1 = g_data[global_start];
+    input2 = g_data[global_start+1];
+    comp = abs(input1 > shuffle(input1, mask1));
+    input1 = shuffle(input1, comp ^ swap + add1);
+    comp = abs(input1 > shuffle(input1, mask2));
+    input1 = shuffle(input1, comp * 2 + add2);
+    comp = abs(input1 > shuffle(input1, mask1));
+    input1 = shuffle(input1, comp + add1);
+
+    //Sort second vector
+    comp = abs(input2 < shuffle(input2, mask1));
+    input2 = shuffle(input2, comp ^ swap + add1);
+    comp = abs(input2 < shuffle(input2, mask2));
+    input2 = shuffle(input2, comp * 2 + add2);
+    comp = abs(input2 < shuffle(input2, mask1));
+    input2 = shuffle(input2, comp + add1);
+
+    // Swap elements
+    dir = get_local_id(0) % 2;
+    temp = input1;
+    comp = (abs(input1 > input2) ^ dir) * 4 + add3;
+    input1 = shuffle2(input1, input2, comp);
+    input2 = shuffle2(input2, temp, comp);
+    VECTOR_SORT_BOOK(input1, dir);
+    VECTOR_SORT_BOOK(input2, dir);
+    l_data[id] = input1;
+    l_data[id+1] = input2;
+
+    // Perform upper stages
+    for(size = 2; size < get_local_size(0);    size <<= 1) {
+        dir = get_local_id(0)/size & 1;
+
+        //Perform    lower stages
+        for(stride = size; stride > 1; stride >>= 1) {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            id = get_local_id(0) +
+            (get_local_id(0)/stride)*stride;
+            VECTOR_SWAP_BOOK(l_data[id],    l_data[id + stride], dir)
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        id = get_local_id(0) * 2;
+        input1 = l_data[id];
+        input2 = l_data[id+1];
+        temp = input1;
+        comp = (abs(input1 > input2) ^ dir) * 4 + add3;
+        input1 = shuffle2(input1, input2, comp);
+        input2 = shuffle2(input2, temp, comp);
+        VECTOR_SORT_BOOK(input1, dir);
+        VECTOR_SORT_BOOK(input2, dir);
+        l_data[id] = input1;
+        l_data[id+1] = input2;
+    }
+    dir = get_group_id(0) % 2;
+    // Perform bitonic merge
+    for(stride = get_local_size(0); stride > 1;    stride >>= 1) {
+        barrier(CLK_LOCAL_MEM_FENCE);
+        id = get_local_id(0) +
+        (get_local_id(0)/stride)*stride;
+        VECTOR_SWAP_BOOK(l_data[id], l_data[id + stride], dir)
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    id = get_local_id(0) * 2;
+    input1 = l_data[id]; input2 = l_data[id+1];
+    temp = input1;
+    comp = (abs(input1 > input2) ^ dir) * 4 + add3;
+    input1 = shuffle2(input1, input2, comp);
+    input2 = shuffle2(input2, temp, comp);
+    VECTOR_SORT_BOOK(input1, dir);
+    VECTOR_SORT_BOOK(input2, dir);
+    g_data[global_start] = input1;
+    g_data[global_start+1] = input2;
+    }
+
+//Tested working reference kernel from the addition files. This only works under any operating system
+/* Perform initial sort */
+__kernel void bsort_file(__global float4 *g_data, __local float4 *l_data) {
+
+   int dir;
+   uint id, global_start, size, stride;
+   float4 input1, input2, temp;
+   int4 comp;
+
+   uint4 mask1 = (uint4)(1, 0, 3, 2);
+   uint4 mask2 = (uint4)(2, 3, 0, 1);
+   uint4 mask3 = (uint4)(3, 2, 1, 0);
+
+   int4 add1 = (int4)(1, 1, 3, 3);
+   int4 add2 = (int4)(2, 3, 2, 3);
+   int4 add3 = (int4)(1, 2, 2, 3);
+
+   id = get_local_id(0) * 2;
+   global_start = get_group_id(0) * get_local_size(0) * 2 + id;
+
+   input1 = g_data[global_start];
+   input2 = g_data[global_start+1];
+
+   /* Sort input 1 - ascending */
+   comp = input1 < shuffle(input1, mask1);
+   input1 = shuffle(input1, as_uint4(comp + add1));
+   comp = input1 < shuffle(input1, mask2);
+   input1 = shuffle(input1, as_uint4(comp * 2 + add2));
+   comp = input1 < shuffle(input1, mask3);
+   input1 = shuffle(input1, as_uint4(comp + add3));
+
+   /* Sort input 2 - descending */
+   comp = input2 > shuffle(input2, mask1);
+   input2 = shuffle(input2, as_uint4(comp + add1));
+   comp = input2 > shuffle(input2, mask2);
+   input2 = shuffle(input2, as_uint4(comp * 2 + add2));
+   comp = input2 > shuffle(input2, mask3);
+   input2 = shuffle(input2, as_uint4(comp + add3));
+
+   /* Swap corresponding elements of input 1 and 2 */
+   add3 = (int4)(4, 5, 6, 7);
+   dir = - (int)(get_local_id(0) % 2);
+   temp = input1;
+   comp = ((input1 < input2) ^ dir) * 4 + add3;
+   input1 = shuffle2(input1, input2, as_uint4(comp));
+   input2 = shuffle2(input2, temp, as_uint4(comp));
+
+   /* Sort data and store in local memory */
+   VECTOR_SORT_FILE(input1, dir);
+   VECTOR_SORT_FILE(input2, dir);
+   l_data[id] = input1;
+   l_data[id+1] = input2;
+
+   /* Create bitonic set */
+   for(size = 2; size < get_local_size(0); size <<= 1) {
+      dir = - (int)(get_local_id(0)/size & 1);
+
+      for(stride = size; stride > 1; stride >>= 1) {
+         barrier(CLK_LOCAL_MEM_FENCE);
+         id = get_local_id(0) + (get_local_id(0)/stride)*stride;
+         VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir)
+      }
+
+      barrier(CLK_LOCAL_MEM_FENCE);
+      id = get_local_id(0) * 2;
+      input1 = l_data[id]; input2 = l_data[id+1];
+      temp = input1;
+      comp = ((input1 < input2) ^ dir) * 4 + add3;
+      input1 = shuffle2(input1, input2, as_uint4(comp));
+      input2 = shuffle2(input2, temp, as_uint4(comp));
+      VECTOR_SORT_FILE(input1, dir);
+      VECTOR_SORT_FILE(input2, dir);
+      l_data[id] = input1;
+      l_data[id+1] = input2;
+   }
+
+   /* Perform bitonic merge */
+   dir = - (int)(get_group_id(0) % 2);
+   for(stride = get_local_size(0); stride > 1; stride >>= 1) {
+      barrier(CLK_LOCAL_MEM_FENCE);
+      id = get_local_id(0) + (get_local_id(0)/stride)*stride;
+      VECTOR_SWAP_FILE(l_data[id], l_data[id + stride], dir)
+   }
+   barrier(CLK_LOCAL_MEM_FENCE);
+
+   /* Perform final sort */
+   id = get_local_id(0) * 2;
+   input1 = l_data[id]; input2 = l_data[id+1];
+   temp = input1;
+   comp = ((input1 < input2) ^ dir) * 4 + add3;
+   input1 = shuffle2(input1, input2, as_uint4(comp));
+   input2 = shuffle2(input2, temp, as_uint4(comp));
+   VECTOR_SORT_FILE(input1, dir);
+   VECTOR_SORT_FILE(input2, dir);
+   g_data[global_start] = input1;
+   g_data[global_start+1] = input2;
+}
+
diff --git a/silx/resources/opencl/medfilt.cl b/silx/resources/opencl/medfilt.cl
new file mode 100644
index 0000000..f1e342b
--- /dev/null
+++ b/silx/resources/opencl/medfilt.cl
@@ -0,0 +1,141 @@
+/*
+ *   Project: Azimuthal regroupping OpenCL kernel for PyFAI.
+ *            Median filter for 1D, 2D and 3D datasets, only 2D for now
+ *
+ *
+ *   Copyright (C) 2017-2017 European Synchrotron Radiation Facility
+ *                           Grenoble, France
+ *
+ *   Principal authors: J. Kieffer (kieffer@esrf.fr)
+ *   Last revision: 07/02/2017
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+
+/*
+ * Needs to be concatenated with bitonic.cl prior to compilation
+*/
+
+/*
+ *  Perform the 2D median filtering of an image 2D image
+ *
+ * dim0 => wg=number_of_element in the tile /8
+ * dim1 = x: wg=1
+
+ *
+ * Actually the workgoup size is a bit more complicated:
+ * if window size = 1,3,5,7: WG1=8
+ * if window size = 9,11,13,15: WG1=32
+ * if window size = 17, ...,21: WG1=64
+ *
+ * More Generally the workgroup size must be: at least: kfs2*(kfs1+7)/8
+ * Each thread treats 8 values aligned vertically, this allows (almost)
+ * coalesced reading between threads in one line of the tile.
+ *
+ * Later on it will be more efficient to re-use the same tile
+ * and slide it vertically by one line.
+ * The additionnal need for shared memory will be kfs2 floats and a float8 as register.
+ *
+ * Theoritically, it should be possible to handle up to windows-size 83x83
+ */
+__kernel void medfilt2d(__global float *image,  // input image
+                        __global float *result, // output array
+                        __local  float4 *l_data,// local storage 4x the number of threads
+                                 int khs1,      // Kernel half-size along dim1 (nb lines)
+                                 int khs2,      // Kernel half-size along dim2 (nb columns)
+                                 int height,    // Image size along dim1 (nb lines)
+                                 int width)     // Image size along dim2 (nb columns)
+{
+    int threadid = get_local_id(0);
+    int wg = get_local_size(0);
+    int x = get_global_id(1);
+
+    if (x < width)
+    {
+        union
+        {
+            float  ary[8];
+            float8 vec;
+        } output, input;
+        input.vec = (float8)(MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT);
+        int kfs1 = 2 * khs1 + 1; //definition of kernel full size
+        int kfs2 = 2 * khs2 + 1;
+        int nbands = (kfs1 + 7) / 8; // 8 elements per thread, aligned vertically in 1 column
+        for (int y=0; y<height; y++)
+        {
+            //Select only the active threads, some may remain inactive
+            int nb_threads =  (nbands * kfs2);
+            int band_nr = threadid / kfs2;
+            int band_id = threadid % kfs2;
+            int pos_x = clamp((int)(x + band_id - khs2), (int) 0, (int) width-1);
+            int max_vec = clamp(kfs1 - 8 * band_nr, 0, 8);
+            if (y == 0)
+            {
+                for (int i=0; i<max_vec; i++)
+                {
+                    if (threadid<nb_threads)
+                    {
+                        int pos_y = clamp((int)(y + 8 * band_nr + i - khs1), (int) 0, (int) height-1);
+                        input.ary[i] = image[pos_x + width * pos_y];
+                    }
+                }
+            }
+            else
+            {
+                //store storage.s0 to some shared memory to retrieve it from another thread.
+                l_data[threadid].s0 = input.vec.s0;
+
+                //Offset to the bottom
+                input.vec = (float8)(input.vec.s1,
+                        input.vec.s2,
+                        input.vec.s3,
+                        input.vec.s4,
+                        input.vec.s5,
+                        input.vec.s6,
+                        input.vec.s7,
+                        MAXFLOAT);
+
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                int read_from = threadid + kfs2;
+                if (read_from < nb_threads)
+                    input.vec.s7 = l_data[read_from].s0;
+                else if (threadid < nb_threads) //we are on the last band
+                {
+                    int pos_y = clamp((int)(y + 8 * band_nr + max_vec - 1 - khs1), (int) 0, (int) height-1);
+                    input.ary[max_vec - 1] = image[pos_x + width * pos_y];
+                }
+
+            }
+
+            //This function is defined in bitonic.cl
+            output.vec = my_sort_file(get_local_id(0), get_group_id(0), get_local_size(0),
+                                       input.vec, l_data);
+
+            size_t target = (kfs1 * kfs2) / 2;
+            if (threadid == (target / 8))
+            {
+                result[y * width + x] = output.ary[target % 8];
+            }
+
+        }
+    }
+}
+
diff --git a/silx/resources/opencl/preprocess.cl b/silx/resources/opencl/preprocess.cl
new file mode 100644
index 0000000..de35c48
--- /dev/null
+++ b/silx/resources/opencl/preprocess.cl
@@ -0,0 +1,567 @@
+/*
+ *   Project: Azimuthal regroupping OpenCL kernel for PyFAI.
+ *            Preprocessing program
+ *
+ *
+ *   Copyright (C) 2012-2017 European Synchrotron Radiation Facility
+ *                           Grenoble, France
+ *
+ *   Principal authors: J. Kieffer (kieffer@esrf.fr)
+ *   Last revision: 19/01/2017
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+/**
+ * \file
+ *
+ * \brief OpenCL kernels for image array casting, array mem-setting and normalizing
+ *
+ * Constant to be provided at build time:
+ *   NIMAGE: size of the image
+ *
+ */
+
+#include "for_eclipse.h"
+
+/**
+ * \brief cast values of an array of int8 into a float output array.
+ *
+ * - array_s8: Pointer to global memory with the input data as signed8 array
+ * - array_float:  Pointer to global memory with the output data as float array
+ */
+__kernel void
+s8_to_float(__global char  *array_s8,
+            __global float *array_float
+)
+{
+  int i = get_global_id(0);
+  //Global memory guard for padding
+  if (i < NIMAGE)
+    array_float[i] = (float)array_s8[i];
+}
+
+
+/**
+ * \brief cast values of an array of uint8 into a float output array.
+ *
+ * - array_u8: Pointer to global memory with the input data as unsigned8 array
+ * - array_float:  Pointer to global memory with the output data as float array
+ */
+__kernel void
+u8_to_float(__global unsigned char  *array_u8,
+            __global float *array_float
+)
+{
+  int i = get_global_id(0);
+  //Global memory guard for padding
+  if (i < NIMAGE)
+    array_float[i] = (float)array_u8[i];
+}
+
+
+/**
+ * \brief cast values of an array of int16 into a float output array.
+ *
+ * - array_s16: Pointer to global memory with the input data as signed16 array
+ * - array_float:  Pointer to global memory with the output data as float array
+ */
+__kernel void
+s16_to_float(__global short *array_s16,
+             __global float *array_float
+)
+{
+  int i = get_global_id(0);
+  //Global memory guard for padding
+  if (i < NIMAGE)
+    array_float[i] = (float)array_s16[i];
+}
+
+
+/**
+ * \brief cast values of an array of uint16 into a float output array.
+ *
+ * - array_u16: Pointer to global memory with the input data as unsigned16 array
+ * - array_float:  Pointer to global memory with the output data as float array
+ */
+__kernel void
+u16_to_float(__global unsigned short  *array_u16,
+             __global float *array_float
+)
+{
+  int i = get_global_id(0);
+  //Global memory guard for padding
+  if (i < NIMAGE)
+    array_float[i] = (float)array_u16[i];
+}
+
+/**
+ * \brief cast values of an array of uint32 into a float output array.
+ *
+ * - array_u32: Pointer to global memory with the input data as unsigned32 array
+ * - array_float:  Pointer to global memory with the output data as float array
+ */
+__kernel void
+u32_to_float(__global unsigned int  *array_u32,
+             __global float *array_float
+)
+{
+  int i = get_global_id(0);
+  //Global memory guard for padding
+  if (i < NIMAGE)
+    array_float[i] = (float)array_u32[i];
+}
+
+/**
+ * \brief convert values of an array of int32 into a float output array.
+ *
+ * - array_int:  Pointer to global memory with the data as unsigned32 array
+ * - array_float:  Pointer to global memory with the data float array
+ */
+__kernel void
+s32_to_float(__global int  *array_int,
+             __global float  *array_float
+)
+{
+  int i = get_global_id(0);
+  //Global memory guard for padding
+  if (i < NIMAGE)
+    array_float[i] = (float)(array_int[i]);
+}
+
+
+/**
+ * Functions to be called from an actual kernel.
+ * \brief Performs the normalization of input image by dark subtraction,
+ *        variance is propagated to second member of the float3
+ *        flatfield, solid angle, polarization and absorption are stored in
+ *        third member of the float3 returned.
+ *
+ * Invalid/Dummy pixels will all have the third-member set to 0, i.e. no weight
+ *
+ * - image           Float pointer to global memory storing the input image.
+ * - do_dark         Bool/int: shall dark-current correction be applied ?
+ * - dark            Float pointer to global memory storing the dark image.
+ * - do_flat         Bool/int: shall flat-field correction be applied ?
+ * - flat            Float pointer to global memory storing the flat image.
+ * - do_solidangle   Bool/int: shall flat-field correction be applied ?
+ * - solidangle      Float pointer to global memory storing the solid angle of each pixel.
+ * - do_polarization Bool/int: shall polarization correction be applied ?
+ * - polarization    Float pointer to global memory storing the polarization of each pixel.
+ * - do_absorption   Bool/int: shall absorption correction be applied ?
+ * - absorption      Float pointer to global memory storing the effective absoption of each pixel.
+ * - do_mask         perform mask correction ?
+ * - mask            Bool/char pointer to mask array
+ * - do_dummy        Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
+ * - dummy           Float: value for bad pixels
+ * - delta_dummy     Float: precision for bad pixel value
+ * - normalization_factor : divide the input by this value
+ *
+**/
+
+static float3 _preproc3(const __global float  *image,
+                        const __global float  *variance,
+                        const          char   do_dark,
+                        const __global float  *dark,
+                        const          char   do_dark_variance,
+                        const __global float  *dark_variance,
+                        const          char   do_flat,
+                        const __global float  *flat,
+                        const          char   do_solidangle,
+                        const __global float  *solidangle,
+                        const          char   do_polarization,
+                        const __global float  *polarization,
+                        const          char   do_absorption,
+                        const __global float  *absorption,
+                        const          char   do_mask,
+                        const __global char   *mask,
+                        const          char   do_dummy,
+                        const          float  dummy,
+                        const          float  delta_dummy,
+                        const          float  normalization_factor)
+{
+    size_t i= get_global_id(0);
+    float3 result = (float3)(0.0, 0.0, 0.0);
+    if (i < NIMAGE)
+    {
+        if ((!do_mask) || (!mask[i]))
+        {
+            result.s0 = image[i];
+            if (variance != 0)
+                result.s1 = variance[i];
+            result.s2 = normalization_factor;
+            if ( (!do_dummy)
+                  ||((delta_dummy != 0.0f) && (fabs(result.s0-dummy) > delta_dummy))
+                  ||((delta_dummy == 0.0f) && (result.s0 != dummy)))
+            {
+                if (do_dark)
+                    result.s0 -= dark[i];
+                if (do_dark_variance)
+                    result.s1 += dark_variance[i];
+                if (do_flat)
+                {
+                    float one_flat = flat[i];
+                    if ( (!do_dummy)
+                         ||((delta_dummy != 0.0f) && (fabs(one_flat-dummy) > delta_dummy))
+                         ||((delta_dummy == 0.0f) && (one_flat != dummy)))
+                        result.s2 *= one_flat;
+                    else
+                        result.s2 = 0.0f;
+                }
+                if (do_solidangle)
+                    result.s2 *= solidangle[i];
+                if (do_polarization)
+                    result.s2 *= polarization[i];
+                if (do_absorption)
+                    result.s2 *= absorption[i];
+                if (isnan(result.s0) || isnan(result.s1) || isnan(result.s2) || (result.s2 == 0.0f))
+                    result = (float3)(0.0, 0.0, 0.0);
+            }
+            else
+            {
+                result = (float3)(0.0, 0.0, 0.0);
+            }//end if do_dummy
+        } // end if mask
+    };//end if NIMAGE
+    return result;
+};//end function
+
+
+/**
+ * \brief Performs the normalization of input image by dark subtraction,
+ *        flatfield, solid angle, polarization and absorption division.
+ *
+ * Intensities of images are corrected by:
+ *  - dark (read-out) noise subtraction
+ *  - Solid angle correction (division)
+ *  - polarization correction (division)
+ *  - flat fiels correction (division)
+ * Corrections are made in place unless the pixel is dummy.
+ * Dummy pixels are left untouched so that they remain dummy
+ *
+ * - image           Float pointer to global memory storing the input image.
+ * - do_dark         Bool/int: shall dark-current correction be applied ?
+ * - dark            Float pointer to global memory storing the dark image.
+ * - do_flat         Bool/int: shall flat-field correction be applied ?
+ * - flat            Float pointer to global memory storing the flat image.
+ * - do_solidangle   Bool/int: shall flat-field correction be applied ?
+ * - solidangle      Float pointer to global memory storing the solid angle of each pixel.
+ * - do_polarization Bool/int: shall polarization correction be applied ?
+ * - polarization    Float pointer to global memory storing the polarization of each pixel.
+ * - do_absorption   Bool/int: shall absorption correction be applied ?
+ * - absorption      Float pointer to global memory storing the effective absoption of each pixel.
+ * - do_mask         perform mask correction ?
+ * - mask            Bool/char pointer to mask array
+ * - do_dummy        Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
+ * - dummy           Float: value for bad pixels
+ * - delta_dummy     Float: precision for bad pixel value
+ * - normalization_factor : divide the input by this value
+ *
+**/
+
+__kernel void
+corrections(const __global float  *image,
+            const          char   do_dark,
+            const __global float  *dark,
+            const          char   do_flat,
+            const __global float  *flat,
+            const          char   do_solidangle,
+            const __global float  *solidangle,
+            const          char   do_polarization,
+            const __global float  *polarization,
+			const          char   do_absorption,
+			const __global float  *absorption,
+            const          char   do_mask,
+            const __global char   *mask,
+            const          char   do_dummy,
+            const          float  dummy,
+            const          float  delta_dummy,
+            const          float  normalization_factor,
+                  __global float  *output
+            )
+{
+    size_t i= get_global_id(0);
+    float3 result = (float3)(0.0, 0.0, 0.0);
+    if (i < NIMAGE)
+    {
+        result = _preproc3(image,
+                            0,
+                            do_dark,
+                            dark,
+                            0,
+                            0,
+                            do_flat,
+                            flat,
+                            do_solidangle,
+                            solidangle,
+                            do_polarization,
+                            polarization,
+                            do_absorption,
+                            absorption,
+                            do_mask,
+                            mask,
+                            do_dummy,
+                            dummy,
+                            delta_dummy,
+                            normalization_factor);
+        if (result.s2 != 0.0f)
+            output[i] = result.s0 / result.s2;
+        else
+            output[i] = dummy;
+    };//end if NIMAGE
+
+};//end kernel
+
+
+/**
+ * \brief Performs Normalization of input image with float2 output (num,denom)
+ *
+ * Intensities of images are corrected by:
+ *  - dark (read-out) noise subtraction for the data
+ *  - Solid angle correction (denominator)
+ *  - polarization correction (denominator)
+ *  - flat fiels correction (denominator)
+ *
+ * Corrections are made out of place.
+ * Dummy pixels set both the numerator and denominator to 0
+ *
+ * - image           Float pointer to global memory storing the input image.
+ * - do_dark         Bool/int: shall dark-current correction be applied ?
+ * - dark            Float pointer to global memory storing the dark image.
+ * - do_flat         Bool/int: shall flat-field correction be applied ?
+ * - flat            Float pointer to global memory storing the flat image.
+ * - do_solidangle   Bool/int: shall flat-field correction be applied ?
+ * - solidangle      Float pointer to global memory storing the solid angle of each pixel.
+ * - do_polarization Bool/int: shall flat-field correction be applied ?
+ * - polarization    Float pointer to global memory storing the polarization of each pixel.
+ * - do_dummy          Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
+ * - dummy             Float: value for bad pixels
+ * - delta_dummy       Float: precision for bad pixel value
+ * - normalization_factor : divide the input by this value
+ *
+ *
+**/
+__kernel void
+corrections2(const __global float  *image,
+             const          char   do_dark,
+             const __global float  *dark,
+             const          char   do_flat,
+             const __global float  *flat,
+             const          char   do_solidangle,
+             const __global float  *solidangle,
+             const          char   do_polarization,
+             const __global float  *polarization,
+             const          char   do_absorption,
+             const __global float  *absorption,
+             const          char   do_mask,
+             const __global char   *mask,
+             const          char   do_dummy,
+             const          float  dummy,
+             const          float  delta_dummy,
+             const          float  normalization_factor,
+                   __global float2  *output
+            )
+{
+    size_t i= get_global_id(0);
+    float3 result = (float3)(0.0, 0.0, 0.0);
+    if (i < NIMAGE)
+    {
+        result = _preproc3(image,
+                            0,
+                            do_dark,
+                            dark,
+                            0,
+                            0,
+                            do_flat,
+                            flat,
+                            do_solidangle,
+                            solidangle,
+                            do_polarization,
+                            polarization,
+                            do_absorption,
+                            absorption,
+                            do_mask,
+                            mask,
+                            do_dummy,
+                            dummy,
+                            delta_dummy,
+                            normalization_factor);
+        output[i] = (float2)(result.s0, result.s2);
+    };//end if NIMAGE
+};//end kernel
+
+/**
+ * \brief Performs Normalization of input image with float3 output (signal, variance, normalization) assuming poissonian signal
+ *
+ * Intensities of images are corrected by:
+ *  - dark (read-out) noise subtraction for the data
+ *  - Solid angle correction (denominator)
+ *  - polarization correction (denominator)
+ *  - flat fiels correction (denominator)
+ *
+ * Corrections are made out of place.
+ * Dummy pixels set both the numerator and denominator to 0
+ *
+ * - image           Float pointer to global memory storing the input image.
+ * - do_dark         Bool/int: shall dark-current correction be applied ?
+ * - dark            Float pointer to global memory storing the dark image.
+ * - do_flat         Bool/int: shall flat-field correction be applied ?
+ * - flat            Float pointer to global memory storing the flat image.
+ * - do_solidangle   Bool/int: shall flat-field correction be applied ?
+ * - solidangle      Float pointer to global memory storing the solid angle of each pixel.
+ * - do_polarization Bool/int: shall flat-field correction be applied ?
+ * - polarization    Float pointer to global memory storing the polarization of each pixel.
+ * - do_dummy          Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
+ * - dummy             Float: value for bad pixels
+ * - delta_dummy       Float: precision for bad pixel value
+ * - normalization_factor : divide the input by this value
+ *
+ *
+**/
+__kernel void
+corrections3Poisson( const __global float  *image,
+                     const          char   do_dark,
+                     const __global float  *dark,
+                     const          char   do_flat,
+                     const __global float  *flat,
+                     const          char   do_solidangle,
+                     const __global float  *solidangle,
+                     const          char   do_polarization,
+                     const __global float  *polarization,
+                     const          char   do_absorption,
+                     const __global float  *absorption,
+                     const          char   do_mask,
+                     const __global char   *mask,
+                     const          char   do_dummy,
+                     const          float  dummy,
+                     const          float  delta_dummy,
+                     const          float  normalization_factor,
+                           __global float3  *output
+            )
+{
+    size_t i= get_global_id(0);
+    float3 result = (float3)(0.0, 0.0, 0.0);
+    if (i < NIMAGE)
+    {
+        result = _preproc3(image,
+                           image,
+                           do_dark,
+                           dark,
+                           do_dark,
+                           dark,
+                           do_flat,
+                           flat,
+                           do_solidangle,
+                           solidangle,
+                           do_polarization,
+                           polarization,
+                           do_absorption,
+                           absorption,
+                           do_mask,
+                           mask,
+                           do_dummy,
+                           dummy,
+                           delta_dummy,
+                           normalization_factor);
+        output[i] = result;
+    };//end if NIMAGE
+};//end kernel
+
+
+/**
+ * \brief Performs Normalization of input image with float3 output (signal, variance, normalization)
+ *
+ * Intensities of images are corrected by:
+ *  - dark (read-out) noise subtraction for the data
+ *  - Solid angle correction (division)
+ *  - polarization correction (division)
+ *  - flat fiels correction (division)
+ * Corrections are made in place unless the pixel is dummy.
+ * Dummy pixels are left untouched so that they remain dummy
+ *
+ * - image              Float pointer to global memory storing the input image.
+ * - do_dark         Bool/int: shall dark-current correction be applied ?
+ * - dark            Float pointer to global memory storing the dark image.
+ * - do_flat         Bool/int: shall flat-field correction be applied ?
+ * - flat            Float pointer to global memory storing the flat image.
+ * - do_solidangle   Bool/int: shall flat-field correction be applied ?
+ * - solidangle      Float pointer to global memory storing the solid angle of each pixel.
+ * - do_polarization Bool/int: shall flat-field correction be applied ?
+ * - polarization    Float pointer to global memory storing the polarization of each pixel.
+ * - do_dummy          Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
+ * - dummy             Float: value for bad pixels
+ * - delta_dummy       Float: precision for bad pixel value
+ * - normalization_factor : divide the input by this value
+ *
+ *
+**/
+
+__kernel void
+corrections3(const __global float  *image,
+             const __global float  *variance,
+             const          char   do_dark,
+             const __global float  *dark,
+             const          char   do_dark_variance,
+             const __global float  *dark_variance,
+             const          char   do_flat,
+             const __global float  *flat,
+             const          char   do_solidangle,
+             const __global float  *solidangle,
+             const          char   do_polarization,
+             const __global float  *polarization,
+             const          char   do_absorption,
+             const __global float  *absorption,
+             const          char   do_mask,
+             const __global char   *mask,
+             const          char   do_dummy,
+             const          float  dummy,
+             const          float  delta_dummy,
+             const          float  normalization_factor,
+                   __global float3  *output
+            )
+{
+    size_t i= get_global_id(0);
+    float3 result = (float3)(0.0, 0.0, 0.0);
+    if (i < NIMAGE)
+    {
+        result = _preproc3( image,
+                            variance,
+                            do_dark,
+                            dark,
+                            do_dark_variance,
+                            dark_variance,
+                            do_flat,
+                            flat,
+                            do_solidangle,
+                            solidangle,
+                            do_polarization,
+                            polarization,
+                            do_absorption,
+                            absorption,
+                            do_mask,
+                            mask,
+                            do_dummy,
+                            dummy,
+                            delta_dummy,
+                            normalization_factor);
+        output[i] = result;
+    };//end if NIMAGE
+};//end kernel
+
+
author	Picca Frédéric-Emmanuel <picca@synchrotron-soleil.fr>	2017-08-18 14:48:52 +0200
committer	Picca Frédéric-Emmanuel <picca@synchrotron-soleil.fr>	2017-08-18 14:48:52 +0200
commit	f7bdc2acff3c13a6d632c28c4569690ab106eed7 (patch)
tree	9d67cdb7152ee4e711379e03fe0546c7c3b97303 /silx/resources/opencl