1 files changed, 53 insertions, 13 deletions
diff --git a/silx/resources/opencl/array_utils.cl b/silx/resources/opencl/array_utils.cl
index 60677dc..6f78921 100644
--- a/silx/resources/opencl/array_utils.cl
+++ b/silx/resources/opencl/array_utils.cl
@@ -1,7 +1,6 @@
 /**
  *  2D Memcpy for float* arrays,
- * replacing pyopencl "enqueue_copy" which does not return the expected result
- * when dealing with rectangular buffers.
+ * replacing pyopencl "enqueue_copy" which does not work for rectangular copies.
  * ALL THE SIZES/OFFSETS ARE SPECIFIED IN PIXELS, NOT IN BYTES.
  * In the (x, y) convention, x is the fast index (as in CUDA).
  *
@@ -15,19 +14,60 @@
  *
  */
 kernel void cpy2d(
-                    global float* dst,
-                    global float* src,
-                    int dst_width,
-                    int src_width,
-                    int2 dst_offset,
-                    int2 src_offset,
-                    int2 transfer_shape)
+    global float* dst,
+    global float* src,
+    int dst_width,
+    int src_width,
+    int2 dst_offset,
+    int2 src_offset,
+    int2 transfer_shape)
 {
-    int gidx = get_global_id(0),
-        gidy = get_global_id(1);
-    if (gidx < transfer_shape.x && gidy < transfer_shape.y)
-    {
+    int gidx = get_global_id(0), gidy = get_global_id(1);
+    if (gidx < transfer_shape.x && gidy < transfer_shape.y) {
         dst[(dst_offset.y + gidy)*dst_width + (dst_offset.x + gidx)] = src[(src_offset.y + gidy)*src_width + (src_offset.x + gidx)];
     }
 }
 
+
+// Looks like cfloat_t and cfloat_mul are not working, yet specified in
+// pyopencl documentation. Here we are using float2 as in all available examples
+// #include <pyopencl-complex.h>
+// typedef cfloat_t complex;
+
+static inline float2 complex_mul(float2 a, float2 b) {
+    float2 res = (float2) (0, 0);
+    res.x = a.x * b.x - a.y * b.y;
+    res.y = a.y * b.x + a.x * b.y;
+    return res;
+}
+
+// arr2D *= arr1D (line by line, i.e along fast dim)
+kernel void inplace_complex_mul_2Dby1D(
+    global float2* arr2D,
+    global float2* arr1D,
+    int width,
+    int height)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    if ((x >= width) || (y >= height)) return;
+    int i = y*width + x;
+    arr2D[i] = complex_mul(arr2D[i], arr1D[x]);
+}
+
+
+// arr3D *= arr1D (along fast dim)
+kernel void inplace_complex_mul_3Dby1D(
+    global float2* arr3D,
+    global float2* arr1D,
+    int width,
+    int height,
+    int depth)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+    if ((x >= width) || (y >= height) || (z >= depth)) return;
+    int i = (z*height + y)*width + x;
+    arr3D[i] = complex_mul(arr3D[i], arr1D[x]);
+}