2 files changed, 241 insertions, 51 deletions
diff --git a/silx/resources/opencl/doubleword.cl b/silx/resources/opencl/doubleword.cl
new file mode 100644
index 0000000..a0ebfda
--- /dev/null
+++ b/silx/resources/opencl/doubleword.cl
@@ -0,0 +1,115 @@
+/*
+ * OpenCL library for double word floating point calculation using compensated arithmetics
+ *
+ * The theoritical basis can be found in Valentina Popescu's PhD thesis:
+ * Towards fast and certified multi-precision libraries
+ * Reference LYSEN036
+ * http://www.theses.fr/2017LYSEN036
+ * All page number and equation number are refering to this document. 
+ * 
+ * The precision of the calculation (bounds) is provided in ULP (smallest possible mantissa) 
+ * and come from the table 2.2 (page 68 of the thesis).
+ * The number of equivalent FLOP is taken from the table 2.3 (page 69 the thesis). 
+ * Note that FLOP are not all equal: a division is much more expensive than an addition.
+ */
+
+//This library can be expanded to double-double by redefining fp, fp2 and one to double, double2 and 1.0.
+#ifdef DOUBLEDOUBLE
+#define fp double
+#define fp2 double2
+#define one 1.0
+#else
+#define fp float
+#define fp2 float2
+#define one 1.0f
+#endif
+
+/* Nota: i386 computer use x87 registers which are larger than the 32bits precision
+ * which can invalidate the error compensation mechanism.
+ *
+ * We use the trick to declare some variable "volatile" to enforce the actual
+ * precision reduction of those variables.
+*/
+
+#ifndef X87_VOLATILE
+# define X87_VOLATILE
+#endif
+
+//Algorithm 1, p23, theorem 1.1.12. Requires e_x > e_y, valid if |x| > |y| 
+inline fp2 fast_fp_plus_fp(fp x, fp y){
+    X87_VOLATILE fp s = x + y;
+    X87_VOLATILE fp z = s - x;
+    fp e = y - z;
+    return (fp2)(s, e);
+}
+
+//Algorithm 2, p24, same as fast_fp_plus_fp without the condition on e_x and e_y
+inline fp2 fp_plus_fp(fp x, fp y){
+    X87_VOLATILE fp s = x + y;
+    X87_VOLATILE fp xp = s - y;
+    X87_VOLATILE fp yp = s - xp;
+    X87_VOLATILE fp dx = x - xp;
+    X87_VOLATILE fp dy = y - yp;
+    return (fp2)(s, dx+dy);  
+}
+
+//Algorithm 3, p24: multiplication with a FMA
+inline fp2 fp_times_fp(fp x, fp y){
+    fp p = x * y;
+    fp e = fma(x, y, -p);
+    return (fp2)(p, e);  
+}
+
+//Algorithm 7, p38: Addition of a FP to a DW. 10flop bounds:2u²+5u³
+inline fp2 dw_plus_fp(fp2 x, fp y){
+    fp2 s = fp_plus_fp(x.s0, y);
+    X87_VOLATILE fp v = x.s1 + s.s1;
+    return fast_fp_plus_fp(s.s0, v);
+}
+
+//Algorithm 9, p40: addition of two DW: 20flop bounds:3u²+13u³
+inline fp2 dw_plus_dw(fp2 x, fp2 y){
+    fp2 s = fp_plus_fp(x.s0, y.s0);
+    fp2 t = fp_plus_fp(x.s1, y.s1);
+    fp2 v = fast_fp_plus_fp(s.s0, s.s1 + t.s0);
+    return fast_fp_plus_fp(v.s0, t.s1 + v.s1);
+}
+
+//Algorithm 12, p49: Multiplication FP*DW: 6flops bounds:2u²
+inline fp2 dw_times_fp(fp2 x, fp y){
+    fp2 c = fp_times_fp(x.s0, y);
+    return fast_fp_plus_fp(c.s0, fma(x.s1, y, c.s1));
+}
+
+//Algorithm 14, p52: Multiplication DW*DW, 8 flops bounds:6u²
+inline fp2 dw_times_dw(fp2 x, fp2 y){
+    fp2 c = fp_times_fp(x.s0, y.s0);
+    X87_VOLATILE fp l = fma(x.s1, y.s0, x.s0 * y.s1);
+    return fast_fp_plus_fp(c.s0, c.s1 + l);
+}
+
+//Algorithm 17, p55: Division DW / FP, 10flops bounds: 3.5u²
+inline fp2 dw_div_fp(fp2 x, fp y){
+    X87_VOLATILE fp th = x.s0 / y;
+    fp2 pi = fp_times_fp(th, y);
+    fp2 d = x - pi;
+    X87_VOLATILE fp delta = d.s0 + d.s1;
+    X87_VOLATILE fp tl = delta/y; 
+    return fast_fp_plus_fp(th, tl);
+}
+
+//Derived from algorithm 20, p64: Inversion 1/ DW, 22 flops
+inline fp2 inv_dw(fp2 y){
+    X87_VOLATILE fp th = one/y.s0;
+    X87_VOLATILE fp rh = fma(-y.s0, th, one);
+    X87_VOLATILE fp rl = -y.s1 * th;
+    fp2 e = fast_fp_plus_fp(rh, rl);
+    fp2 delta = dw_times_fp(e, th);
+    return dw_plus_fp(delta, th);
+}
+    
+//Algorithm 20, p64: Division DW / DW, 30 flops: bounds:9.8u²
+inline fp2 dw_div_dw(fp2 x, fp2 y){
+    return dw_times_dw(x, inv_dw(y));
+}
+
diff --git a/silx/resources/opencl/statistics.cl b/silx/resources/opencl/statistics.cl
index c7d98db..47d925b 100644
--- a/silx/resources/opencl/statistics.cl
+++ b/silx/resources/opencl/statistics.cl
@@ -3,11 +3,11 @@
  *
  *
  *
- *   Copyright (C) 2012-2017 European Synchrotron Radiation Facility
+ *   Copyright (C) 2012-2021 European Synchrotron Radiation Facility
  *                           Grenoble, France
  *
  *   Principal authors: J. Kieffer (kieffer@esrf.fr)
- *   Last revision: 13/12/2018
+ *   Last revision: 17/05/2021
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -33,10 +33,29 @@
  *
  * \brief OpenCL kernels for min, max, mean and std calculation
  *
- * Constant to be provided at build time:
+ * This module provides two functions to perform the `map` and the `reduce` 
+ * to be used with pyopencl reduction to calculate in a single pass the minimum, 
+ * maximum, sum, count, mean and standart deviation for an array.
+ * 
+ * So beside the reduction mechanisme from pyopencl, this algorithm implementes equations from
+ * https://dbs.ifi.uni-heidelberg.de/files/Team/eschubert/publications/SSDBM18-covariance-authorcopy.pdf
+ *
+ * let A and B be 2 disjoint partition of all elements
+ * 
+ * Omega_A = sum_{i \in A}(omaga_i) The sum of all weights
+ * V_A is the weighted sum of the signal over the partition
+ * VV_A is the weighted sum of deviation squarred
+ * 
+ * With this the mean is V / Omega and the variance equals VV / omega.
+ * 
+ * Redction operator performs:
+ * Omega_{AB} = Omega_A + Omega_B
+ * V_{AB} = V_A + V_B
+ * VV{AB} = VV_A + VV_B + (Omega_A*V_B-Omega_B*V_A)² / (Omega_A * Omega_B * Omega_{AB})
+ *
+ * To avoid any numerical degradation, the doubleword library is used to perform all floating point operations. 
  *
  */
-
 #include "for_eclipse.h"
 
 /* \brief read a value at given position and initialize the float8 for the reduction
@@ -44,12 +63,12 @@
  * The float8 returned contains:
  * s0: minimum value
  * s1: maximum value
- * s2: count number of valid pixels
- * s3: count (error associated to)
- * s4: sum of valid pixels
- * s5: sum (error associated to)
- * s6: variance*count
- * s7: variance*count (error associated to)
+ * s2: Omega_h count number of valid pixels
+ * s3: Omega_l error associated to the count
+ * s4: V_h sum of signal
+ * s5: V_l error associated to the sum of signal
+ * s6: VVh variance*count
+ * s7: VVl error associated to variance*count
  *
  */
 static inline float8 map_statistics(global float* data, int position)
@@ -60,11 +79,12 @@ static inline float8 map_statistics(global float* data, int position)
     if (isfinite(value))
     {
         result = (float8)(value, value, 1.0f, 0.0f, value, 0.0f, 0.0f, 0.0f);
-        //                min     max   cnt   cnt_err  sum   sum_err M  M_err
+        //                min     max   cnt_h cnt_l sum_h  sum_l M2_h  M2_l
     }
     else
     {
         result = (float8)(FLT_MAX, -FLT_MAX, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
+        //                min        max     cnt_h cnt_l sum_h sum_l M2_h  M2_l
     }
     return result;
 }
@@ -89,7 +109,8 @@ static inline float8 map_statistics(global float* data, int position)
 static inline float8 reduce_statistics(float8 a, float8 b)
 {
     float2 sum_a, sum_b, M_a, M_b, count_a, count_b;
-
+    float2 count, sum;
+    float2 M, delta, delta2, omega3;
     //test on count
     if (a.s2 == 0.0f)
     {
@@ -113,39 +134,22 @@ static inline float8 reduce_statistics(float8 a, float8 b)
         M_b = (float2)(b.s6, b.s7);
     }
     // count = count_a + count_b
-    float2 count = compensated_sum(count_a, count_b);
+    count = dw_plus_dw(count_a, count_b);
     // sum = sum_a + sum_b
-    float2 sum = compensated_sum(sum_a, sum_b);
-
-    //delta = avg_b - avg_a
-    //delta^2 = avg_b^2 + avg_a^2 - 2*avg_b*avg_a
-    //coount_a*count_b*delta^2 = count_a/count_b * sum_b^2 + count_b/count_a*sum_a^2 - 2*sum_a*sum_b
-
-    //float2 sum2_a = compensated_mul(sum_a, sum_a);
-    //float2 sum2_b = compensated_mul(sum_b, sum_b);
-    //float2 ca_over_cb = compensated_mul(count_a, compensated_inv(count_b));
-    //float2 cb_over_ca = compensated_mul(count_b, compensated_inv(count_a));
-
-    //float2 delta2cbca = compensated_sum(compensated_sum(
-    //                    compensated_mul(ca_over_cb, sum2_b),
-    //                    compensated_mul(cb_over_ca, sum2_a)),
-    //                    -2.0f * compensated_mul(sum_a, sum_b));
-//////////////
-//    float2 delta = compensated_sum(
-//                  compensated_mul(sum_b, compensated_inv(count_b)),
-//              -1*(compensated_mul(sum_a, compensated_inv(count_a))));
-    float2 delta = compensated_sum(compensated_div(sum_b, count_b),
-                                   -1*compensated_div(sum_a, count_a));
-
-    float2 delta2cbca = compensated_mul(compensated_mul(delta, delta),
-                                        compensated_mul(count_a, count_b));
-    float2 M2 = compensated_sum(compensated_sum(M_a, M_b),
-                                compensated_mul(delta2cbca, compensated_inv(count)));
-    //M2 = M_a + M_b + delta ** 2 * count_a * count_b / (count_a + count_b)
+    sum = dw_plus_dw(sum_a, sum_b);
+    
+    // M2 = M_a + M_b + (Omega_A*V_B-Omega_B*V_A)² / (Omega_A * Omega_B * Omega_{AB})
+    M =  dw_plus_dw(M_a, M_b);
+    delta = dw_plus_dw(dw_times_dw(count_b, sum_a),
+                      -dw_times_dw(count_a, sum_b));
+    delta2 = dw_times_dw(delta, delta);
+    omega3 = dw_times_dw(count, dw_times_dw(count_a, count_b));            
+    M = dw_plus_dw(M, dw_div_dw(delta2, omega3));
+                                    
     float8 result = (float8)(min(a.s0, b.s0), max(a.s1, b.s1),
                              count.s0,        count.s1,
                              sum.s0,          sum.s1,
-                             M2.s0,           M2.s1);
+                             M.s0,            M.s1);
     return result;
 }
 
@@ -157,12 +161,12 @@ static inline float8 reduce_statistics(float8 a, float8 b)
  * The float8 used here contain contains:
  * s0: minimum value
  * s1: maximum value
- * s2: count number of valid pixels
- * s3: count (error associated to)
- * s4: sum of valid pixels
- * s5: sum (error associated to)
- * s6: M=variance*(count-1)
- * s7: M=variance*(count-1) (error associated to)
+ * s2: count number of valid pixels (major)
+ * s3: count number of valid pixels (minor)
+ * s4: sum of valid pixels (major)
+ * s5: sum of valid pixels (minor)
+ * s6: variance*count (major)
+ * s7: variance*count (minor)
  *
  */
 
@@ -194,10 +198,9 @@ static inline float8 reduce_statistics_simple(float8 a, float8 b)
     }
     float count = count_a + count_b;
     float sum = sum_a + sum_b;
-    float delta = sum_a/count_a - sum_b/count_b;
-    float delta2cbca = count_b * count_a * delta * delta;
-    float M2 = M_a + M_b + delta2cbca/count;
-    //M2 = M_a + M_b + delta ** 2 * count_a * count_b / (count_a + count_b)
+    float delta = sum_a*count_b - sum_b*count_a;
+    float delta2 = delta * delta;
+    float M2 = M_a + M_b + delta2/(count*count_a*count_b);
     float8 result = (float8)(min(a.s0, b.s0), max(a.s1, b.s1),
                              count,        0.0f,
                              sum,          0.0f,
@@ -206,3 +209,75 @@ static inline float8 reduce_statistics_simple(float8 a, float8 b)
 }
 
 
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+// unpack a double in two floats such as the  sum of the two is the double number
+static inline float2 unpack_double(double inp){
+    float major = (float) inp;
+    float minor = (float) (inp - major);
+    return (float2)(major, minor);  
+}
+
+// pack two floats into a double
+static inline double pack_double(float major, float minor){
+    return (double)major + (double)minor;
+}
+
+/* \brief reduction function associated to the statistics using double precision arithmetics
+ *
+ * this is described in:
+ * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
+ *
+ * The float8 used here contain contains:
+ * s0: minimum value
+ * s1: maximum value
+ * s2: count number of valid pixels (major)
+ * s3: count number of valid pixels (minor)
+ * s4: sum of valid pixels (major)
+ * s5: sum of valid pixels (minor)
+ * s6: variance*count (major)
+ * s7: variance*count (minor)
+ *
+ */
+
+static inline float8 reduce_statistics_double(float8 a, float8 b)
+{
+    double sum_a, sum_b, M_a, M_b, count_a, count_b;
+
+    //test on count
+    if (a.s2 == 0.0)
+    {
+        return b;
+    }
+    else
+    {
+        count_a = pack_double(a.s2, a.s3);
+        sum_a = pack_double(a.s4,a.s5);
+        M_a = pack_double(a.s6, a.s7);
+    }
+    //test on count
+    if (b.s2 == 0.0)
+    {
+        return a;
+    }
+    else
+    {
+        count_b = pack_double(b.s2, b.s3);
+        sum_b = pack_double(b.s4, b.s5);
+        M_b = pack_double(b.s6, b.s7);
+    }
+    double count = count_a + count_b;
+    double sum = sum_a + sum_b;
+    double delta = sum_a*count_b - sum_b*count_a;
+    double delta2 = delta * delta;
+    double M2 = M_a + M_b + delta2/(count*count_a*count_b);
+    float8 result = (float8)((float2)(min(a.s0, b.s0), max(a.s1, b.s1)),
+                                unpack_double(count),
+                                unpack_double( sum),
+                                unpack_double( M2));
+    return result;
+}
+
+#endif
+\ No newline at end of file