babl: add u8 code paths to trc/matrix space conversion

This is relevant for ARM; on modern x86_64, at least, it is cheaper to convert to/from float with external loops.
author: Øyvind Kolås <pippin@gimp.org> 2022-01-23 05:21:21 +0100
committer: Øyvind Kolås <pippin@gimp.org> 2022-01-23 05:21:21 +0100
commit: 1eec1880691f66915c64f939008f5aff164933b1 (patch)
tree: fbdecb17edacac7a517a4bfebd5a75735045caaf /babl/base
parent: 950ad3faac1d12f47ed4cd0f70a07ed99ac11e25 (diff)
3 files changed, 167 insertions, 57 deletions
diff --git a/babl/base/babl-rgb-converter.c b/babl/base/babl-rgb-converter.c
index 0f7e2fb..be0159a 100644
--- a/babl/base/babl-rgb-converter.c
+++ b/babl/base/babl-rgb-converter.c
@@ -3,8 +3,6 @@
 #include "base/util.h"
 #include "babl-trc.h"
 #include "babl-base.h"
-///////////////////
-
 
 static void
 prep_conversion (const Babl *babl)
@@ -60,9 +58,26 @@ prep_conversion (const Babl *babl)
   } \
 }while(0)
 
-#define TRC_OUT(rgba_in, rgba_out)  do{\
+#define TRC_IN_u8(rgba_in, rgba_out)  do{ /* u8 -> linear float for R,G,B using source_space's TRCs; expects `source_space` and `samples` in scope at the expansion site */ \
+  if ((source_space->space.trc[0] == source_space->space.trc[1]) && /* same TRC object on all three channels? */ \
+      (source_space->space.trc[1] == source_space->space.trc[2])) \
+  { \
+    const Babl *trc = (void*)source_space->space.trc[0]; \
+    _babl_trc_to_linear_buf_u8_generic(trc, rgba_in, rgba_out, 4, 4, 3, samples); /* one pass: stride 4 in and out, 3 components per sample (4th slot untouched) */ \
+  } \
+  else /* channels have distinct TRCs: convert them one at a time */ \
   { \
     unsigned int c; \
+    for (c = 0; c < 3; c ++) \
+    { \
+      const Babl *trc = (void*)source_space->space.trc[c]; /* TRC belonging to channel c */ \
+      _babl_trc_to_linear_buf_u8_generic (trc, rgba_in + c, rgba_out + c, 4, 4, 1, samples); /* offset both buffers by c, 1 component per stride-4 sample */ \
+    } \
+  } \
+}while(0)
+
+#define TRC_OUT(rgba_in, rgba_out)  do{\
+  { \
     if ((destination_space->space.trc[0] == destination_space->space.trc[1]) && \
         (destination_space->space.trc[1] == destination_space->space.trc[2])) \
     { \
@@ -71,6 +86,7 @@ prep_conversion (const Babl *babl)
     } \
     else \
     { \
+      unsigned int c; \
       for (c = 0; c < 3; c ++) \
       { \
         const Babl *trc = (void*)destination_space->space.trc[c]; \
@@ -81,6 +97,26 @@ prep_conversion (const Babl *babl)
 } while(0)
 
 
+#define TRC_OUT_u8(rgba_in, rgba_out)  do{ /* linear float -> u8 for R,G,B using destination_space's TRCs; expects `destination_space` and `samples` in scope at the expansion site */ \
+  { \
+    if ((destination_space->space.trc[0] == destination_space->space.trc[1]) && /* same TRC object on all three channels? */ \
+        (destination_space->space.trc[1] == destination_space->space.trc[2])) \
+    { \
+      const Babl *trc = (void*)destination_space->space.trc[0]; \
+      _babl_trc_from_linear_buf_u8_generic(trc, rgba_in, rgba_out, 4, 4, 3, samples); /* one pass: stride 4 in and out, 3 components per sample (4th slot untouched) */ \
+    } \
+    else /* channels have distinct TRCs: convert them one at a time */ \
+    { \
+      unsigned int c; \
+      for (c = 0; c < 3; c ++) \
+      { \
+        const Babl *trc = (void*)destination_space->space.trc[c]; /* TRC belonging to channel c */ \
+        _babl_trc_from_linear_buf_u8_generic(trc, rgba_in + c, rgba_out + c, 4, 4, 1, samples); /* offset both buffers by c, 1 component per stride-4 sample */ \
+      } \
+    } \
+  }\
+} while(0)
+
 
 
 static inline void
@@ -146,38 +182,18 @@ universal_nonlinear_rgba_u8_converter (const Babl    *conversion,
                                        void          *data)
 {
   const Babl *destination_space = conversion->conversion.destination->format.space;
-
+  const Babl *source_space = babl_conversion_get_source_space (conversion);
   float * matrixf = data;
-  float * in_trc_lut_red = matrixf + 9;
-  float * in_trc_lut_green  = in_trc_lut_red + 256;
-  float * in_trc_lut_blue  = in_trc_lut_green + 256;
-  unsigned int i;
   uint8_t *rgba_in_u8 = (void*)src_char;
   uint8_t *rgba_out_u8 = (void*)dst_char;
-
-  float rgb[4*samples];
-
-  for (i = 0; i < samples; i++)
+  float rgba[4*samples];
+  for (int i = 0; i < samples * 4; i+=4)
   {
-    rgb[i*4+0]=in_trc_lut_red[rgba_in_u8[i*4+0]];
-    rgb[i*4+1]=in_trc_lut_green[rgba_in_u8[i*4+1]];
-    rgb[i*4+2]=in_trc_lut_blue[rgba_in_u8[i*4+2]];
-    rgba_out_u8[i*4+3] = rgba_in_u8[i*4+3];
-  }
-
-  babl_matrix_mul_vectorff_buf4 (matrixf, rgb, rgb, samples);
-
-  {
-    const Babl *from_trc_red   = (void*)destination_space->space.trc[0];
-    const Babl *from_trc_green = (void*)destination_space->space.trc[1];
-    const Babl *from_trc_blue  = (void*)destination_space->space.trc[2];
-    for (i = 0; i < samples * 4; i+=4)
-    {
-      rgba_out_u8[i+0] = babl_trc_from_linear (from_trc_red,   rgb[i+0]) * 255.5f;
-      rgba_out_u8[i+1] = babl_trc_from_linear (from_trc_green, rgb[i+1]) * 255.5f;
-      rgba_out_u8[i+2] = babl_trc_from_linear (from_trc_blue,  rgb[i+2]) * 255.5f;
-    }
+    rgba_out_u8[i+3] = rgba_in_u8[i+3];
   }
+  TRC_IN_u8(rgba_in_u8, rgba);
+  babl_matrix_mul_vectorff_buf4 (matrixf, rgba, rgba, samples);
+  TRC_OUT_u8(rgba, rgba_out_u8);
 }
 
 
@@ -255,7 +271,7 @@ universal_nonlinear_rgb_u8_converter (const Babl    *conversion,
     rgba_out[i*4+0]=in_trc_lut_red[rgb_in_u8[i*3+0]];
     rgba_out[i*4+1]=in_trc_lut_green[rgb_in_u8[i*3+1]];
     rgba_out[i*4+2]=in_trc_lut_blue[rgb_in_u8[i*3+2]];
-    rgba_out[i*4+3]=rgb_in_u8[i*3+2] * 255.5f;
+    rgba_out[i*4+3]=rgb_in_u8[i*3+2] * 255.0f;
   }
 
   babl_matrix_mul_vectorff_buf4 (matrixf, rgba_out, rgba_out, samples);
@@ -265,7 +281,7 @@ universal_nonlinear_rgb_u8_converter (const Babl    *conversion,
 
     for (i = 0; i < samples; i++)
       for (unsigned int c = 0; c < 3; c ++)
-        rgb_out_u8[i*3+c] = rgba_out[i*4+c] * 255.5f;
+        rgb_out_u8[i*3+c] = rgba_out[i*4+c] * 255.0f;
   }
 
 }
@@ -301,7 +317,6 @@ static inline void babl_matrix_mul_vectorff_buf4_sse2 (const float *mat,
 
 #undef m
 
-
 static inline void
 universal_nonlinear_rgba_converter_sse2 (const Babl    *conversion,
                                          unsigned char *__restrict__ src_char,
@@ -344,36 +359,24 @@ universal_nonlinear_rgba_u8_converter_sse2 (const Babl    *conversion,
                                             long           samples,
                                             void          *data)
 {
+  const Babl *source_space = conversion->conversion.source->format.space;
   const Babl *destination_space = conversion->conversion.destination->format.space;
 
   float * matrixf = data;
-  float * in_trc_lut_red = matrixf + 9;
-  float * in_trc_lut_green = in_trc_lut_red + 256;
-  float * in_trc_lut_blue = in_trc_lut_green + 256;
-  unsigned int i;
   uint8_t *rgba_in_u8 = (void*)src_char;
   uint8_t *rgba_out_u8 = (void*)dst_char;
 
-  float rgba_out[4*samples];
+  float rgba[4*samples];
 
-  for (i = 0; i < samples * 4; i+= 4)
+  for (int i = 0; i < samples*4; i+=4)
   {
-    rgba_out[i+0]=in_trc_lut_red[rgba_in_u8[i+0]];
-    rgba_out[i+1]=in_trc_lut_green[rgba_in_u8[i+1]];
-    rgba_out[i+2]=in_trc_lut_blue[rgba_in_u8[i+2]];
     rgba_out_u8[i+3] = rgba_in_u8[i+3];
   }
+  TRC_IN_u8(rgba_in_u8, rgba);
 
-  babl_matrix_mul_vectorff_buf4_sse2 (matrixf, rgba_out, rgba_out, samples);
-
-  {
-    int c;
-    TRC_OUT(rgba_out, rgba_out);
+  babl_matrix_mul_vectorff_buf4_sse2 (matrixf, rgba, rgba, samples);
 
-    for (i = 0; i < samples * 4; i+= 4)
-      for (c = 0; c < 3; c ++)
-        rgba_out_u8[i+c] = rgba_out[i+c] * 255.5f;
-  }
+  TRC_OUT_u8(rgba, rgba_out_u8);
 }
 
 static inline void
@@ -409,7 +412,7 @@ universal_nonlinear_rgb_u8_converter_sse2 (const Babl    *conversion,
 
     for (i = 0; i < samples; i++)
       for (unsigned c = 0; c < 3; c ++)
-        rgb_out_u8[i*3+c] = rgba_out[i*4+c] * 255.5f;
+        rgb_out_u8[i*3+c] = rgba_out[i*4+c] * 255 + 0.5f;
   }
 }
 
@@ -530,6 +533,7 @@ add_rgb_adapter (Babl *babl,
     else
 #endif
     {
+#if 1
        prep_conversion(babl_conversion_new(
                        babl_format_with_space("RGBA float", space),
                        babl_format_with_space("RGBA float", babl),
@@ -551,7 +555,8 @@ add_rgb_adapter (Babl *babl,
                        babl_format_with_space("R'G'B'A float", space),
                        "linear", universal_nonlinear_rgba_converter,
                        NULL));
-
+#endif
+#if 1
        prep_conversion(babl_conversion_new(
                        babl_format_with_space("R'G'B'A float", space),
                        babl_format_with_space("RGBA float", babl),
@@ -562,6 +567,7 @@ add_rgb_adapter (Babl *babl,
                        babl_format_with_space("RGBA float", space),
                        "linear", universal_nonlinear_rgb_linear_converter,
                        NULL));
+#endif
 
        prep_conversion(babl_conversion_new(
                        babl_format_with_space("R'G'B'A u8", space),
@@ -573,7 +579,7 @@ add_rgb_adapter (Babl *babl,
                        babl_format_with_space("R'G'B'A u8", space),
                        "linear", universal_nonlinear_rgba_u8_converter,
                        NULL));
-
+#if 1
        prep_conversion(babl_conversion_new(
                        babl_format_with_space("R'G'B' u8", space),
                        babl_format_with_space("R'G'B' u8", babl),
@@ -595,8 +601,8 @@ add_rgb_adapter (Babl *babl,
                        babl_format_with_space("R'G'B'A float", babl),
                        "linear", universal_linear_rgb_nonlinear_converter,
                        NULL));
+#endif
     }
-#if 0
     prep_conversion(babl_conversion_new(
                     babl_format_with_space("RGB float", space),
                     babl_format_with_space("RGB float", babl),
@@ -607,7 +613,6 @@ add_rgb_adapter (Babl *babl,
                     babl_format_with_space("RGB float", space),
                     "linear", universal_rgb_converter,
                     NULL));
-#endif
     prep_conversion(babl_conversion_new(
                     babl_format_with_space("Y float", space),
                     babl_format_with_space("Y float", babl),
diff --git a/babl/base/babl-trc.c b/babl/base/babl-trc.c
index 6cb4900..a2f1d6e 100644
--- a/babl/base/babl-trc.c
+++ b/babl/base/babl-trc.c
@@ -211,7 +211,7 @@ _babl_trc_formula_srgb_to_linear (const Babl *trc_,
   float e = trc->lut[5];
   float f = trc->lut[6];
 
-  if (x >= d)
+  if (x >= d)  // OPT can be reduced to be branchless
   {
     return _babl_trc_gamma_to_linear ((Babl *) trc, a * x + b) + e;
   }
@@ -364,6 +364,8 @@ _babl_trc_from_linear_buf_generic (const Babl  *trc_,
   }
 }
 
+
+
 static inline void _babl_trc_linear_buf (const Babl  *trc_,
                                          const float *__restrict__ in, 
                                          float       *__restrict__ out,
@@ -494,6 +496,8 @@ BABL_SIMD_SUFFIX (babl_trc_new) (const char *name,
 
   trc_db[i].fun_to_linear_buf = _babl_trc_to_linear_buf_generic;
   trc_db[i].fun_from_linear_buf = _babl_trc_from_linear_buf_generic;
+  trc_db[i].fun_from_linear_buf_u8 = _babl_trc_from_linear_buf_u8_generic;
+  trc_db[i].fun_to_linear_buf_u8 = _babl_trc_to_linear_buf_u8_generic;
 
   switch (trc_db[i].type)
   {
@@ -502,12 +506,14 @@ BABL_SIMD_SUFFIX (babl_trc_new) (const char *name,
       trc_db[i].fun_from_linear = _babl_trc_linear;
       trc_db[i].fun_from_linear_buf = _babl_trc_linear_buf;
       trc_db[i].fun_to_linear_buf = _babl_trc_linear_buf;
+      //trc_db[i].fun_to_linear_buf_u8 = _babl_trc_linear_buf_u8;
       break;
     case BABL_TRC_FORMULA_GAMMA:
       trc_db[i].fun_to_linear = _babl_trc_gamma_to_linear;
       trc_db[i].fun_from_linear = _babl_trc_gamma_from_linear;
       trc_db[i].fun_to_linear_buf = _babl_trc_gamma_to_linear_buf;
       trc_db[i].fun_from_linear_buf = _babl_trc_gamma_from_linear_buf;
+      //trc_db[i].fun_from_linear_buf_u8 = _babl_trc_gamma_from_linear_buf_u8;
 
       trc_db[i].poly_gamma_to_linear_x0 = POLY_GAMMA_X0;
       trc_db[i].poly_gamma_to_linear_x1 = POLY_GAMMA_X1;
@@ -582,6 +588,7 @@ BABL_SIMD_SUFFIX (babl_trc_new) (const char *name,
       trc_db[i].fun_to_linear = _babl_trc_srgb_to_linear;
       trc_db[i].fun_from_linear = _babl_trc_srgb_from_linear;
       trc_db[i].fun_from_linear_buf = _babl_trc_srgb_from_linear_buf;
+      //trc_db[i].fun_from_linear_buf_u8 = _babl_trc_srgb_from_linear_buf_u8;
       trc_db[i].fun_to_linear_buf = _babl_trc_srgb_to_linear_buf;
       break;
     case BABL_TRC_LUT:
diff --git a/babl/base/babl-trc.h b/babl/base/babl-trc.h
index 79b6891..4556c96 100644
--- a/babl/base/babl-trc.h
+++ b/babl/base/babl-trc.h
@@ -51,6 +51,13 @@ typedef struct
                                       int out_gap,
                                       int components,
                                       int count);
+  void           (*fun_to_linear_buf_u8)(const Babl *trc,
+                                      const uint8_t *in,
+                                      float *out,
+                                      int in_gap,
+                                      int out_gap,
+                                      int components,
+                                      int count);
   void           (*fun_from_linear_buf)(const Babl *trc,
                                       const float *in,
                                       float *out,
@@ -58,6 +65,13 @@ typedef struct
                                       int out_gap,
                                       int components,
                                       int count);
+  void           (*fun_from_linear_buf_u8)(const Babl *trc,
+                                      const float *in,
+                                      uint8_t *out,
+                                      int in_gap,
+                                      int out_gap,
+                                      int components,
+                                      int count);
   BablPolynomial   poly_gamma_to_linear;
   float            poly_gamma_to_linear_x0;
   float            poly_gamma_to_linear_x1;
@@ -67,6 +81,8 @@ typedef struct
   float           *lut;
   float           *inv_lut;
   char             name[128];
+  int valid_u8_lut;
+  float u8_lut[256];
 } BablTRC;
 
 static inline void babl_trc_from_linear_buf (const Babl *trc_,
@@ -79,6 +95,26 @@ static inline void babl_trc_from_linear_buf (const Babl *trc_,
   trc->fun_from_linear_buf (trc_, in, out, in_gap, out_gap, components, count);
 }
 
+/* Dispatch a float -> u8 buffer conversion to the implementation installed
+ * in the TRC's fun_from_linear_buf_u8 slot.  All arguments are forwarded
+ * unchanged; trc_ is passed both as the object whose slot is read and as
+ * the callee's first argument.
+ */
+static inline void
+babl_trc_from_linear_buf_u8 (const Babl  *trc_,
+                             const float *in,
+                             uint8_t     *out,
+                             int          in_gap,
+                             int          out_gap,
+                             int          components,
+                             int          count)
+{
+  ((BablTRC *) (void *) trc_)->fun_from_linear_buf_u8 (trc_, in, out,
+                                                        in_gap, out_gap,
+                                                        components, count);
+}
+
+/* Dispatch a u8 -> float buffer conversion to the implementation installed
+ * in the TRC's fun_to_linear_buf_u8 slot.  All arguments are forwarded
+ * unchanged; trc_ is passed both as the object whose slot is read and as
+ * the callee's first argument.
+ */
+static inline void
+babl_trc_to_linear_buf_u8 (const Babl    *trc_,
+                           const uint8_t *in,
+                           float         *out,
+                           int            in_gap,
+                           int            out_gap,
+                           int            components,
+                           int            count)
+{
+  ((BablTRC *) (void *) trc_)->fun_to_linear_buf_u8 (trc_, in, out,
+                                                      in_gap, out_gap,
+                                                      components, count);
+}
+
 static inline void babl_trc_to_linear_buf (const Babl *trc_,
                                            const float *in, float *out,
                                            int in_gap, int out_gap,
@@ -104,4 +140,66 @@ static inline float babl_trc_to_linear (const Babl *trc_, float value)
 void
 babl_trc_class_init_generic (void);
 
+
+/**
+ * _babl_trc_from_linear_buf_u8_generic:
+ * @trc_:       TRC whose scalar fun_from_linear callback is applied.
+ * @in:         float input buffer.
+ * @out:        u8 output buffer.
+ * @in_gap:     distance, in floats, between consecutive samples of @in.
+ * @out_gap:    distance, in bytes, between consecutive samples of @out.
+ * @components: number of adjacent components converted per sample.
+ * @count:      number of samples.
+ *
+ * Runs every selected component through fun_from_linear, scales the result
+ * by 255, rounds by adding 0.5 and truncating, and stores it clamped to
+ * 0..255.  Components of @out beyond @components in each sample are left
+ * untouched.
+ *
+ * The clamp is done on the floating point value before narrowing so that
+ * negative, oversized and NaN results never reach the integer conversion;
+ * negative and NaN inputs map to 0, anything at or above 255 maps to 255.
+ */
+static inline void
+_babl_trc_from_linear_buf_u8_generic (const Babl  *trc_,
+                                      const float *__restrict__ in,
+                                      uint8_t     *__restrict__ out,
+                                      int          in_gap,
+                                      int          out_gap,
+                                      int          components,
+                                      int          count)
+{
+  BablTRC *trc = (void*)trc_;
+  if (in_gap == out_gap && in_gap == 4 && components == 3)
+  {
+    /* common RGBA layout, spelled out with constant strides */
+    for (int i = 0; i < count; i ++)
+      for (int c = 0; c < 3; c ++)
+      {
+        double val = trc->fun_from_linear (trc_, in[4 * i + c]) * 255.0 + 0.5;
+        out[4 * i + c] = val >= 255.0 ? 255 : val > 0.0 ? (uint8_t) val : 0;
+      }
+  }
+  else
+  {
+    /* arbitrary strides / component counts; must produce the same values
+     * as the branch above (it previously stored 0 for every in-range
+     * sample) */
+    for (int i = 0; i < count; i ++)
+      for (int c = 0; c < components; c ++)
+      {
+        double val = trc->fun_from_linear (trc_, in[in_gap * i + c]) * 255.0 + 0.5;
+        out[out_gap * i + c] = val >= 255.0 ? 255 : val > 0.0 ? (uint8_t) val : 0;
+      }
+  }
+}
+
+/**
+ * _babl_trc_to_linear_buf_u8_generic:
+ * @trc_:       TRC providing the scalar fun_to_linear callback and the
+ *              cached u8_lut table.
+ * @in:         u8 input buffer.
+ * @out:        float output buffer.
+ * @in_gap:     distance, in bytes, between consecutive samples of @in.
+ * @out_gap:    distance, in floats, between consecutive samples of @out.
+ * @components: number of adjacent components converted per sample.
+ * @count:      number of samples.
+ *
+ * Maps each selected u8 component to a float through a 256-entry table
+ * stored in the TRC.  The table is filled on the first call (entry i holds
+ * fun_to_linear (i/255.0f)) and reused afterwards, guarded by the
+ * valid_u8_lut flag.  Components of @out beyond @components in each sample
+ * are left untouched.
+ */
+static inline void
+_babl_trc_to_linear_buf_u8_generic (const Babl    *trc_,
+                                    const uint8_t *__restrict__ in,
+                                    float         *__restrict__ out,
+                                    int            in_gap,
+                                    int            out_gap,
+                                    int            components,
+                                    int            count)
+{
+  BablTRC *trc = (void*)trc_;
+
+  /* build the lookup table once per TRC */
+  if (trc->valid_u8_lut == 0)
+  {
+    int code = 0;
+    while (code < 256)
+    {
+      trc->u8_lut[code] = trc->fun_to_linear (trc_, code/255.0f);
+      code++;
+    }
+    trc->valid_u8_lut = 1;
+  }
+
+  /* common RGBA layout, spelled out with constant strides */
+  if (components == 3 && in_gap == 4 && out_gap == 4)
+  {
+    for (int px = 0; px < count; px++)
+    {
+      const int base = 4 * px;
+      out[base + 0] = trc->u8_lut[in[base + 0]];
+      out[base + 1] = trc->u8_lut[in[base + 1]];
+      out[base + 2] = trc->u8_lut[in[base + 2]];
+    }
+    return;
+  }
+
+  /* arbitrary strides / component counts */
+  for (int px = 0; px < count; px++)
+  {
+    for (int ch = 0; ch < components; ch++)
+    {
+      out[out_gap * px + ch] = trc->u8_lut[in[in_gap * px + ch]];
+    }
+  }
+}
+
+
 #endif
author	Øyvind Kolås <pippin@gimp.org>	2022-01-23 05:21:21 +0100
committer	Øyvind Kolås <pippin@gimp.org>	2022-01-23 05:21:21 +0100
commit	1eec1880691f66915c64f939008f5aff164933b1 (patch)
tree	fbdecb17edacac7a517a4bfebd5a75735045caaf /babl/base
parent	950ad3faac1d12f47ed4cd0f70a07ed99ac11e25 (diff)