19 files changed, 1355 insertions, 58 deletions
diff --git a/babl/base/babl-base.c b/babl/base/babl-base.c
index 1d93341..8b9cdde 100644
--- a/babl/base/babl-base.c
+++ b/babl/base/babl-base.c
@@ -25,19 +25,19 @@ static void types (void);
 static void models (void);
 
 void
-babl_base_init (void)
+BABL_SIMD_SUFFIX(babl_base_init) (void) /* the macro from babl-base.h appends the build variant's suffix to the name */
 {
   babl_hmpf_on_name_lookups++;
 
   types ();
   models ();
-  babl_formats_init ();
+  BABL_SIMD_SUFFIX (babl_formats_init) (); /* calls the formats initialiser carrying the same variant suffix */
 
   babl_hmpf_on_name_lookups--;
 }
 
 void
-babl_base_destroy (void)
+BABL_SIMD_SUFFIX(babl_base_destroy) (void) /* variant-suffixed; the body is intentionally empty */
 {
   /* done by the destruction of the elemental babl clases */
 }
@@ -50,12 +50,12 @@ babl_base_destroy (void)
 static void
 types (void)
 {
-  babl_base_type_float ();
-  babl_base_type_u15 ();
-  babl_base_type_half ();
-  babl_base_type_u8 ();
-  babl_base_type_u16 ();
-  babl_base_type_u32 ();
+  BABL_SIMD_SUFFIX (babl_base_type_float) (); /* each call targets the type initialiser with this build's variant suffix */
+  BABL_SIMD_SUFFIX (babl_base_type_u15) ();
+  BABL_SIMD_SUFFIX (babl_base_type_half) ();
+  BABL_SIMD_SUFFIX (babl_base_type_u8) ();
+  BABL_SIMD_SUFFIX (babl_base_type_u16) ();
+  BABL_SIMD_SUFFIX (babl_base_type_u32) ();
 }
 
 /*
@@ -67,9 +67,9 @@ static void
 models (void)
 {
   babl_hmpf_on_name_lookups--;
-  babl_base_model_rgb ();
-  babl_base_model_gray ();
-  babl_base_model_cmyk ();
+  BABL_SIMD_SUFFIX (babl_base_model_rgb) ();
+  BABL_SIMD_SUFFIX (babl_base_model_gray) ();
+  BABL_SIMD_SUFFIX (babl_base_model_cmyk) ();
   babl_hmpf_on_name_lookups++;
-  babl_base_model_ycbcr ();
+  BABL_SIMD_SUFFIX (babl_base_model_ycbcr) ();
 }
diff --git a/babl/base/babl-base.h b/babl/base/babl-base.h
index 64f1667..291697b 100644
--- a/babl/base/babl-base.h
+++ b/babl/base/babl-base.h
@@ -19,22 +19,36 @@
 #ifndef _BABL_BASE_H
 #define _BABL_BASE_H
 
+#ifdef ARM_NEON /* variant selection: the first defined of ARM_NEON, X86_64_V2, X86_64_V3 wins */
+#define BABL_SIMD_SUFFIX(symbol) symbol##_arm_neon
+#else
+#ifdef X86_64_V2
+#define BABL_SIMD_SUFFIX(symbol) symbol##_x86_64_v2
+#else 
+#ifdef X86_64_V3
+#define BABL_SIMD_SUFFIX(symbol) symbol##_x86_64_v3
+#else
+#define BABL_SIMD_SUFFIX(symbol) symbol##_generic /* fallback when none of the variant macros is defined */
+#endif
+#endif
+#endif
+
+extern void (*babl_base_init)    (void); /* function pointer, defined elsewhere; presumably set to one suffixed variant at runtime - TODO confirm */
 
-void babl_base_init (void);
-void babl_base_destroy (void);
-void babl_formats_init (void);
+void BABL_SIMD_SUFFIX(babl_base_init)    (void);
+void BABL_SIMD_SUFFIX(babl_base_destroy) (void);
+void BABL_SIMD_SUFFIX(babl_formats_init) (void);
 
-void babl_base_type_half   (void);
-void babl_base_type_float  (void);
-void babl_base_type_u8     (void);
-void babl_base_type_u16    (void);
-void babl_base_type_u15    (void);
-void babl_base_type_u32    (void);
+void BABL_SIMD_SUFFIX(babl_base_type_half) (void);
+void BABL_SIMD_SUFFIX(babl_base_type_float)  (void);
+void BABL_SIMD_SUFFIX(babl_base_type_u8)     (void);
+void BABL_SIMD_SUFFIX(babl_base_type_u16)    (void);
+void BABL_SIMD_SUFFIX(babl_base_type_u15)    (void);
+void BABL_SIMD_SUFFIX(babl_base_type_u32)    (void);
 
-void babl_base_model_pal   (void);
-void babl_base_model_rgb   (void);
-void babl_base_model_cmyk  (void);
-void babl_base_model_gray  (void);
-void babl_base_model_ycbcr (void);
+void BABL_SIMD_SUFFIX(babl_base_model_rgb)   (void);
+void BABL_SIMD_SUFFIX(babl_base_model_cmyk)  (void);
+void BABL_SIMD_SUFFIX(babl_base_model_gray)  (void);
+void BABL_SIMD_SUFFIX(babl_base_model_ycbcr) (void);
 
 #endif
diff --git a/babl/base/babl-rgb-converter.c b/babl/base/babl-rgb-converter.c
new file mode 100644
index 0000000..e0ba7c3
--- /dev/null
+++ b/babl/base/babl-rgb-converter.c
@@ -0,0 +1,536 @@
+#include "config.h"
+#include "babl-internal.h"
+#include "base/util.h"
+#include "babl-trc.h"
+#include "babl-base.h"
+
+static void /* Attaches a data blob to a conversion: 9 matrix floats followed by three 256-entry float LUTs */
+prep_conversion (const Babl *babl)
+{
+  Babl *conversion = (void*) babl; /* drops const so conversion.data can be assigned below */
+  const Babl *source_space = babl_conversion_get_source_space (conversion);
+  float *matrixf;
+  unsigned int i;
+  float *lut_red;
+  float *lut_green;
+  float *lut_blue;
+
+  double matrix[9];
+  babl_matrix_mul_matrix ( /* combines the destination's XYZtoRGB with the source's RGBtoXYZ into `matrix` */
+     (conversion->conversion.destination)->format.space->space.XYZtoRGB,
+     (conversion->conversion.source)->format.space->space.RGBtoXYZ,
+     matrix);
+
+  matrixf = babl_calloc (sizeof (float), 9 + 256 * 3); // 9 matrix + 3*256 LUT floats; we leak this matrix, which is a singleton
+  babl_matrix_to_float (matrix, matrixf);
+  conversion->conversion.data = matrixf;
+
+  lut_red = matrixf + 9; /* the LUTs follow the matrix inside the same allocation */
+  lut_green = lut_red + 256;
+  lut_blue = lut_green + 256;
+  for (i = 0; i < 256; i++)
+  {
+    lut_red[i] = babl_trc_to_linear (source_space->space.trc[0], i/255.0); /* 8-bit code i -> linear through the source TRC of that channel */
+    lut_green[i] = babl_trc_to_linear (source_space->space.trc[1], i/255.0);
+    lut_blue[i] = babl_trc_to_linear (source_space->space.trc[2], i/255.0);
+  }
+}
+
+#define TRC_IN(rgba_in, rgba_out)  do{ int i; /* needs `samples` and `source_space` in the expanding scope */ \
+  for (i = 0; i < samples; i++) \
+  { \
+    rgba_out[i*4+3] = rgba_in[i*4+3]; /* 4th component is copied through unchanged */ \
+  } \
+  if ((source_space->space.trc[0] == source_space->space.trc[1]) && \
+      (source_space->space.trc[1] == source_space->space.trc[2])) \
+  { /* all three channels share one TRC: a single call covers 3 components per pixel */ \
+    const Babl *trc = (void*)source_space->space.trc[0]; \
+    babl_trc_to_linear_buf(trc, rgba_in, rgba_out, 4, 4, 3, samples); \
+  } \
+  else \
+  { /* distinct TRCs: one pass per channel, 1 component at a stride of 4 */ \
+    unsigned int c; \
+    for (c = 0; c < 3; c ++) \
+    { \
+      const Babl *trc = (void*)source_space->space.trc[c]; \
+      babl_trc_to_linear_buf(trc, rgba_in + c, rgba_out + c, 4, 4, 1, samples); \
+    } \
+  } \
+}while(0)
+
+#define TRC_OUT(rgba_in, rgba_out)  do{ /* needs `samples` and `destination_space` in the expanding scope; 4th component is not touched */ \
+  { \
+    if ((destination_space->space.trc[0] == destination_space->space.trc[1]) && \
+        (destination_space->space.trc[1] == destination_space->space.trc[2])) \
+    { /* all three channels share one TRC: a single call covers 3 components per pixel */ \
+      const Babl *trc = (void*)destination_space->space.trc[0]; \
+      babl_trc_from_linear_buf(trc, rgba_in, rgba_out, 4, 4, 3, samples); \
+    } \
+    else \
+    { /* distinct TRCs: one pass per channel, 1 component at a stride of 4 */ \
+      unsigned int c; \
+      for (c = 0; c < 3; c ++) \
+      { \
+        const Babl *trc = (void*)destination_space->space.trc[c]; \
+        babl_trc_from_linear_buf(trc, rgba_in + c, rgba_out + c, 4, 4, 1, samples); \
+      } \
+    } \
+  }\
+} while(0)
+
+
+static inline void /* 4-float pixels: source TRC to linear, 3x3 matrix, destination TRC from linear */
+universal_nonlinear_rgba_converter (const Babl    *conversion,
+                                    unsigned char *__restrict__ src_char,
+                                    unsigned char *__restrict__ dst_char,
+                                    long           samples,
+                                    void          *data)
+{
+  const Babl *source_space = babl_conversion_get_source_space (conversion); /* read by TRC_IN */
+  const Babl *destination_space = babl_conversion_get_destination_space (conversion); /* read by TRC_OUT */
+
+  float * matrixf = data;
+  float *rgba_in = (void*)src_char;
+  float *rgba_out = (void*)dst_char;
+
+  TRC_IN(rgba_in, rgba_out); /* linearised colour plus copied 4th component land in the destination buffer */
+
+  babl_matrix_mul_vectorff_buf4 (matrixf, rgba_out, rgba_out, samples); /* in place on the destination */
+
+  TRC_OUT(rgba_out, rgba_out); /* in place on the destination */
+}
+
+static inline void /* 4-float pixels: source TRC to linear, then 3x3 matrix; output stays linear */
+universal_nonlinear_rgb_linear_converter (const Babl    *conversion,
+                                          unsigned char *__restrict__ src_char,
+                                          unsigned char *__restrict__ dst_char,
+                                          long           samples,
+                                          void          *data)
+{
+  const Babl *source_space = babl_conversion_get_source_space (conversion); /* read by TRC_IN */
+  float * matrixf = data;
+  float *rgba_in = (void*)src_char;
+  float *rgba_out = (void*)dst_char;
+
+  TRC_IN(rgba_in, rgba_out); /* writes into the destination buffer */
+
+  babl_matrix_mul_vectorff_buf4 (matrixf, rgba_out, rgba_out, samples); /* in place on the destination */
+}
+
+static inline void /* 4-float pixels: 3x3 matrix on the input as-is, then destination TRC from linear */
+universal_linear_rgb_nonlinear_converter (const Babl    *conversion,
+                                          unsigned char *__restrict__ src_char,
+                                          unsigned char *__restrict__ dst_char,
+                                          long           samples,
+                                          void          *data)
+{
+  const Babl *destination_space = conversion->conversion.destination->format.space; /* read by TRC_OUT */
+  float * matrixf = data;
+  float *rgba_in = (void*)src_char;
+  float *rgba_out = (void*)dst_char;
+
+  babl_matrix_mul_vectorff_buf4 (matrixf, rgba_in, rgba_out, samples);
+
+  TRC_OUT(rgba_out, rgba_out); /* in place on the destination */
+}
+
+static inline void /* 4-float pixels: 3x3 matrix only, no TRC step; `conversion` is unused */
+universal_rgba_converter (const Babl    *conversion,
+                          unsigned char *__restrict__ src_char,
+                          unsigned char *__restrict__ dst_char,
+                          long           samples,
+                          void          *data)
+{
+  float *matrixf = data;
+  float *rgba_in = (void*)src_char;
+  float *rgba_out = (void*)dst_char;
+
+  babl_matrix_mul_vectorff_buf4 (matrixf, rgba_in, rgba_out, samples);
+}
+
+static inline void /* 3x3 matrix only, through the buf3 helper; `conversion` is unused */
+universal_rgb_converter (const Babl    *conversion,
+                         unsigned char *__restrict__ src_char,
+                         unsigned char *__restrict__ dst_char,
+                         long           samples,
+                         void          *data)
+{
+  float *matrixf = data;
+  float *rgb_in = (void*)src_char;
+  float *rgb_out = (void*)dst_char;
+
+  babl_matrix_mul_vectorff_buf3 (matrixf, rgb_in, rgb_out, samples);
+}
+
+
+static inline void /* plain byte copy; `conversion` and `data` are unused */
+universal_ya_converter (const Babl    *conversion,
+                        unsigned char *__restrict__ src_char,
+                        unsigned char *__restrict__ dst_char,
+                        long           samples,
+                        void          *data)
+{
+  memcpy (dst_char, src_char, samples * 4 * 2); /* 8 bytes per sample; presumably 2 floats of 4 bytes each (registered for "YA float"/"YaA float") */
+}
+
+static inline void /* plain byte copy; `conversion` and `data` are unused */
+universal_y_converter (const Babl    *conversion,
+                       unsigned char *__restrict__ src_char,
+                       unsigned char *__restrict__ dst_char,
+                       long           samples,
+                       void          *data)
+{
+  memcpy (dst_char, src_char, samples * 4); /* 4 bytes per sample; presumably one 4-byte float (registered for "Y float") */
+}
+
+
+static inline void /* 3-byte pixels in, 3-byte pixels out: LUT linearise, 3x3 matrix, destination TRC, quantise to 8 bit */
+universal_nonlinear_rgb_u8_converter (const Babl    *conversion,
+                                      unsigned char *__restrict__ src_char,
+                                      unsigned char *__restrict__ dst_char,
+                                      long           samples,
+                                      void          *data)
+{
+  const Babl *destination_space = conversion->conversion.destination->format.space; /* read by TRC_OUT */
+
+  float * matrixf = data; /* layout: 9 matrix floats, then the red, green and blue 256-entry LUTs */
+  float * in_trc_lut_red = matrixf + 9;
+  float * in_trc_lut_green = in_trc_lut_red + 256;
+  float * in_trc_lut_blue = in_trc_lut_green + 256;
+  unsigned int i;
+  uint8_t *rgb_in_u8 = (void*)src_char;
+  uint8_t *rgb_out_u8 = (void*)dst_char;
+
+  float rgba_out[4*samples]; /* NOTE(review): VLA sized by `samples`; very large batches could exhaust the stack */
+
+  for (i = 0; i < samples; i++)
+  {
+    rgba_out[i*4+0]=in_trc_lut_red[rgb_in_u8[i*3+0]];
+    rgba_out[i*4+1]=in_trc_lut_green[rgb_in_u8[i*3+1]];
+    rgba_out[i*4+2]=in_trc_lut_blue[rgb_in_u8[i*3+2]];
+    rgba_out[i*4+3]=1.0f; /* 4th lane is never written to the 3-byte output; just keep it defined */
+  }
+
+  babl_matrix_mul_vectorff_buf4 (matrixf, rgba_out, rgba_out, samples);
+  TRC_OUT(rgba_out, rgba_out);
+
+  for (i = 0; i < samples; i++)
+    for (unsigned int c = 0; c < 3; c ++)
+    {
+      /* round to nearest like the SSE2 variant, then clamp: an out-of-range float -> uint8_t conversion is undefined */
+      float v = rgba_out[i*4+c] * 255.0f + 0.5f;
+      rgb_out_u8[i*3+c] = v >= 255.0f ? 255 : (v > 0.0f ? (uint8_t) v : 0); /* NaN falls through to 0 */
+    }
+}
+
+
+#if defined(USE_SSE2)
+
+#define m(matr, j, i)  matr[j*3+i] /* flat index j*3+i into the 9-float matrix */
+
+#include <emmintrin.h>
+
+static inline void babl_matrix_mul_vectorff_buf4_sse2 (const float *mat, /* 3x3 matrix applied to the first 3 of every 4 floats; the 4th is passed through */
+                                                       const float *v_in,
+                                                       float       *v_out,
+                                                       unsigned int samples)
+{
+  const __v4sf m___0 = {m(mat, 0, 0), m(mat, 1, 0), m(mat, 2, 0), 0}; /* weights for the broadcast 1st component */
+  const __v4sf m___1 = {m(mat, 0, 1), m(mat, 1, 1), m(mat, 2, 1), 0}; /* weights for the broadcast 2nd component */
+  const __v4sf m___2 = {m(mat, 0, 2), m(mat, 1, 2), m(mat, 2, 2), 1}; /* weights for the 3rd component; the trailing 1 carries the 4th through */
+  unsigned int i;
+  for (i = 0; i < samples; i ++)
+  {
+    __v4sf a, b, c = _mm_load_ps(&v_in[0]); /* aligned load: v_in must be 16-byte aligned */
+    a = (__v4sf) _mm_shuffle_epi32((__m128i)c, _MM_SHUFFLE(0,0,0,0)); /* a = {v0, v0, v0, v0} */
+    b = (__v4sf) _mm_shuffle_epi32((__m128i)c, _MM_SHUFFLE(1,1,1,1)); /* b = {v1, v1, v1, v1} */
+    c = (__v4sf) _mm_shuffle_epi32((__m128i)c, _MM_SHUFFLE(3,2,2,2)); /* c = {v2, v2, v2, v3} */
+    _mm_store_ps (v_out, m___0 * a + m___1 * b + m___2 * c); /* aligned store: v_out must be 16-byte aligned */
+    v_out += 4;
+    v_in  += 4;
+  }
+  _mm_empty (); /* NOTE(review): this is the MMX EMMS intrinsic, yet no MMX intrinsics are used above - confirm it is needed */
+}
+
+#undef m
+
+static inline void /* same steps as universal_nonlinear_rgba_converter, using the SSE2 matrix multiply */
+universal_nonlinear_rgba_converter_sse2 (const Babl    *conversion,
+                                         unsigned char *__restrict__ src_char,
+                                         unsigned char *__restrict__ dst_char,
+                                         long           samples,
+                                         void          *data)
+{
+  const Babl *source_space = babl_conversion_get_source_space (conversion); /* read by TRC_IN */
+  const Babl *destination_space = babl_conversion_get_destination_space (conversion); /* read by TRC_OUT */
+  float * matrixf = data;
+  float *rgba_in = (void*)src_char;
+  float *rgba_out = (void*)dst_char;
+
+  TRC_IN(rgba_in, rgba_out); /* writes into the destination buffer */
+
+  babl_matrix_mul_vectorff_buf4_sse2 (matrixf, rgba_out, rgba_out, samples); /* in place; uses aligned loads/stores on dst_char */
+
+  TRC_OUT(rgba_out, rgba_out); /* in place on the destination */
+}
+
+
+static inline void /* 4-float pixels: SSE2 3x3 matrix only, no TRC step; `conversion` is unused */
+universal_rgba_converter_sse2 (const Babl *conversion,
+                               unsigned char *__restrict__ src_char,
+                               unsigned char *__restrict__ dst_char,
+                               long samples,
+                               void *data)
+{
+  float *matrixf = data;
+  float *rgba_in = (void*)src_char;
+  float *rgba_out = (void*)dst_char;
+
+  babl_matrix_mul_vectorff_buf4_sse2 (matrixf, rgba_in, rgba_out, samples); /* uses aligned loads/stores on both buffers */
+}
+
+static inline void /* 3-byte pixels in, 3-byte pixels out: LUT linearise, SSE2 3x3 matrix, destination TRC, quantise to 8 bit */
+universal_nonlinear_rgb_u8_converter_sse2 (const Babl    *conversion,
+                                           unsigned char *__restrict__ src_char,
+                                           unsigned char *__restrict__ dst_char,
+                                           long           samples,
+                                           void          *data)
+{
+  const Babl *destination_space = conversion->conversion.destination->format.space; /* read by TRC_OUT */
+
+  float * matrixf = data; /* layout: 9 matrix floats, then the red, green and blue 256-entry LUTs */
+  float * in_trc_lut_red = matrixf + 9;
+  float * in_trc_lut_green = in_trc_lut_red + 256;
+  float * in_trc_lut_blue = in_trc_lut_green + 256;
+  unsigned int i;
+  uint8_t *rgb_in_u8 = (void*)src_char;
+  uint8_t *rgb_out_u8 = (void*)dst_char;
+
+  // The alignment is necessary for SIMD intrinsics in babl_matrix_mul_vectorff_buf4_sse2()
+  float __attribute__ ((aligned (16))) rgba_out[4*samples];
+
+  for (i = 0; i < samples; i++)
+  {
+    rgba_out[i*4+0]=in_trc_lut_red[rgb_in_u8[i*3+0]];
+    rgba_out[i*4+1]=in_trc_lut_green[rgb_in_u8[i*3+1]];
+    rgba_out[i*4+2]=in_trc_lut_blue[rgb_in_u8[i*3+2]];
+    rgba_out[i*4+3]=1.0f; /* the matrix multiply loads all 4 lanes, so the 4th must not be left uninitialised */
+  }
+
+  babl_matrix_mul_vectorff_buf4_sse2 (matrixf, rgba_out, rgba_out, samples);
+  TRC_OUT(rgba_out, rgba_out);
+  for (i = 0; i < samples; i++)
+    for (unsigned c = 0; c < 3; c ++)
+    {
+      float v = rgba_out[i*4+c] * 255 + 0.5f; /* round to nearest */
+      rgb_out_u8[i*3+c] = v >= 255.0f ? 255 : (v > 0.0f ? (uint8_t) v : 0); /* clamp: out-of-range float -> uint8_t is undefined; NaN gives 0 */
+    }
+}
+
+
+static inline void /* same steps as universal_nonlinear_rgb_linear_converter, using the SSE2 matrix multiply */
+universal_nonlinear_rgb_linear_converter_sse2 (const Babl    *conversion,
+                                               unsigned char *__restrict__ src_char,
+                                               unsigned char *__restrict__ dst_char,
+                                               long           samples,
+                                               void          *data)
+{
+  const Babl *source_space = babl_conversion_get_source_space (conversion); /* read by TRC_IN */
+  float * matrixf = data;
+  float *rgba_in  = (void*)src_char;
+  float *rgba_out = (void*)dst_char;
+
+  TRC_IN(rgba_in, rgba_out); /* writes into the destination buffer */
+
+  babl_matrix_mul_vectorff_buf4_sse2 (matrixf, rgba_out, rgba_out, samples); /* in place; uses aligned loads/stores on dst_char */
+}
+
+
+static inline void /* same steps as universal_linear_rgb_nonlinear_converter, using the SSE2 matrix multiply */
+universal_linear_rgb_nonlinear_converter_sse2 (const Babl    *conversion,
+                                               unsigned char *__restrict__ src_char,
+                                               unsigned char *__restrict__ dst_char,
+                                               long           samples,
+                                               void          *data)
+{
+  const Babl *destination_space = conversion->conversion.destination->format.space; /* read by TRC_OUT */
+  float * matrixf = data;
+  float *rgba_in = (void*)src_char;
+  float *rgba_out = (void*)dst_char;
+
+  babl_matrix_mul_vectorff_buf4_sse2 (matrixf, rgba_in, rgba_out, samples); /* uses aligned loads/stores on both buffers */
+
+  TRC_OUT(rgba_out, rgba_out); /* in place on the destination */
+}
+#endif
+
+
+static int /* callback handed to babl_space_class_for_each: registers "linear" conversions between `babl` and `space` */
+add_rgb_adapter (Babl *babl,
+                 void *space)
+{
+  if (babl != space) /* nothing is registered for a space paired with itself */
+  {
+
+#if defined(USE_SSE2)
+    if ((babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE) && /* runtime check: both SSE and SSE2 flags must be set */
+        (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE2))
+    {
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("RGBA float", space),
+                       babl_format_with_space("RGBA float", babl),
+                       "linear", universal_rgba_converter_sse2,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("RGBA float", babl),
+                       babl_format_with_space("RGBA float", space),
+                       "linear", universal_rgba_converter_sse2,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B'A float", space),
+                       babl_format_with_space("R'G'B'A float", babl),
+                       "linear", universal_nonlinear_rgba_converter_sse2,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B'A float", babl),
+                       babl_format_with_space("R'G'B'A float", space),
+                       "linear", universal_nonlinear_rgba_converter_sse2,
+                       NULL));
+
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B'A float", space),
+                       babl_format_with_space("RGBA float", babl),
+                       "linear", universal_nonlinear_rgb_linear_converter_sse2,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B'A float", babl),
+                       babl_format_with_space("RGBA float", space),
+                       "linear", universal_nonlinear_rgb_linear_converter_sse2,
+                       NULL));
+
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("RGBA float", babl),
+                       babl_format_with_space("R'G'B'A float", space),
+                       "linear", universal_linear_rgb_nonlinear_converter_sse2,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("RGBA float", space),
+                       babl_format_with_space("R'G'B'A float", babl),
+                       "linear", universal_linear_rgb_nonlinear_converter_sse2,
+                       NULL));
+
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B' u8", space),
+                       babl_format_with_space("R'G'B' u8", babl),
+                       "linear", universal_nonlinear_rgb_u8_converter_sse2,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B' u8", babl),
+                       babl_format_with_space("R'G'B' u8", space),
+                       "linear", universal_nonlinear_rgb_u8_converter_sse2,
+                       NULL));
+    }
+    else
+#endif
+    { /* plain C converters: same format pairs as the SSE2 branch */
+#if 1
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("RGBA float", space),
+                       babl_format_with_space("RGBA float", babl),
+                       "linear", universal_rgba_converter,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("RGBA float", babl),
+                       babl_format_with_space("RGBA float", space),
+                       "linear", universal_rgba_converter,
+                       NULL));
+
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B'A float", space),
+                       babl_format_with_space("R'G'B'A float", babl),
+                       "linear", universal_nonlinear_rgba_converter,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B'A float", babl),
+                       babl_format_with_space("R'G'B'A float", space),
+                       "linear", universal_nonlinear_rgba_converter,
+                       NULL));
+#endif
+#if 1
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B'A float", space),
+                       babl_format_with_space("RGBA float", babl),
+                       "linear", universal_nonlinear_rgb_linear_converter,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B'A float", babl),
+                       babl_format_with_space("RGBA float", space),
+                       "linear", universal_nonlinear_rgb_linear_converter,
+                       NULL));
+#endif
+
+#if 1
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B' u8", space),
+                       babl_format_with_space("R'G'B' u8", babl),
+                       "linear", universal_nonlinear_rgb_u8_converter,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B' u8", babl),
+                       babl_format_with_space("R'G'B' u8", space),
+                       "linear", universal_nonlinear_rgb_u8_converter,
+                       NULL));
+
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("RGBA float", babl),
+                       babl_format_with_space("R'G'B'A float", space),
+                       "linear", universal_linear_rgb_nonlinear_converter,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("RGBA float", space),
+                       babl_format_with_space("R'G'B'A float", babl),
+                       "linear", universal_linear_rgb_nonlinear_converter,
+                       NULL));
+#endif
+    }
+    prep_conversion(babl_conversion_new( /* from here on: registered regardless of SSE2 support */
+                    babl_format_with_space("RGB float", space),
+                    babl_format_with_space("RGB float", babl),
+                    "linear", universal_rgb_converter,
+                    NULL));
+    prep_conversion(babl_conversion_new(
+                    babl_format_with_space("RGB float", babl),
+                    babl_format_with_space("RGB float", space),
+                    "linear", universal_rgb_converter,
+                    NULL));
+    prep_conversion(babl_conversion_new( /* NOTE(review): the Y, YaA and YA conversions below are each registered in one direction only - confirm intended */
+                    babl_format_with_space("Y float", space),
+                    babl_format_with_space("Y float", babl),
+                    "linear", universal_y_converter,
+                    NULL));
+    prep_conversion(babl_conversion_new(
+                    babl_format_with_space("YaA float", babl),
+                    babl_format_with_space("YaA float", space),
+                    "linear", universal_ya_converter,
+                    NULL));
+    prep_conversion(babl_conversion_new(
+                    babl_format_with_space("YA float", babl),
+                    babl_format_with_space("YA float", space),
+                    "linear", universal_ya_converter,
+                    NULL));
+  }
+  return 0; /* always 0; presumably tells the iterator to keep going - TODO confirm */
+}
+
+/* This function is called the first time a new Babl space is used, i.e. when
+ * a fish is created for it.  It adds conversion hooks that provide the
+ * space's formats with conversions internally, as well as conversions to and
+ * from other RGB spaces.
+ */
+void
+BABL_SIMD_SUFFIX(_babl_space_add_universal_rgb) (const Babl *space); /* prototype directly ahead of the definition; presumably silences missing-prototype warnings */
+void
+BABL_SIMD_SUFFIX(_babl_space_add_universal_rgb) (const Babl *space)
+{
+  babl_space_class_for_each (add_rgb_adapter, (void*)space); /* add_rgb_adapter receives `space` as its second argument */
+}
diff --git a/babl/base/babl-trc.c b/babl/base/babl-trc.c
new file mode 100644
index 0000000..09beb07
--- /dev/null
+++ b/babl/base/babl-trc.c
@@ -0,0 +1,610 @@
+/* babl - dynamically extendable universal pixel conversion library.
+ * Copyright (C) 2017 Øyvind Kolås.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, see
+ * <https://www.gnu.org/licenses/>.
+ */
+
+#define MAX_TRCS   100
+
+/* FIXME: choose parameters more intelligently */
+#define POLY_GAMMA_X0     (  0.5 / 255.0)
+#define POLY_GAMMA_X1     (254.5 / 255.0)
+#define POLY_GAMMA_DEGREE 6
+#define POLY_GAMMA_SCALE  2
+
+#include "config.h"
+#include "babl-internal.h"
+#include "babl-base.h"
+#include "base/util.h"
+
+static BablTRC trc_db[MAX_TRCS];
+
+static inline float /* identity curve: returns the input unchanged; trc_ is unused */
+_babl_trc_linear (const Babl *trc_, 
+                  float       value)
+{
+  return value;
+}
+
+static inline float /* looks x up in trc->inv_lut, blending linearly between adjacent entries */
+babl_trc_lut_from_linear (const Babl *trc_, 
+                          float       x)
+{
+  BablTRC *trc = (void*)trc_;
+  int entry;
+  float ret, diff;
+
+  entry = x * (trc->lut_size-1); /* table position, truncated by the float -> int conversion */
+  diff =  ( (x * (trc->lut_size-1)) - entry); /* remainder, used as the blend weight */
+
+  if (entry >= trc->lut_size -1)
+  { /* at or past the last entry: use it without blending */
+    entry = trc->lut_size - 1;
+    diff = 0.0f;
+  }
+  else if (entry < 0) entry = 0;
+
+  if (diff > 0.0f)
+  {
+    ret = trc->inv_lut[entry] * (1.0f - diff) + trc->inv_lut[entry+1] * diff;
+  }
+  else
+  { /* exact hit, clamped top end, or negative input (diff <= 0) */
+    ret = trc->inv_lut[entry];
+  }
+  return ret;
+}
+
+static inline float /* looks x up in trc->lut, blending linearly between adjacent entries */
+babl_trc_lut_to_linear (const Babl *trc_, 
+                        float       x)
+{
+  BablTRC *trc = (void*)trc_;
+  int entry;
+  float ret, diff;
+
+  entry = x * (trc->lut_size-1); /* table position, truncated by the float -> int conversion */
+  diff =  ( (x * (trc->lut_size-1)) - entry); /* remainder, used as the blend weight */
+
+  if (entry >= trc->lut_size) entry = trc->lut_size - 1; /* clamp into the table */
+  else if (entry < 0) entry = 0;
+
+  if (diff > 0.0f && entry < trc->lut_size - 1) /* blend only while entry+1 is still inside the table */
+  {
+    ret = trc->lut[entry] * (1.0f - diff) + trc->lut[entry+1] * diff;
+  }
+  else
+  {
+    ret = trc->lut[entry];
+  }
+  return ret;
+}
+
+static inline float /* polynomial inside [x0, x1], powf(value, gamma) for other positive values, 0 otherwise */
+_babl_trc_gamma_to_linear (const Babl *trc_, 
+                           float       value)
+{
+  BablTRC *trc = (void*)trc_;
+  if (value >= trc->poly_gamma_to_linear_x0 &&
+      value <= trc->poly_gamma_to_linear_x1)
+    {
+      return babl_polynomial_eval (&trc->poly_gamma_to_linear, value);
+    }
+  else if (value > 0.0f)
+    {
+      return powf (value, trc->gamma);
+    }
+  return 0.0f; /* reached for values <= 0 outside the polynomial range, and for NaN */
+}
+
+static inline float /* polynomial inside [x0, x1], powf(value, rgamma) for other positive values, 0 otherwise */
+_babl_trc_gamma_from_linear (const Babl *trc_, 
+                             float       value)
+{
+  BablTRC *trc = (void*)trc_;
+  if (value >= trc->poly_gamma_from_linear_x0 &&
+      value <= trc->poly_gamma_from_linear_x1)
+    {
+      return babl_polynomial_eval (&trc->poly_gamma_from_linear, value);
+    }
+  else if (value > 0.0f)
+    {
+      return powf (value, trc->rgamma);
+    }
+  return 0.0f; /* reached for values <= 0 outside the polynomial range, and for NaN */
+}
+
+static inline void /* applies _babl_trc_gamma_to_linear to `components` floats of each of `count` pixels */
+_babl_trc_gamma_to_linear_buf (const Babl  *trc_, 
+                               const float *__restrict__ in, 
+                               float       *__restrict__ out, 
+                               int          in_gap, 
+                               int          out_gap, 
+                               int          components, 
+                               int          count)
+{
+  if (in_gap == out_gap && in_gap == 4 && components == 3)
+  { /* same loop with literal stride 4 and 3 components; presumably easier for the compiler to optimise */
+    for (int i = 0; i < count; i ++)
+      for (int c = 0; c < 3; c ++)
+        out[4 * i + c] = _babl_trc_gamma_to_linear (trc_, in[4 *i + c]);
+  }
+  else
+  { /* general case: in_gap / out_gap are the per-pixel strides, counted in floats */
+    for (int i = 0; i < count; i ++)
+      for (int c = 0; c < components; c ++)
+        out[out_gap * i + c] = _babl_trc_gamma_to_linear (trc_, in[in_gap *i + c]);
+  }
+}
+
+static inline void /* applies _babl_trc_gamma_from_linear to `components` floats of each of `count` pixels */
+_babl_trc_gamma_from_linear_buf (const Babl  *trc_, 
+                                 const float *__restrict__ in, 
+                                 float       *__restrict__ out, 
+                                 int          in_gap, 
+                                 int          out_gap, 
+                                 int          components, 
+                                 int          count)
+{
+  if (in_gap == out_gap && in_gap == 4 && components == 3)
+  { /* same loop with literal stride 4 and 3 components; presumably easier for the compiler to optimise */
+    for (int i = 0; i < count; i ++)
+      for (int c = 0; c < 3; c ++)
+        out[4 * i + c] = _babl_trc_gamma_from_linear (trc_, in[4 *i + c]);
+  }
+  else
+  { /* general case: in_gap / out_gap are the per-pixel strides, counted in floats */
+    for (int i = 0; i < count; i ++)
+      for (int c = 0; c < components; c ++)
+        out[out_gap * i + c] = _babl_trc_gamma_from_linear (trc_, in[in_gap *i + c]);
+  }
+}
+
+static inline float /* inverse of _babl_trc_formula_srgb_to_linear: maps a linear value back to the encoded value */
+_babl_trc_formula_srgb_from_linear (const Babl *trc_, 
+                                    float       value)
+{
+  BablTRC *trc = (void*)trc_;
+  float x= value;
+  float a = trc->lut[1]; /* lut[1..6] hold the curve parameters a..f */
+  float b = trc->lut[2];
+  float c = trc->lut[3];
+  float d = trc->lut[4];
+  float e = trc->lut[5];
+  float f = trc->lut[6];
+
+  if (x - f > c * d)  // above c * d + f, the value the forward linear segment takes at the breakpoint d
+  {
+    float v = _babl_trc_gamma_from_linear ((Babl *) trc, x - e); /* forward adds e after the power step, so remove e first */
+    v = (v-b)/a;
+    if (v < 0.0f || v >= 0.0f) /* false only when v is NaN */
+      return v;
+    return 0.0f;
+  }
+  if (c > 0.0f)
+    return (x - f) / c; /* forward linear segment is c * x + f */
+  return 0.0f;
+}
+
+static inline float /* piecewise curve: power segment plus e from the breakpoint d upwards, c * x + f below it */
+_babl_trc_formula_srgb_to_linear (const Babl *trc_, 
+                                  float       value)
+{
+  BablTRC *trc = (void*)trc_;
+  float x= value;
+  float a = trc->lut[1]; /* lut[1..6] hold the curve parameters a..f */
+  float b = trc->lut[2];
+  float c = trc->lut[3];
+  float d = trc->lut[4];
+  float e = trc->lut[5];
+  float f = trc->lut[6];
+
+  if (x >= d)  // OPT can be reduced to be branchless
+  {
+    return _babl_trc_gamma_to_linear ((Babl *) trc, a * x + b) + e;
+  }
+  return c * x + f;
+}
+static inline float /* inverse of _babl_trc_formula_cie_to_linear for inputs above c; returns 0 otherwise */
+_babl_trc_formula_cie_from_linear (const Babl *trc_, 
+                                   float       value)
+{
+  BablTRC *trc = (void*)trc_;
+  float x= value;
+  float a = trc->lut[1]; /* lut[1..3] hold the curve parameters a..c */
+  float b = trc->lut[2];
+  float c = trc->lut[3];
+
+  if (x > c)
+  {
+    float v = _babl_trc_gamma_from_linear ((Babl *) trc, x - c); /* remove the offset c, then undo the power step */
+    v = (v-b)/a;
+    if (v < 0.0f || v >= 0.0f) /* false only when v is NaN */
+      return v;
+  }
+  return 0.0f;
+}
+
+static inline float /* power segment of (a * x + b) plus c when x >= -b / a, the constant c below that */
+_babl_trc_formula_cie_to_linear (const Babl *trc_, 
+                                 float       value)
+{
+  BablTRC *trc = (void*)trc_;
+  float x= value;
+  float a = trc->lut[1]; /* lut[1..3] hold the curve parameters a..c */
+  float b = trc->lut[2];
+  float c = trc->lut[3];
+
+  if (x >= -b / a) /* i.e. a * x + b >= 0 when a is positive */
+  {
+    return _babl_trc_gamma_to_linear ((Babl *) trc, a * x + b) + c;
+  }
+  return c;
+}
+
+
+
+static inline float /* thin wrapper around babl_gamma_2_2_to_linearf; trc_ is unused */
+_babl_trc_srgb_to_linear (const Babl *trc_, 
+                          float       value)
+{
+  return babl_gamma_2_2_to_linearf (value);
+}
+
+static inline float /* thin wrapper around babl_linear_to_gamma_2_2f; trc_ is unused */
+_babl_trc_srgb_from_linear (const Babl *trc_, 
+                            float       value)
+{
+  return babl_linear_to_gamma_2_2f (value);
+}
+
+static inline void /* applies babl_gamma_2_2_to_linearf to `components` floats of each of `count` pixels; trc_ is unused */
+_babl_trc_srgb_to_linear_buf (const Babl  *trc_, 
+                              const float *in, 
+                              float       *out, 
+                              int          in_gap, 
+                              int          out_gap, 
+                              int          components, 
+                              int          count)
+{
+  if (in_gap == out_gap && in_gap == 4 && components == 3)
+  { /* same loop with literal stride 4 and 3 components; presumably easier for the compiler to optimise */
+  for (int i = 0; i < count; i ++)
+    for (int c = 0; c < 3; c++)
+      out[4 * i + c] = babl_gamma_2_2_to_linearf (in[4 * i + c]);
+  }
+  else
+  { /* general case: in_gap / out_gap are the per-pixel strides, counted in floats */
+  for (int i = 0; i < count; i ++)
+    for (int c = 0; c < components; c++)
+      out[out_gap * i + c] = babl_gamma_2_2_to_linearf (in[in_gap * i + c]);
+  }
+}
+
+static inline void
+_babl_trc_srgb_from_linear_buf (const Babl  *trc_,
+                                const float *in,
+                                float       *out,
+                                int          in_gap,
+                                int          out_gap,
+                                int          components,
+                                int          count)
+{
+  if (in_gap == 4 && out_gap == 4 && components == 3)
+  {
+    /* fixed strides: convert 3 components out of every 4 floats */
+    for (int px = 0; px < count; px++)
+      for (int ch = 0; ch < 3; ch++)
+        out[4 * px + ch] = babl_linear_to_gamma_2_2f (in[4 * px + ch]);
+    return;
+  }
+  /* general layout: caller-supplied strides and component count */
+  for (int px = 0; px < count; px++)
+    for (int ch = 0; ch < components; ch++)
+      out[out_gap * px + ch] = babl_linear_to_gamma_2_2f (in[in_gap * px + ch]);
+}
+
+static inline void
+_babl_trc_to_linear_buf_generic (const Babl  *trc_,
+                                 const float *__restrict__ in,
+                                 float       *__restrict__ out,
+                                 int          in_gap,
+                                 int          out_gap,
+                                 int          components,
+                                 int          count)
+{
+  BablTRC *trc = (void*)trc_;
+  if (in_gap == 4 && out_gap == 4 && components == 3)
+  {
+    /* fixed strides: 3 components out of every 4 floats, one callback call each */
+    for (int px = 0; px < count; px++)
+      for (int ch = 0; ch < 3; ch++)
+        out[4 * px + ch] = trc->fun_to_linear (trc_, in[4 * px + ch]);
+    return;
+  }
+  /* general layout: caller-supplied strides and component count */
+  for (int px = 0; px < count; px++)
+    for (int ch = 0; ch < components; ch++)
+      out[out_gap * px + ch] = trc->fun_to_linear (trc_, in[in_gap * px + ch]);
+}
+
+static inline void
+_babl_trc_from_linear_buf_generic (const Babl  *trc_,
+                                   const float *__restrict__ in,
+                                   float       *__restrict__ out,
+                                   int          in_gap,
+                                   int          out_gap,
+                                   int          components,
+                                   int          count)
+{
+  BablTRC *trc = (void*)trc_;
+  if (in_gap == 4 && out_gap == 4 && components == 3)
+  {
+    /* fixed strides: 3 components out of every 4 floats, one callback call each */
+    for (int px = 0; px < count; px++)
+      for (int ch = 0; ch < 3; ch++)
+        out[4 * px + ch] = trc->fun_from_linear (trc_, in[4 * px + ch]);
+    return;
+  }
+  /* general layout: caller-supplied strides and component count */
+  for (int px = 0; px < count; px++)
+    for (int ch = 0; ch < components; ch++)
+      out[out_gap * px + ch] = trc->fun_from_linear (trc_, in[in_gap * px + ch]);
+}
+
+
+
+static inline void _babl_trc_linear_buf (const Babl  *trc_,
+                                         const float *__restrict__ in,
+                                         float       *__restrict__ out,
+                                         int          in_gap,
+                                         int          out_gap,
+                                         int          components,
+                                         int          count)
+{
+  if (in_gap == 4 && out_gap == 4 && components == 3)
+  {
+    /* fixed strides: copy 3 components out of every 4 floats */
+    for (int px = 0; px < count; px++)
+      for (int ch = 0; ch < 3; ch++)
+        out[px * 4 + ch] = in[px * 4 + ch];
+    return;
+  }
+  /* general layout: plain copy with caller-supplied strides, trc_ is unused */
+  for (int px = 0; px < count; px++)
+    for (int ch = 0; ch < components; ch++)
+      out[px * out_gap + ch] = in[px * in_gap + ch];
+}
+
+const Babl *
+BABL_SIMD_SUFFIX (babl_trc_lookup_by_name) (const char *name);
+
+/* linear scan of trc_db; the first entry with class_type == 0 ends the table */
+const Babl *
+BABL_SIMD_SUFFIX (babl_trc_lookup_by_name) (const char *name)
+{
+  for (BablTRC *entry = trc_db; entry->instance.class_type; entry++)
+  {
+    if (strcmp (entry->instance.name, name) == 0)
+      return (Babl*) entry;
+  }
+  babl_log("failed to find trc '%s'\n", name);
+  return NULL;
+}
+
+const Babl *
+BABL_SIMD_SUFFIX (babl_trc_new) (const char *name,
+              BablTRCType type,
+              double      gamma,
+              int         n_lut,
+              float      *lut);
+/* Return the trc_db entry matching the request, registering a new one if needed; NULL when the table is full. */
+const Babl *
+BABL_SIMD_SUFFIX (babl_trc_new) (const char *name,
+              BablTRCType type,
+              double      gamma,
+              int         n_lut,
+              float      *lut)
+{
+  int i=0;
+  static BablTRC trc; /* lookup template; only the fields assigned below are written in this function */
+  trc.instance.class_type = BABL_TRC;
+  trc.instance.id         = 0;
+  trc.type = type;
+  trc.gamma  = gamma > 0.0    ? gamma       : 0.0; /* non-positive gamma is stored as 0 */
+  trc.rgamma = gamma > 0.0001 ? 1.0 / gamma : 0.0; /* reciprocal, or 0 when gamma <= 0.0001 */
+  /* reuse an existing entry when possible: LUT requests match on size and contents */
+  if (n_lut )
+  {
+    for (i = 0; trc_db[i].instance.class_type; i++)
+    {
+    if ( trc_db[i].lut_size == n_lut &&
+         (memcmp (trc_db[i].lut, lut, sizeof (float) * n_lut)==0)
+       )
+      {
+        return (void*)&trc_db[i];
+      }
+    }
+  }
+  else /* otherwise match on the raw bytes from .type through .gamma; NOTE(review): lut contents are not compared on this path */
+  for (i = 0; trc_db[i].instance.class_type; i++)
+  {
+    int offset = ((char*)&trc_db[i].type) - (char*)(&trc_db[i]);
+    int size   = ((char*)&trc_db[i].gamma + sizeof(double)) - ((char*)&trc_db[i].type);
+
+    if (memcmp ((char*)(&trc_db[i]) + offset, ((char*)&trc) + offset, size)==0)
+      {
+        return (void*)&trc_db[i];
+      }
+  }
+  if (i >= MAX_TRCS-1) /* i is the index of the first unused slot after either scan */
+  {
+    babl_log ("too many BablTRCs");
+    return NULL;
+  }
+  trc_db[i]=trc;
+  trc_db[i].instance.name = trc_db[i].name; /* the name lives in the entry's own buffer */
+  if (name)
+    snprintf (trc_db[i].name, sizeof (trc_db[i].name), "%s", name);
+  else if (n_lut)
+    snprintf (trc_db[i].name, sizeof (trc_db[i].name), "lut-trc");
+  else
+    snprintf (trc_db[i].name, sizeof (trc_db[i].name), "trc-%i-%f", type, gamma);
+  /* LUT requests: keep a private copy of the samples and build an inverse table of the same size */
+  if (n_lut)
+  {
+    int j;
+    trc_db[i].lut_size = n_lut;
+    trc_db[i].lut = babl_calloc (sizeof (float), n_lut);
+    memcpy (trc_db[i].lut, lut, sizeof (float) * n_lut);
+    trc_db[i].inv_lut = babl_calloc (sizeof (float), n_lut);
+    /* for each inverse slot j: 16 bisection steps over [0,1] on the forward LUT */
+    for (j = 0; j < n_lut; j++)
+    {
+      int k;
+      double min = 0.0;
+      double max = 1.0;
+      for (k = 0; k < 16; k++)
+      {
+        double guess = (min + max) / 2;
+        float reversed_index = babl_trc_lut_to_linear (BABL(&trc_db[i]), guess) * (n_lut-1.0f);
+
+        if (reversed_index < j)
+        {
+          min = guess;
+        }
+        else if (reversed_index > j)
+        {
+          max = guess;
+        }
+      }
+      trc_db[i].inv_lut[j] = (min + max) / 2;
+    }
+  }
+  /* generic buffer converters by default; some cases below install specialized ones */
+  trc_db[i].fun_to_linear_buf = _babl_trc_to_linear_buf_generic;
+  trc_db[i].fun_from_linear_buf = _babl_trc_from_linear_buf_generic;
+
+  switch (trc_db[i].type)
+  {
+    case BABL_TRC_LINEAR:
+      trc_db[i].fun_to_linear = _babl_trc_linear;
+      trc_db[i].fun_from_linear = _babl_trc_linear;
+      trc_db[i].fun_from_linear_buf = _babl_trc_linear_buf;
+      trc_db[i].fun_to_linear_buf = _babl_trc_linear_buf;
+      break;
+    case BABL_TRC_FORMULA_GAMMA:
+      trc_db[i].fun_to_linear = _babl_trc_gamma_to_linear;
+      trc_db[i].fun_from_linear = _babl_trc_gamma_from_linear;
+      trc_db[i].fun_to_linear_buf = _babl_trc_gamma_to_linear_buf;
+      trc_db[i].fun_from_linear_buf = _babl_trc_gamma_from_linear_buf;
+
+      trc_db[i].poly_gamma_to_linear_x0 = POLY_GAMMA_X0;
+      trc_db[i].poly_gamma_to_linear_x1 = POLY_GAMMA_X1;
+      babl_polynomial_approximate_gamma (&trc_db[i].poly_gamma_to_linear,
+                                         trc_db[i].gamma,
+                                         trc_db[i].poly_gamma_to_linear_x0,
+                                         trc_db[i].poly_gamma_to_linear_x1,
+                                         POLY_GAMMA_DEGREE, POLY_GAMMA_SCALE);
+
+      trc_db[i].poly_gamma_from_linear_x0 = POLY_GAMMA_X0;
+      trc_db[i].poly_gamma_from_linear_x1 = POLY_GAMMA_X1;
+      babl_polynomial_approximate_gamma (&trc_db[i].poly_gamma_from_linear,
+                                         trc_db[i].rgamma,
+                                         trc_db[i].poly_gamma_from_linear_x0,
+                                         trc_db[i].poly_gamma_from_linear_x1,
+                                         POLY_GAMMA_DEGREE, POLY_GAMMA_SCALE);
+      break;
+    case BABL_TRC_FORMULA_CIE:
+      trc_db[i].lut = babl_calloc (sizeof (float), 4);
+      {
+        int j;
+        for (j = 0; j < 4; j++)
+          trc_db[i].lut[j] = lut[j];
+      }
+      trc_db[i].fun_to_linear = _babl_trc_formula_cie_to_linear;
+      trc_db[i].fun_from_linear = _babl_trc_formula_cie_from_linear;
+      /* NOTE(review): lut[4] is read below although only 4 entries were copied above; confirm callers pass >= 5 floats */
+      trc_db[i].poly_gamma_to_linear_x0 = lut[4];
+      trc_db[i].poly_gamma_to_linear_x1 = POLY_GAMMA_X1;
+      babl_polynomial_approximate_gamma (&trc_db[i].poly_gamma_to_linear,
+                                         trc_db[i].gamma,
+                                         trc_db[i].poly_gamma_to_linear_x0,
+                                         trc_db[i].poly_gamma_to_linear_x1,
+                                         POLY_GAMMA_DEGREE, POLY_GAMMA_SCALE);
+
+      trc_db[i].poly_gamma_from_linear_x0 = lut[3] * lut[4];
+      trc_db[i].poly_gamma_from_linear_x1 = POLY_GAMMA_X1;
+      babl_polynomial_approximate_gamma (&trc_db[i].poly_gamma_from_linear,
+                                         trc_db[i].rgamma,
+                                         trc_db[i].poly_gamma_from_linear_x0,
+                                         trc_db[i].poly_gamma_from_linear_x1,
+                                         POLY_GAMMA_DEGREE, POLY_GAMMA_SCALE);
+      break;
+
+    case BABL_TRC_FORMULA_SRGB:
+      trc_db[i].lut = babl_calloc (sizeof (float), 7);
+      {
+        int j;
+        for (j = 0; j < 7; j++)
+          trc_db[i].lut[j] = lut[j];
+      }
+      trc_db[i].fun_to_linear = _babl_trc_formula_srgb_to_linear;
+      trc_db[i].fun_from_linear = _babl_trc_formula_srgb_from_linear;
+
+      trc_db[i].poly_gamma_to_linear_x0 = lut[4];
+      trc_db[i].poly_gamma_to_linear_x1 = POLY_GAMMA_X1;
+      babl_polynomial_approximate_gamma (&trc_db[i].poly_gamma_to_linear,
+                                         trc_db[i].gamma,
+                                         trc_db[i].poly_gamma_to_linear_x0,
+                                         trc_db[i].poly_gamma_to_linear_x1,
+                                         POLY_GAMMA_DEGREE, POLY_GAMMA_SCALE);
+
+      trc_db[i].poly_gamma_from_linear_x0 = lut[3] * lut[4];
+      trc_db[i].poly_gamma_from_linear_x1 = POLY_GAMMA_X1;
+      babl_polynomial_approximate_gamma (&trc_db[i].poly_gamma_from_linear,
+                                         trc_db[i].rgamma,
+                                         trc_db[i].poly_gamma_from_linear_x0,
+                                         trc_db[i].poly_gamma_from_linear_x1,
+                                         POLY_GAMMA_DEGREE, POLY_GAMMA_SCALE);
+      break;
+    case BABL_TRC_SRGB:
+      trc_db[i].fun_to_linear = _babl_trc_srgb_to_linear;
+      trc_db[i].fun_from_linear = _babl_trc_srgb_from_linear;
+      trc_db[i].fun_from_linear_buf = _babl_trc_srgb_from_linear_buf;
+      trc_db[i].fun_to_linear_buf = _babl_trc_srgb_to_linear_buf;
+      break;
+    case BABL_TRC_LUT:
+      trc_db[i].fun_to_linear = babl_trc_lut_to_linear;
+      trc_db[i].fun_from_linear = babl_trc_lut_from_linear;
+      break;
+  }
+  return (Babl*)&trc_db[i];
+}
+
+void
+BABL_SIMD_SUFFIX(babl_trc_class_for_each) (BablEachFunction each_fun,
+                                           void            *user_data);
+
+void
+BABL_SIMD_SUFFIX(babl_trc_class_for_each) (BablEachFunction each_fun,
+                                           void            *user_data)
+{
+  /* visit entries of trc_db in table order; a non-zero return from each_fun stops the walk */
+  int idx = 0;
+  while (trc_db[idx].instance.class_type && !each_fun (BABL (&trc_db[idx]), user_data))
+    idx++;
+}
+
diff --git a/babl/base/babl-trc.h b/babl/base/babl-trc.h
new file mode 100644
index 0000000..1901fd2
--- /dev/null
+++ b/babl/base/babl-trc.h
@@ -0,0 +1,110 @@
+/* babl - dynamically extendable universal pixel conversion library.
+ * Copyright (C) 2017, Øyvind Kolås and others.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, see
+ * <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef _BABL_TRC_H
+#define _BABL_TRC_H
+
+#include <math.h>
+#include <string.h>
+#include "base/util.h"
+#include "babl-polynomial.h"
+
+BABL_CLASS_DECLARE (trc);
+
+typedef enum {BABL_TRC_LINEAR,        /* buffer converter copies values unchanged */
+              BABL_TRC_FORMULA_GAMMA, /* gamma-formula curve; polynomial approximations built from gamma / rgamma */
+              BABL_TRC_SRGB,          /* fixed curve via babl_gamma_2_2_to_linearf / babl_linear_to_gamma_2_2f */
+              BABL_TRC_FORMULA_SRGB,  /* parametric curve, coefficients read from lut[1..6] */
+              BABL_TRC_LUT,           /* sampled curve stored in lut, inverse table in inv_lut */
+              BABL_TRC_FORMULA_CIE}   /* parametric curve, coefficients read from lut[1..3] */
+BablTRCType;
+
+typedef struct /* one transfer-curve entry; babl_trc_new compares the raw bytes from type through gamma, so keep those fields together */
+{
+  BablInstance     instance;  /* class_type also serves as the "slot in use" marker when trc_db is scanned */
+  BablTRCType      type;
+  int              lut_size;  /* number of floats in lut for LUT curves; left at 0 otherwise */
+  double           gamma;     /* non-positive requests are stored as 0 */
+  float            rgamma;    /* 1 / gamma, or 0 when gamma <= 0.0001 */
+  float          (*fun_to_linear)(const Babl *trc, float val);   /* per-value converter, encoded -> linear */
+  float          (*fun_from_linear)(const Babl *trc, float val); /* per-value converter, linear -> encoded */
+
+  void           (*fun_to_linear_buf)(const Babl *trc, /* strided variant: count groups of `components` floats, gaps in floats */
+                                      const float *in,
+                                      float *out,
+                                      int in_gap,
+                                      int out_gap,
+                                      int components,
+                                      int count);
+  void           (*fun_from_linear_buf)(const Babl *trc, /* same layout as fun_to_linear_buf, opposite direction */
+                                      const float *in,
+                                      float *out,
+                                      int in_gap,
+                                      int out_gap,
+                                      int components,
+                                      int count);
+  BablPolynomial   poly_gamma_to_linear;      /* filled by babl_polynomial_approximate_gamma for the formula types */
+  float            poly_gamma_to_linear_x0;   /* x0 / x1 as passed to that approximation */
+  float            poly_gamma_to_linear_x1;
+  BablPolynomial   poly_gamma_from_linear;    /* same, built from rgamma */
+  float            poly_gamma_from_linear_x0;
+  float            poly_gamma_from_linear_x1;
+  float           *lut;       /* LUT samples, or formula coefficients for the FORMULA_SRGB / FORMULA_CIE types */
+  float           *inv_lut;   /* inverse table of lut_size entries; only allocated for LUT requests */
+  char             name[128]; /* storage that instance.name points at */
+  int valid_u8_lut;           /* not referenced in the code visible in this patch section -- TODO document */
+  float u8_lut[256];          /* presumably a cache indexed by 8-bit value; confirm against its users */
+} BablTRC;
+
+static inline void babl_trc_from_linear_buf (const Babl *trc_,
+                                             const float *in, float *out,
+                                             int in_gap, int out_gap,
+                                             int components, int count)
+{
+  /* forward to the buffer converter installed on this TRC */
+  const BablTRC *self = (const void *) trc_;
+  self->fun_from_linear_buf (trc_, in, out, in_gap, out_gap, components, count);
+}
+
+static inline void babl_trc_to_linear_buf (const Babl *trc_,
+                                           const float *in, float *out,
+                                           int in_gap, int out_gap,
+                                           int components, int count)
+{
+  /* forward to the buffer converter installed on this TRC */
+  const BablTRC *self = (const void *) trc_;
+  self->fun_to_linear_buf (trc_, in, out, in_gap, out_gap, components, count);
+}
+
+static inline float babl_trc_from_linear (const Babl *trc_, float value)
+{
+  /* forward to the per-value converter installed on this TRC */
+  return ((const BablTRC *) (const void *) trc_)->fun_from_linear (trc_, value);
+}
+
+static inline float babl_trc_to_linear (const Babl *trc_, float value)
+{
+  /* forward to the per-value converter installed on this TRC */
+  return ((const BablTRC *) (const void *) trc_)->fun_to_linear (trc_, value);
+}
+
+void
+babl_trc_class_init_generic (void);
+
+
+#endif
diff --git a/babl/base/formats.c b/babl/base/formats.c
index bad9d14..03488c4 100644
--- a/babl/base/formats.c
+++ b/babl/base/formats.c
@@ -25,7 +25,7 @@
 #include "babl-base.h"
 
 void
-babl_formats_init (void)
+BABL_SIMD_SUFFIX (babl_formats_init) (void)
 {
   const Babl *types[]={
     babl_type_from_id (BABL_DOUBLE),
@@ -35,7 +35,7 @@ babl_formats_init (void)
     babl_type_from_id (BABL_U16),
     babl_type_from_id (BABL_U32)
   };
-  for (int i = 0; i < sizeof (types)/sizeof(types[0]);i++)
+  for (size_t i = 0; i < sizeof (types) / sizeof(types[0]); i++)
   {
     const Babl *type = types[i];
 
diff --git a/babl/base/meson.build b/babl/base/meson.build
index a78fd84..e59609c 100644
--- a/babl/base/meson.build
+++ b/babl/base/meson.build
@@ -14,10 +14,43 @@ babl_base_sources = [
   'type-u16.c',
   'type-u32.c',
   'type-u8.c',
+  'babl-trc.c',
+  'babl-rgb-converter.c',
 ]
 
 babl_base = static_library('babl_base',
   babl_base_sources,
   include_directories: [rootInclude, bablInclude],
   dependencies: [math, lcms],
-)
+   c_args: common_c_flags + [sse2_cflags]
+) 
+
+if host_cpu_family == 'x86_64'
+
+  babl_base_x86_64_v2 = static_library('babl_base-x86-64-v2',
+    babl_base_sources,
+    include_directories: [rootInclude, bablInclude],
+    dependencies: [math, lcms],
+    c_args: common_c_flags + x86_64_v2_flags
+  )
+
+  babl_base_x86_64_v3 = static_library('babl_base-x86-64-v3',
+    babl_base_sources,
+    include_directories: [rootInclude, bablInclude],
+    dependencies: [math, lcms],
+    c_args: common_c_flags + x86_64_v3_flags
+  )
+
+endif
+
+
+if host_cpu_family == 'arm'
+
+  babl_base_arm_neon = static_library('babl_base-arm-neon',
+    babl_base_sources,
+    include_directories: [rootInclude, bablInclude],
+    dependencies: [math, lcms],
+    c_args: common_c_flags + arm_neon_flags
+  )
+
+endif
diff --git a/babl/base/model-cmyk.c b/babl/base/model-cmyk.c
index 13fdedf..1fa02be 100644
--- a/babl/base/model-cmyk.c
+++ b/babl/base/model-cmyk.c
@@ -613,7 +613,7 @@ cmy_to_rgba (const Babl *conversion,
 #endif
 
 void
-babl_base_model_cmyk (void)
+BABL_SIMD_SUFFIX (babl_base_model_cmyk) (void)
 {
   babl_component_new ("cyan", NULL);
   babl_component_new ("yellow", NULL);
diff --git a/babl/base/model-gray.c b/babl/base/model-gray.c
index 3862400..7441baa 100644
--- a/babl/base/model-gray.c
+++ b/babl/base/model-gray.c
@@ -31,7 +31,7 @@ static void formats (void);
 static void init_single_precision (void);
 
 void 
-babl_base_model_gray (void)
+BABL_SIMD_SUFFIX (babl_base_model_gray) (void)
 {
   components ();
   models ();
@@ -90,7 +90,6 @@ models (void)
     "linear",
     NULL);
 
-
   babl_model_new (
     "id", BABL_GRAY_ALPHA,
     babl_component_from_id (BABL_GRAY_LINEAR),
diff --git a/babl/base/model-rgb.c b/babl/base/model-rgb.c
index a3064ef..824665a 100644
--- a/babl/base/model-rgb.c
+++ b/babl/base/model-rgb.c
@@ -32,7 +32,7 @@ static void formats (void);
 static void init_single_precision (void);
 
 void
-babl_base_model_rgb (void)
+BABL_SIMD_SUFFIX (babl_base_model_rgb) (void)
 {
   components ();
   models ();
diff --git a/babl/base/model-ycbcr.c b/babl/base/model-ycbcr.c
index 64db6a2..e061298 100644
--- a/babl/base/model-ycbcr.c
+++ b/babl/base/model-ycbcr.c
@@ -34,7 +34,7 @@ static void conversions (void);
 static void formats (void);
 
 void
-babl_base_model_ycbcr (void)
+BABL_SIMD_SUFFIX (babl_base_model_ycbcr) (void)
 {
   components ();
   models ();
diff --git a/babl/base/pow-24.h b/babl/base/pow-24.h
index ecd1282..98e2374 100644
--- a/babl/base/pow-24.h
+++ b/babl/base/pow-24.h
@@ -98,7 +98,7 @@ static inline float babl_frexpf(float x, int *e)
 
         if (!ee) {
                 if (x) {
-                        x = babl_frexpf(x*18446744073709551616.0, e);
+                        x = babl_frexpf(x*18446744073709551616.0f, e);
                         *e -= 64;
                 } else *e = 0;
                 return x;
@@ -130,11 +130,12 @@ static inline float babl_frexpf(float x, int *e)
 static inline float
 init_newtonf (float x, float exponent, float c0, float c1, float c2)
 {
+#define fM_LN2 0.69314718055994530942f
     int iexp = 0;
     float y = babl_frexpf(x, &iexp);
     y = 2*y+(iexp-2);
-    c1 *= M_LN2*exponent;
-    c2 *= M_LN2*M_LN2*exponent*exponent;
+    c1 *= fM_LN2*exponent;
+    c2 *= fM_LN2*fM_LN2*exponent*exponent;
     return y = c0 + c1*y + c2*y*y;
 }
 
diff --git a/babl/base/type-float.c b/babl/base/type-float.c
index 5b03b3f..9517831 100644
--- a/babl/base/type-float.c
+++ b/babl/base/type-float.c
@@ -83,7 +83,7 @@ convert_float_float (const Babl *babl,
 
 
 void
-babl_base_type_float (void)
+BABL_SIMD_SUFFIX (babl_base_type_float) (void)
 {
   babl_type_new (
     "float",
diff --git a/babl/base/type-half.c b/babl/base/type-half.c
index 862d662..a146185 100644
--- a/babl/base/type-half.c
+++ b/babl/base/type-half.c
@@ -395,7 +395,7 @@ convert_half_float (BablConversion *conversion,
 
 
 void
-babl_base_type_half (void)
+BABL_SIMD_SUFFIX (babl_base_type_half) (void)
 {
   babl_type_new (
     "half",
diff --git a/babl/base/type-u15.c b/babl/base/type-u15.c
index ea35453..7224c63 100644
--- a/babl/base/type-u15.c
+++ b/babl/base/type-u15.c
@@ -198,7 +198,7 @@ convert_u15_float_scaled (BablConversion *conversion,
 MAKE_CONVERSIONS_float (u15, 0.0, 1.0, 0, (1<<15))
 
 void
-babl_base_type_u15 (void)
+BABL_SIMD_SUFFIX (babl_base_type_u15) (void)
 {
   babl_hmpf_on_name_lookups--;
   babl_type_new (
diff --git a/babl/base/type-u16.c b/babl/base/type-u16.c
index c5a41dc..e7ab936 100644
--- a/babl/base/type-u16.c
+++ b/babl/base/type-u16.c
@@ -196,7 +196,7 @@ MAKE_CONVERSIONS_float (u16, 0.0, 1.0, 0, UINT16_MAX)
 
 
 void
-babl_base_type_u16 (void)
+BABL_SIMD_SUFFIX (babl_base_type_u16) (void)
 {
   babl_type_new (
     "u16",
diff --git a/babl/base/type-u32.c b/babl/base/type-u32.c
index 48b1506..7d07ff1 100644
--- a/babl/base/type-u32.c
+++ b/babl/base/type-u32.c
@@ -69,7 +69,7 @@ convert_u32_double_scaled (BablConversion *c,
 {
   while (n--)
     {
-      int    u32val = *(uint32_t *) src;
+      uint32_t    u32val = *(uint32_t *) src;
       double dval;
 
       if (u32val < min)
@@ -154,7 +154,7 @@ convert_u32_float_scaled (BablConversion *c,
 {
   while (n--)
     {
-      int    u32val = *(uint32_t *) src;
+      uint32_t u32val = *(uint32_t *) src;
       float dval;
 
       if (u32val < min)
@@ -196,7 +196,7 @@ MAKE_CONVERSIONS_float(u32, 0.0, 1.0, 0, UINT32_MAX)
 
 
 void
-babl_base_type_u32 (void)
+BABL_SIMD_SUFFIX (babl_base_type_u32) (void)
 {
   babl_type_new (
     "u32",
diff --git a/babl/base/type-u8.c b/babl/base/type-u8.c
index d41d5e0..9abbf67 100644
--- a/babl/base/type-u8.c
+++ b/babl/base/type-u8.c
@@ -202,7 +202,7 @@ MAKE_CONVERSIONS_float (u8_chroma, -0.5, 0.5, 16, 240)
 
 
 void
-babl_base_type_u8 (void)
+BABL_SIMD_SUFFIX (babl_base_type_u8) (void)
 {
   babl_type_new (
     "u8",
diff --git a/babl/base/util.h b/babl/base/util.h
index aba9c61..0d50363 100644
--- a/babl/base/util.h
+++ b/babl/base/util.h
@@ -50,23 +50,17 @@
 static inline double
 babl_epsilon_for_zero (double value)
 {
- if (value <=  BABL_ALPHA_FLOOR &&
-     value >= -BABL_ALPHA_FLOOR)
- {
-   return BABL_ALPHA_FLOOR;
- }
- return value;
+   return value * (value >  BABL_ALPHA_FLOOR || value < -BABL_ALPHA_FLOOR) +
+          BABL_ALPHA_FLOOR * (value <=  BABL_ALPHA_FLOOR &&
+                              value >= -BABL_ALPHA_FLOOR);
 }
 
 static inline float
 babl_epsilon_for_zero_float (float value)
 {
- if (value <= BABL_ALPHA_FLOOR_F &&
-     value >= -BABL_ALPHA_FLOOR_F)
- {
-   return BABL_ALPHA_FLOOR_F;
- }
- return value;
+   return value * (value >  BABL_ALPHA_FLOOR_F || value < -BABL_ALPHA_FLOOR_F) +
+          BABL_ALPHA_FLOOR_F * (value <=  BABL_ALPHA_FLOOR_F &&
+                              value >= -BABL_ALPHA_FLOOR_F);
 }