Record babl (1:0.1.106-2) in archive suite sid

author: Jeremy Bícha <jbicha@ubuntu.com> 2023-06-12 18:06:10 +0100
committer: Jeremy Bícha <jbicha@ubuntu.com> 2023-06-12 18:06:10 +0100
commit: b996e42137121c616e778befb4aab16bfc633d7b (patch)
tree: d2ed9b4a33ee787f2dc82964c43d45447766c7d6
parent: ca2ad4d784f40fc6a4afde8ec0b46cb92ad72176 (diff)
parent: 6c53911389104733e23c52f55d002d11ed6b9458 (diff)
130 files changed, 6216 insertions, 2224 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a3523b7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+# Meson
+_build
diff --git a/.gitlab-ci.yml-ignored b/.gitlab-ci.yml-ignored
new file mode 100644
index 0000000..3bb72ab
--- /dev/null
+++ b/.gitlab-ci.yml-ignored
@@ -0,0 +1,48 @@
+cache:
+  paths:
+    - _pacman_cache
+
+.build:
+  stage: build
+  image: archlinux:latest
+  variables:
+    PACMAN_CACHE: $CI_PROJECT_DIR/_pacman_cache
+  artifacts:
+    when: always
+    paths:
+      - _build/meson-logs
+  before_script:
+    - pacman -Syu --noconfirm --needed --cachedir $PACMAN_CACHE --ignore glibc
+        base-devel
+        git
+        gobject-introspection
+        meson
+        vala
+        pacman-contrib
+        ${EXTRA_PKGS}
+  script:
+    - meson _build
+        -D enable-f16c=true
+        -D enable-mmx=true
+        -D enable-sse=true
+        -D enable-sse2=true
+        -D enable-sse4_1=true
+        -D with-docs=true
+        ${EXTRA_OPTIONS}
+    - ninja -C _build
+    - ninja -C _build test
+  after_script:
+    # Remove all cached packages but the latest version
+    #- paccache -r -k1 --cachedir $PACMAN_CACHE
+
+latest-lcms:
+  extends: .build
+  variables:
+    EXTRA_OPTIONS : "-Dwith-lcms=true"
+    EXTRA_PKGS: "lcms2"
+
+latest-nolcms:
+  extends: .build
+  variables:
+    EXTRA_OPTIONS : "-Dwith-lcms=false"
+    EXTRA_PKGS: ""
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..07d7ffe
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,6 @@
+{
+    "files.associations": {
+        "*.jnlp": "xml",
+        "babl-internal.h": "c"
+    }
+}
+\ No newline at end of file
diff --git a/NEWS b/NEWS
index d02e5c3..875fc85 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,59 @@ The NEWS file in the babl source tree is the source location for
 the news section both in the README and the webpage.
                                                                           -->
 
+2023-05-05 babl-0.1.106                                             </dt><dd>
+Disable LUTs on big-endian, fix to 1bpp->4bpp LUTs, faster startup by caching
+balanced RGB to XYZ matrices.
+                                                                    </dd><dt>
+2023-04-21 babl-0.1.104                                             </dt><dd>
+LUT code-paths re-enabled, some array overflow proofing.
+                                                                    </dd><dt>
+2023-02-25 babl-0.1.102                                             </dt><dd>
+Brown paper bag release - LUT code-paths now disabled by default.
+                                                                    </dd><dt>
+2023-02-20 babl-0.1.100                                             </dt><dd>
+Stop double processing with LUT+normal fishes.
+Support for non-ASCII characters in file paths on windows. Improved wrap build
+support. 
+                                                                    </dd><dt>
+2022-11-13 babl-0.1.98                                              </dt><dd>
+More robust bounds protection in ICC handling, avoid garbage collecting lookup
+tables in-line with processing.
+                                                                    </dd><dt>
+2022-08-23 babl-0.1.96                                              </dt><dd>
+Minor changes from 0.1.94, fixing build.
+                                                                    </dd><dt>
+2022-08-21 babl-0.1.94                                              </dt><dd>
+Fix of crash on non-aligned data for SIMD, add commandline tool, improve vala
+compatibility of introspection info.
+                                                                    </dd><dt>
+2022-03-22 babl-0.1.92                                              </dt><dd>
+Fixes for avoiding load of wrong SIMD extensions.
+                                                                    </dd><dt>
+2022-02-21 babl-0.1.90                                              </dt><dd>
+Automatic LUT creation for conversions from &lt;24bpp of colorant components,
+SIMD builds for x86_64 micro-architecture levels and ARM neon, pre-defined
+CIE Lab u8 and CIE Lab u16 formats. Fix bug of unlocking a non-locked mutex,
+which started failing on some platforms.
+                                                                    </dd><dt>
+2021-07-01 babl-0.1.88                                              </dt><dd>
+Consistency fixes to format names, in particular for palettized formats.
+                                                                    </dd><dt>
+2021-02-26 babl-0.1.86                                              </dt><dd>
+input-class (scanner/RGB) ICC profiles are valid for creating RGB spaces.
+improved thread safety for babl_space_from_icc
+build: allow using babl as a subproject in other meson projects.
+                                                                    </dd><dt>
+2020-12-17 babl-0.1.84                                              </dt><dd>
+Fixed bug in caching of CMYK ICC profiles.
+                                                                    </dd><dt>
+2020-10-04 babl-0.1.82                                              </dt><dd>
+Handle the parametric ICCv4 types that are not strictly the same type as core
+sRGB curve.
+                                                                    </dd><dt>
+2020-08-02 babl-0.1.80                                              </dt><dd>
+meson build cleanups, depend on meson 0.54
+                                                                    </dd><dt>
 2020-06-07 babl-0.1.78                                              </dt><dd>
 Bugfix release - mutex and atomicity related issues.
                                                                     </dd><dt>
diff --git a/babl/babl-cache.c b/babl/babl-cache.c
index 63ae19e..efdce3f 100644
--- a/babl/babl-cache.c
+++ b/babl/babl-cache.c
@@ -35,72 +35,112 @@
 static int
 mk_ancestry_iter (const char *path)
 {
-  char copy[4096];
-  strncpy (copy, path, 4096);
-  copy[sizeof (copy) - 1] = '\0';
-  if (strrchr (copy, '/'))
+  char *copy = babl_strdup (path);
+  char *rchr = NULL;
+  int result = 0;
+
+  if (!copy)
+    return -1;
+
+  rchr = strrchr (copy, '/');
+  if (rchr)
     {
-      *strrchr (copy, '/') = '\0';
+      *rchr = '\0';
+
       if (copy[0])
         {
-          struct stat stat_buf;
-          if ( ! (stat (copy, &stat_buf)==0 && S_ISDIR(stat_buf.st_mode)))
-            {
-              if (mk_ancestry_iter (copy) != 0)
-                return -1;
-#ifndef _WIN32
-              return mkdir (copy, S_IRWXU);
-#else
-              return mkdir (copy);
-#endif
-            }
+          BablStat stat_buf;
+          if ( ! (_babl_stat (copy, &stat_buf)==0 && S_ISDIR(stat_buf.st_mode)))
+            result = mk_ancestry_iter (copy) == 0 ? _babl_mkdir (copy, S_IRWXU) : -1;
         }
     }
-  return 0;
+
+  babl_free (copy);
+  return result;
 }
 
 static int
 mk_ancestry (const char *path)
 {
-  char copy[4096];
-  strncpy (copy, path, 4096);
-  copy[sizeof (copy) - 1] = '\0';
+  char *copy = babl_strdup (path);
+  int result = 0;
+
+  if (!copy)
+    return -1;
+
 #ifdef _WIN32
   for (char *c = copy; *c; c++)
     if (*c == '\\')
       *c = '/';
 #endif
-  return mk_ancestry_iter (copy);
+
+  result = mk_ancestry_iter (copy);
+
+  babl_free (copy);
+  return result;
 }
 
-static const char *
+static char *
 fish_cache_path (void)
 {
-  struct stat stat_buf;
-  static char path[4096];
+  char *path = NULL;
+  char buf[4096];
+  BablStat stat_buf;
 
-  strncpy (path, FALLBACK_CACHE_PATH, 4096);
-  path[sizeof (path) - 1] = '\0';
 #ifndef _WIN32
+
+  strncpy (buf, FALLBACK_CACHE_PATH, 4096);
+  buf[sizeof (buf) - 1] = '\0';
+
   if (getenv ("XDG_CACHE_HOME"))
-    snprintf (path, sizeof (path), "%s/babl/babl-fishes", getenv("XDG_CACHE_HOME"));
+    snprintf (buf, sizeof (buf), "%s/babl/babl-fishes", getenv("XDG_CACHE_HOME"));
   else if (getenv ("HOME"))
-    snprintf (path, sizeof (path), "%s/.cache/babl/babl-fishes", getenv("HOME"));
+    snprintf (buf, sizeof (buf), "%s/.cache/babl/babl-fishes", getenv("HOME"));
+
+  path = babl_strdup (buf);
+
 #else
-{
-  char win32path[4096];
-  if (SHGetFolderPathA (NULL, CSIDL_LOCAL_APPDATA, NULL, SHGFP_TYPE_CURRENT, win32path) == S_OK)
-    snprintf (path, sizeof (path), "%s\\%s\\babl-fishes.txt", win32path, BABL_LIBRARY);
+
+  wchar_t *appdata_utf16 = NULL;
+
+  if (SHGetKnownFolderPath (&FOLDERID_LocalAppData, KF_FLAG_DEFAULT, NULL, &appdata_utf16) == S_OK)
+    {
+      char *appdata = babl_convert_utf16_to_utf8 (appdata_utf16);
+
+      if (appdata && appdata[0])
+        {
+          const char *fmt = "%s\\%s\\babl-fishes.txt";
+          size_t sz = add_check_overflow (3, strlen (fmt), strlen (appdata), strlen (BABL_LIBRARY));
+
+          if (sz > 0 && (path = babl_malloc (sz)) != NULL)
+            _snprintf_s (path, sz, sz, fmt, appdata, BABL_LIBRARY);
+        }
+
+      if (appdata)
+        babl_free (appdata);
+    }
   else if (getenv ("TEMP"))
-    snprintf (path, sizeof (path), "%s\\babl-fishes.txt", getenv("TEMP"));
-}
+    {
+      snprintf (buf, sizeof (buf), "%s\\babl-fishes.txt", getenv("TEMP"));
+      path = babl_strdup (buf);
+    }
+
+  if (appdata_utf16)
+    {
+      CoTaskMemFree (appdata_utf16);
+      appdata_utf16 = NULL;
+    }
+
 #endif
 
-  if (stat (path, &stat_buf)==0 && S_ISREG(stat_buf.st_mode))
+  if (!path)
+    return babl_strdup (FALLBACK_CACHE_PATH);
+
+  if (_babl_stat (path, &stat_buf) == 0 && S_ISREG(stat_buf.st_mode))
     return path;
 
   if (mk_ancestry (path) != 0)
-    return FALLBACK_CACHE_PATH;
+    return babl_strdup (FALLBACK_CACHE_PATH);
 
   return path;
 }
@@ -177,23 +217,23 @@ cache_header (void)
   return buf;
 }
 
-void 
+void
 babl_store_db (void)
 {
   BablDb *db = babl_fish_db ();
-  int i;
+  char *cache_path = fish_cache_path ();
   char *tmpp = calloc(8000,1);
-  FILE *dbfile;
+  FILE *dbfile = NULL;
+  int i;
+
+  if (!cache_path || !tmpp)
+    goto cleanup;
 
-  if (!tmpp)
-    return;
-  snprintf (tmpp, 8000, "%s~", fish_cache_path ());
-  dbfile  = fopen (tmpp, "w");
+  snprintf (tmpp, 8000, "%s~", cache_path);
+  dbfile  = _babl_fopen (tmpp, "w");
   if (!dbfile)
-  {
-    free (tmpp);
-    return;
-  }
+    goto cleanup;
+
   fprintf (dbfile, "%s\n", cache_header ());
 
   /* sort the list of fishes by usage, making next run more efficient -
@@ -209,13 +249,24 @@ babl_store_db (void)
     if (babl_fish_serialize (fish, tmp, 4096))
       fprintf (dbfile, "%s----\n", tmp);
   }
+
   fclose (dbfile);
+  dbfile = NULL;
 
 #ifdef _WIN32
-  remove (fish_cache_path ());
+  _babl_remove (cache_path);
 #endif
-  rename (tmpp, fish_cache_path());
-  free (tmpp);
+  _babl_rename (tmpp, cache_path);
+
+cleanup:
+  if (dbfile)
+    fclose (dbfile);
+
+  if (cache_path)
+    babl_free (cache_path);
+
+  if (tmpp)
+    free (tmpp);
 }
 
 int
@@ -230,7 +281,7 @@ _babl_fish_create_name (char       *buf,
 void 
 babl_init_db (void)
 {
-  const char *path = fish_cache_path ();
+  char *path = fish_cache_path ();
   long  length = -1;
   char  seps[] = "\n\r";
   Babl *babl   = NULL;
@@ -242,11 +293,11 @@ babl_init_db (void)
   time_t tim = time (NULL);
 
   if (getenv ("BABL_DEBUG_CONVERSIONS"))
-    return;
+    goto cleanup;
 
   _babl_file_get_contents (path, &contents, &length, NULL);
   if (!contents)
-    return;
+    goto cleanup;
 
   token = strtok_r (contents, seps, &tokp);
   while( token != NULL )
@@ -274,10 +325,7 @@ babl_init_db (void)
           /* if babl has changed in git .. drop whole cache */
           {
             if (strcmp ( token, cache_header ()))
-            {
-              free (contents);
-              return;
-            }
+              goto cleanup;
           }
           break;
         case '\t':
@@ -294,8 +342,7 @@ babl_init_db (void)
             {
               fprintf (stderr, "%s:%i: loading of cache failed\n",
                               __FUNCTION__, __LINE__);
-              free (contents);
-              return;
+              goto cleanup;
             }
 
             if (strstr (token, "[reference]"))
@@ -334,7 +381,6 @@ babl_init_db (void)
               babl->fish.source               = from_format;
               babl->fish.destination          = to_format;
               babl->fish_path.conversion_list = babl_list_init_with_size (10);
-              _babl_fish_prepare_bpp (babl);
               _babl_fish_rig_dispatch (babl);
             }
 
@@ -348,7 +394,10 @@ babl_init_db (void)
               else if (!strncmp (token2, "cost=", 5))
               {
                 if (babl->class_type == BABL_FISH_PATH)
+                {
                   babl->fish_path.cost = babl_parse_double (token2 + 5);
+                  _babl_fish_prepare_bpp (babl);
+                }
               }
               else if (!strncmp (token2, "pixels=", 7))
               {
@@ -382,6 +431,11 @@ babl_init_db (void)
       }
       token = strtok_r (NULL, seps, &tokp);
     }
+
+cleanup:
   if (contents)
     free (contents);
+
+  if (path)
+    babl_free (path);
 }
diff --git a/babl/babl-classes.h b/babl/babl-classes.h
index c25172a..cf03447 100644
--- a/babl/babl-classes.h
+++ b/babl/babl-classes.h
@@ -59,7 +59,7 @@ enum {
 
 #include "babl-type.h"
 #include "babl-sampling.h"
-#include "babl-trc.h"
+#include "base/babl-trc.h"
 #include "babl-space.h"
 #include "babl-component.h"
 #include "babl-model.h"
diff --git a/babl/babl-core.c b/babl/babl-core.c
index d78b5e5..2bd36c7 100644
--- a/babl/babl-core.c
+++ b/babl/babl-core.c
@@ -162,3 +162,184 @@ babl_core_init (void)
     NULL
   );
 }
+
+
+/////////////////// temporary here
+///////////////////
+
+const Babl * 
+babl_trc_lut (const char *name, 
+              int         n, 
+              float      *entries)
+{
+  return babl_trc_new (name, BABL_TRC_LUT, 0, n, entries);
+}
+
+
+const Babl *
+babl_trc_formula_srgb (double g, 
+                       double a, 
+                       double b, 
+                       double c, 
+                       double d,
+                       double e,
+                       double f)
+{
+  char name[128];
+  int i;
+  float params[7]={g, a, b, c, d, e, f};
+
+  if (fabs (g - 2.400) < 0.01 &&
+      fabs (a - 0.947) < 0.01 &&
+      fabs (b - 0.052) < 0.01 &&
+      fabs (c - 0.077) < 0.01 &&
+      fabs (d - 0.040) < 0.01 &&
+      fabs (e - 0.000) < 0.01 &&
+      fabs (f - 0.000) < 0.01
+      )
+    return babl_trc ("sRGB");
+
+  snprintf (name, sizeof (name), "%.6f %.6f %.4f %.4f %.4f %.4f %.4f", g, a, b, c, d, e, f);
+  for (i = 0; name[i]; i++)
+    if (name[i] == ',') name[i] = '.';
+  while (name[strlen(name)-1]=='0')
+    name[strlen(name)-1]='\0';
+  return babl_trc_new (name, BABL_TRC_FORMULA_SRGB, g, 0, params);
+}
+
+const Babl *
+babl_trc_formula_cie (double g, 
+                      double a, 
+                      double b, 
+                      double c)
+{
+  char name[128];
+  int i;
+  float params[4]={g, a, b, c};
+
+  snprintf (name, sizeof (name), "%.6f %.6f %.4f %.4f", g, a, b, c);
+  for (i = 0; name[i]; i++)
+    if (name[i] == ',') name[i] = '.';
+  while (name[strlen(name)-1]=='0')
+    name[strlen(name)-1]='\0';
+  return babl_trc_new (name, BABL_TRC_FORMULA_CIE, g, 0, params);
+}
+
+
+const Babl *
+babl_trc_gamma (double gamma)
+{
+  char name[32];
+  int i;
+  if (fabs (gamma - 1.0) < 0.01)
+     return babl_trc_new ("linear", BABL_TRC_LINEAR, 1.0, 0, NULL);
+
+  snprintf (name, sizeof (name), "%.6f", gamma);
+  for (i = 0; name[i]; i++)
+    if (name[i] == ',') name[i] = '.';
+  while (name[strlen(name)-1]=='0')
+    name[strlen(name)-1]='\0';
+  return babl_trc_new (name, BABL_TRC_FORMULA_GAMMA, gamma, 0, NULL);
+}
+
+void
+babl_trc_class_init (void)
+{
+  babl_trc_new ("sRGB",  BABL_TRC_SRGB, 2.2, 0, NULL);
+  babl_trc_gamma (2.2);
+  babl_trc_gamma (1.8);
+  babl_trc_gamma (1.0);
+  babl_trc_new ("linear", BABL_TRC_LINEAR, 1.0, 0, NULL);
+}
+
+#if 0
+float 
+babl_trc_from_linear (const Babl *trc_, 
+                      float       value)
+{
+  return babl_trc_from_linear (trc_, value);
+}
+
+float 
+babl_trc_to_linear (const Babl *trc_,
+                    float       value)
+{
+  return babl_trc_to_linear (trc_, value);
+}
+#endif
+
+static int
+babl_lut_match_gamma (float *lut, 
+                      int    lut_size, 
+                      float  gamma)
+{
+  int match = 1;
+  int i;
+  if (lut_size > 1024)
+  {
+    for (i = 0; match && i < lut_size; i++)
+    {
+      if (fabs (lut[i] - pow ((i / (lut_size-1.0)), gamma)) > 0.0001)
+        match = 0;
+    }
+  }
+  else
+  {
+    for (i = 0; match && i < lut_size; i++)
+    {
+      if (fabs (lut[i] - pow ((i / (lut_size-1.0)), gamma)) > 0.001)
+        match = 0;
+    }
+  }
+  return match;
+}
+
+const Babl *
+babl_trc_lut_find (float *lut, 
+                   int    lut_size)
+{
+  int i;
+  int match = 1;
+
+  /* look for linear match */
+  for (i = 0; match && i < lut_size; i++)
+    if (fabs (lut[i] - i / (lut_size-1.0)) > 0.015)
+      match = 0;
+  if (match)
+    return babl_trc_gamma (1.0);
+
+  /* look for sRGB match: */
+  match = 1;
+  if (lut_size > 1024)
+  {
+    for (i = 0; match && i < lut_size; i++)
+    {
+      if (fabs (lut[i] - gamma_2_2_to_linear (i / (lut_size-1.0))) > 0.0001)
+        match = 0;
+    }
+  }
+  else
+  {
+    for (i = 0; match && i < lut_size; i++)
+    {
+      if (fabs (lut[i] - gamma_2_2_to_linear (i / (lut_size-1.0))) > 0.001)
+        match = 0;
+    }
+  }
+  if (match)
+    return babl_trc ("sRGB");
+
+  if (babl_lut_match_gamma (lut, lut_size, 2.2))
+    return babl_trc_gamma(2.2);
+
+  if (babl_lut_match_gamma (lut, lut_size, 1.8))
+    return babl_trc_gamma(1.8);
+
+  return NULL;
+}
+
+const Babl * babl_trc (const char *name)
+{
+  return babl_trc_lookup_by_name (name);
+}
+
diff --git a/babl/babl-cpuaccel.c b/babl/babl-cpuaccel.c
index ef26fa5..2cfff1b 100644
--- a/babl/babl-cpuaccel.c
+++ b/babl/babl-cpuaccel.c
@@ -14,11 +14,10 @@
  * You should have received a copy of the GNU Lesser General
  * Public License along with this library; if not, see
  * <https://www.gnu.org/licenses/>.
+ *
+ * (c) Manish Singh, Aaron Holtzman, Jan Heller, Ell, Øyvind Kolås
  */
 
-/*
- * x86 bits Copyright (C) Manish Singh <yosh@gimp.org>
- */
 
 /*
  * PPC CPU acceleration detection was taken from DirectFB but seems to be
@@ -78,7 +77,6 @@ babl_cpu_accel_set_use (gboolean use)
 
 #define HAVE_ACCEL 1
 
-
 typedef enum
 {
   ARCH_X86_VENDOR_NONE,
@@ -117,15 +115,26 @@ enum
 {
   ARCH_X86_INTEL_FEATURE_PNI      = 1 << 0,
   ARCH_X86_INTEL_FEATURE_SSSE3    = 1 << 9,
+  ARCH_X86_INTEL_FEATURE_FMA      = 1 << 12,
   ARCH_X86_INTEL_FEATURE_SSE4_1   = 1 << 19,
   ARCH_X86_INTEL_FEATURE_SSE4_2   = 1 << 20,
+  ARCH_X86_INTEL_FEATURE_MOVBE    = 1 << 22,
+  ARCH_X86_INTEL_FEATURE_POPCNT   = 1 << 23,
+  ARCH_X86_INTEL_FEATURE_XSAVE    = 1 << 26,
+  ARCH_X86_INTEL_FEATURE_OSXSAVE  = 1 << 27,
   ARCH_X86_INTEL_FEATURE_AVX      = 1 << 28,
   ARCH_X86_INTEL_FEATURE_F16C     = 1 << 29,
 
-  /* extended features */
-  ARCH_X86_INTEL_FEATURE_AVX2     = 1 << 5
+ // extended features
+
+  ARCH_X86_INTEL_FEATURE_BMI1     = 1 << 3,
+  ARCH_X86_INTEL_FEATURE_BMI2     = 1 << 8,
+  ARCH_X86_INTEL_FEATURE_AVX2     = 1 << 5,
 };
 
+
+/* x86 asm bit Copyright (C) Manish Singh <yosh@gimp.org>
+ */
 #if !defined(ARCH_X86_64) && (defined(PIC) || defined(__PIC__))
 #define cpuid(op,eax,ebx,ecx,edx)  \
   __asm__ ("movl %%ebx, %%esi\n\t" \
@@ -136,7 +145,7 @@ enum
              "=S" (ebx),           \
              "=c" (ecx),           \
              "=d" (edx)            \
-           : "0" (op))
+           : "0" (op), "2" (0))
 #else
 #define cpuid(op,eax,ebx,ecx,edx)  \
   __asm__ ("xor %%ecx, %%ecx\n\t"  \
@@ -145,7 +154,7 @@ enum
              "=b" (ebx),           \
              "=c" (ecx),           \
              "=d" (edx)            \
-           : "0" (op))
+           : "0" (op), "2" (0))
 #endif
 
 
@@ -256,19 +265,43 @@ arch_accel_intel (void)
     if (ecx & ARCH_X86_INTEL_FEATURE_SSE4_1)
       caps |= BABL_CPU_ACCEL_X86_SSE4_1;
 
+    if (ecx & ARCH_X86_INTEL_FEATURE_SSE4_2)
+      caps |= BABL_CPU_ACCEL_X86_SSE4_2;
+
+    if (ecx & ARCH_X86_INTEL_FEATURE_AVX)
+      caps |= BABL_CPU_ACCEL_X86_AVX;
+
+    if (ecx & ARCH_X86_INTEL_FEATURE_POPCNT)
+      caps |= BABL_CPU_ACCEL_X86_POPCNT;
+
+    if (ecx & ARCH_X86_INTEL_FEATURE_XSAVE)
+      caps |= BABL_CPU_ACCEL_X86_XSAVE;
+
+    if (ecx & ARCH_X86_INTEL_FEATURE_OSXSAVE)
+      caps |= BABL_CPU_ACCEL_X86_OSXSAVE;
+
+    if (ecx & ARCH_X86_INTEL_FEATURE_FMA)
+      caps |= BABL_CPU_ACCEL_X86_FMA;
+
     if (ecx & ARCH_X86_INTEL_FEATURE_F16C)
       caps |= BABL_CPU_ACCEL_X86_F16C;
 
-    cpuid (0, eax, ebx, ecx, edx);
+    if (ecx & ARCH_X86_INTEL_FEATURE_MOVBE)
+      caps |= BABL_CPU_ACCEL_X86_MOVBE;
 
+    cpuid (0, eax, ebx, ecx, edx);
     if (eax >= 7)
-      {
-        cpuid (7, eax, ebx, ecx, edx);
-
-        if (ebx & ARCH_X86_INTEL_FEATURE_AVX2)
-          caps |= BABL_CPU_ACCEL_X86_AVX2;
-      }
+    {
+      cpuid (7, eax, ebx, ecx, edx);
+      if (ebx & ARCH_X86_INTEL_FEATURE_AVX2)
+        caps |= BABL_CPU_ACCEL_X86_AVX2;
+      if (ebx & ARCH_X86_INTEL_FEATURE_BMI1)
+        caps |= BABL_CPU_ACCEL_X86_BMI1;
+      if (ebx & ARCH_X86_INTEL_FEATURE_BMI2)
+        caps |= BABL_CPU_ACCEL_X86_BMI2;
+    }
 #endif /* USE_SSE */
+
   }
 #endif /* USE_MMX */
 
@@ -517,6 +550,41 @@ arch_accel (void)
 
 #endif /* ARCH_PPC && USE_ALTIVEC */
 
+#if defined(ARCH_ARM)
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <elf.h>
+
+#define HAVE_ACCEL 1
+
+static guint32
+arch_accel (void)
+{
+  /* TODO : add or hardcode the other ways it can be on arm, where
+   *        this info comes from the system and not from running cpu
+   *        instructions
+   */
+  int has_neon = 0;
+  int fd = open ("/proc/self/auxv", O_RDONLY);
+  Elf32_auxv_t auxv;
+  if (fd >= 0)
+  {
+    while (read (fd, &auxv, sizeof (Elf32_auxv_t)) == sizeof (Elf32_auxv_t))
+    {
+      if (auxv.a_type == AT_HWCAP)
+      {
+        if (auxv.a_un.a_val & 4096)
+          has_neon = 1;
+      }
+    }
+    close (fd);
+  }
+  return has_neon?BABL_CPU_ACCEL_ARM_NEON:0;
+}
+
+#endif /* ARCH_ARM  */
 
 static BablCpuAccelFlags
 cpu_accel (void)
diff --git a/babl/babl-cpuaccel.h b/babl/babl-cpuaccel.h
index b8a6855..133d138 100644
--- a/babl/babl-cpuaccel.h
+++ b/babl/babl-cpuaccel.h
@@ -24,25 +24,55 @@ typedef enum
   BABL_CPU_ACCEL_NONE        = 0x0,
 
   /* x86 accelerations */
-  BABL_CPU_ACCEL_X86_MMX     = 0x01000000,
+  BABL_CPU_ACCEL_X86_MMX     = 0x80000000,
   BABL_CPU_ACCEL_X86_3DNOW   = 0x40000000,
   BABL_CPU_ACCEL_X86_MMXEXT  = 0x20000000,
   BABL_CPU_ACCEL_X86_SSE     = 0x10000000,
   BABL_CPU_ACCEL_X86_SSE2    = 0x08000000,
-  BABL_CPU_ACCEL_X86_SSE3    = 0x02000000,
-  BABL_CPU_ACCEL_X86_SSSE3   = 0x00800000,
-  BABL_CPU_ACCEL_X86_SSE4_1  = 0x00400000,
-  /* BABL_CPU_ACCEL_X86_SSE4_2  = 0x00200000, */
-  /* BABL_CPU_ACCEL_X86_AVX     = 0x00080000, */
+  BABL_CPU_ACCEL_X86_SSE3    = 0x04000000,
+  BABL_CPU_ACCEL_X86_SSSE3   = 0x02000000,
+  BABL_CPU_ACCEL_X86_SSE4_1  = 0x01000000,
+  BABL_CPU_ACCEL_X86_SSE4_2  = 0x00800000,
+  BABL_CPU_ACCEL_X86_AVX     = 0x00400000,
+  BABL_CPU_ACCEL_X86_POPCNT  = 0x00200000,
+  BABL_CPU_ACCEL_X86_FMA     = 0x00100000,
+  BABL_CPU_ACCEL_X86_MOVBE   = 0x00080000,
   BABL_CPU_ACCEL_X86_F16C    = 0x00040000,
-  BABL_CPU_ACCEL_X86_AVX2    = 0x00020000,
+  BABL_CPU_ACCEL_X86_XSAVE   = 0x00020000,
+  BABL_CPU_ACCEL_X86_OSXSAVE = 0x00010000,
+  BABL_CPU_ACCEL_X86_BMI1    = 0x00008000,
+  BABL_CPU_ACCEL_X86_BMI2    = 0x00004000,
+  BABL_CPU_ACCEL_X86_AVX2    = 0x00002000,
+
+  BABL_CPU_ACCEL_X86_64_V2 =
+    (BABL_CPU_ACCEL_X86_POPCNT|
+     BABL_CPU_ACCEL_X86_SSE4_1|
+     BABL_CPU_ACCEL_X86_SSE4_2|
+     BABL_CPU_ACCEL_X86_SSSE3),
+
+  BABL_CPU_ACCEL_X86_64_V3 =
+    (BABL_CPU_ACCEL_X86_64_V2|
+     BABL_CPU_ACCEL_X86_BMI1|
+     BABL_CPU_ACCEL_X86_BMI2|
+     BABL_CPU_ACCEL_X86_AVX|
+     BABL_CPU_ACCEL_X86_FMA|
+     BABL_CPU_ACCEL_X86_F16C|
+     BABL_CPU_ACCEL_X86_AVX2|
+     BABL_CPU_ACCEL_X86_OSXSAVE|
+     BABL_CPU_ACCEL_X86_MOVBE),
 
   /* powerpc accelerations */
-  BABL_CPU_ACCEL_PPC_ALTIVEC = 0x04000000,
-  BABL_CPU_ACCEL_X86_64      = 0x00100000
+  BABL_CPU_ACCEL_PPC_ALTIVEC = 0x00000010,
+
+  /* arm accelerations */
+  BABL_CPU_ACCEL_ARM_NEON    = 0x00000020,
+
+  /* x86_64 arch */
+  BABL_CPU_ACCEL_X86_64      = 0x00000040
 } BablCpuAccelFlags;
 
 
+
 BablCpuAccelFlags  babl_cpu_accel_get_support (void);
 void               babl_cpu_accel_set_use     (unsigned int use);
 
diff --git a/babl/babl-extension.c b/babl/babl-extension.c
index 41edb8e..0d36eef 100644
--- a/babl/babl-extension.c
+++ b/babl/babl-extension.c
@@ -31,8 +31,9 @@
 #include "babl-internal.h"
 #include "babl-db.h"
 #include "babl-base.h"
+
 #include <string.h>
-#include <stdarg.h>
+
 
 static Babl *babl_extension_current_extender = NULL;
 
@@ -107,7 +108,9 @@ babl_extension_base (void)
     if (ret != babl)
       babl_free (babl);
     else
+    {
       babl_base_init ();
+    }
     babl = ret;
   }
   babl_set_extender (NULL);
@@ -172,7 +175,22 @@ dlsym (HLIB        handle,
 #include <windows.h>
 #define HLIB    HINSTANCE
 
-#define dlopen(a, b)    LoadLibrary (a)
+static HLIB
+LoadLibraryWrap (const char *filename)
+{
+  wchar_t *filename_utf16 = babl_convert_utf8_to_utf16 (filename);
+  HLIB module = NULL;
+
+  if (!filename_utf16)
+    return NULL;
+
+  module = LoadLibraryW (filename_utf16);
+
+  babl_free (filename_utf16);
+  return module;
+}
+
+#define dlopen(a, b)    LoadLibraryWrap (a)
 #define dlsym(l, s)     GetProcAddress (l, s)
 #define dlclose(l)      FreeLibrary (l)
 #define dlerror()       GetLastError ()
@@ -242,39 +260,53 @@ babl_extension_load (const char *path)
     }
 }
 
+struct dir_foreach_ctx
+{
+  const char **exclusion_patterns;
+};
+
 static void
-babl_extension_load_dir (const char *base_path)
+dir_foreach (const char *base_path,
+             const char *entry,
+             void       *user_data)
 {
-  DIR *dir;
+  struct dir_foreach_ctx *ctx = (struct dir_foreach_ctx*) user_data;
 
-  if ((dir = opendir (base_path)))
+  if (entry[0] != '.')
     {
-      struct  dirent *dentry;
+      char       *path = NULL;
+      char       *extension;
+
+      path = babl_strcat (path, base_path);
+      path = babl_strcat (path, BABL_DIR_SEPARATOR);
+      path = babl_strcat (path, entry);
 
-      while ((dentry = readdir (dir)) != NULL)
+      if ((extension = strrchr (entry, '.')) != NULL &&
+          !strcmp (extension, SHREXT))
         {
-          if (dentry->d_name[0] != '.')
-            {
-              char       *path = NULL;
-              char       *extension;
-
-              path = babl_strcat (path, base_path);
-              path = babl_strcat (path, BABL_DIR_SEPARATOR);
-              path = babl_strcat (path, dentry->d_name);
-
-              if ((extension = strrchr (dentry->d_name, '.')) != NULL &&
-                  !strcmp (extension, SHREXT))
-                {
-                  babl_extension_load (path);
-                }
-
-              babl_free (path);
-            }
+          int excluded = 0;
+          for (int i = 0; ctx->exclusion_patterns[i]; i++)
+            if (strstr (path, ctx->exclusion_patterns[i]))
+              excluded = 1;
+          if (!excluded)
+            babl_extension_load (path);
         }
-      closedir (dir);
+
+      babl_free (path);
     }
 }
 
+static void
+babl_extension_load_dir (const char *base_path,
+                         const char **exclusion_patterns)
+{
+  struct dir_foreach_ctx ctx;
+
+  ctx.exclusion_patterns = exclusion_patterns;
+
+  _babl_dir_foreach (base_path, dir_foreach, &ctx);
+}
+
 static char *
 expand_path (char *path)
 {
@@ -312,7 +344,8 @@ expand_path (char *path)
 /*  parse the provided colon seperated list of paths to search
  */
 void
-babl_extension_load_dir_list (const char *dir_list)
+babl_extension_load_dir_list (const char *dir_list,
+                              const char **exclusion_patterns)
 {
   int         eos = 0;
   const char *src;
@@ -329,13 +362,13 @@ babl_extension_load_dir_list (const char *dir_list)
         {
           case '\0':
             eos = 1;
-            /* don't break here, the path needs to be processed */
-
+            // the path needs to be processed.
+            // fall through
           case BABL_PATH_SEPARATOR:
           {
             char *expanded_path = expand_path (path);
             if (expanded_path) {
-                babl_extension_load_dir (expanded_path);
+                babl_extension_load_dir (expanded_path, exclusion_patterns);
                 babl_free (expanded_path);
             }
           }
diff --git a/babl/babl-extension.h b/babl/babl-extension.h
index 82e1d7e..50fb731 100644
--- a/babl/babl-extension.h
+++ b/babl/babl-extension.h
@@ -28,7 +28,8 @@ BABL_CLASS_DECLARE (extension);
  */
 
 const  Babl * babl_extension               (const char *name);
-void          babl_extension_load_dir_list (const char *dir_list);
+void          babl_extension_load_dir_list (const char *dir_list,
+                                            const char **exclusion_patterns);
 
 typedef struct
 {
diff --git a/babl/babl-fish-path.c b/babl/babl-fish-path.c
index b0396db..712f57a 100644
--- a/babl/babl-fish-path.c
+++ b/babl/babl-fish-path.c
@@ -32,8 +32,523 @@
 #define MIN(a, b) (((a) > (b)) ? (b) : (a))
 #endif
 
+static int enable_lut = 0;
+
+typedef struct GcContext {
+   long time;
+} GcContext;
+
+static float lut_unused_minutes_limit = 5.0;
+
+static int lut_info_level = 0;
+
+#define _LUT_LOG(level, ...) do{\
+     if (level <= lut_info_level)\
+       fprintf (stdout, __VA_ARGS__);\
+     fflush(NULL);\
+     }while(0)
+#define LUT_LOG(...) _LUT_LOG(1, __VA_ARGS__)
+#define LUT_INFO(...) _LUT_LOG(2, __VA_ARGS__)
+#define LUT_DETAIL(...) _LUT_LOG(3, __VA_ARGS__)
+
+static int gc_fishes (Babl *babl, void *userdata)
+{ /* callback handed to babl_fish_class_for_each(); userdata is a GcContext */
+  GcContext *context = userdata;
+  if (babl->class_type == BABL_FISH_PATH) /* only path fishes are examined */
+  {
+    if (babl->fish_path.u8_lut)
+    {
+      if (context->time - babl->fish_path.last_lut_use >
+          1000 * 1000 * 60 * lut_unused_minutes_limit) /* minutes scaled to ticks - presumably microseconds */
+      {
+        void *lut =babl->fish_path.u8_lut;
+        BABL(babl)->fish_path.u8_lut = NULL; /* unpublish the LUT before freeing it */
+        free (lut);
+        BABL(babl)->fish.pixels = 0; /* restart the pixel count for this fish */
+        LUT_LOG("freeing LUT %s to %s unused for >%.1f minutes\n",
+                babl_get_name (babl->conversion.source),
+                babl_get_name (babl->conversion.destination),
+                lut_unused_minutes_limit);
+      }
+      else if (lut_info_level >=4) /* diagnostics only: report LUTs still in use */
+      {
+        LUT_DETAIL("active LUT %s to %s  %8li pixels last used %.1f minutes ago\n",
+                babl_get_name (babl->conversion.source),
+                babl_get_name (babl->conversion.destination),
+                babl->fish.pixels,
+         (context->time - babl->fish_path.last_lut_use)/1000.0/1000.0/60.0);
+      }
+    }
+    else if (lut_info_level >= 4 && babl->fish.pixels) /* diagnostics for fishes without a LUT */
+    {
+        if (babl->fish_path.is_u8_color_conv)
+        LUT_DETAIL("potential LUT %s to %s  %8li pixels\n",
+                babl_get_name (babl->conversion.source),
+                babl_get_name (babl->conversion.destination),
+                babl->fish.pixels);
+        else if (lut_info_level >=5)
+        LUT_DETAIL("%i step path %s to %s  %8li pixels\n",
+                babl->fish_path.conversion_list->count,
+                babl_get_name (babl->conversion.source),
+                babl_get_name (babl->conversion.destination),
+                babl->fish.pixels);
+    }
+    babl->fish.pixels /= 2; // decay: halve the pixel count on every pass, integer halving does reach 0
+  }
+  return 0; /* always 0, for every class type */
+}
+                           
+/* Run gc_fishes() over every registered fish, stamping the pass with
+ * the current babl_ticks() value. */
+static void
+babl_gc_fishes (void)
+{
+  GcContext context;
+  context.time = babl_ticks ();
+  if (lut_info_level >= 5) /* ANSI sequences: cursor home, then clear screen */
+    fprintf (stdout, "\033[H\033[2J"); /* \033 is standard C, \e is a GNU extension */
+  babl_fish_class_for_each (gc_fishes, &context);
+}
+
+static long babl_conv_counter = 0;
+
+void
+babl_gc (void)
+{ // triggers a babl_gc_fishes() sweep once enough pixels have been counted
+  if (babl_conv_counter > 1000 * 1000 * 10) // run gc every 10 megapixels
+  {
+    babl_conv_counter = 0; // start counting towards the next sweep
+    babl_gc_fishes ();
+    // returning freed memory to the OS (e.g. malloc_trim (0))
+    // is the responsibility of higher layers
+  }
+}
+
+#define BABL_LIKELY(x)      __builtin_expect(!!(x), 1)
+#define BABL_UNLIKELY(x)    __builtin_expect(!!(x), 0)
+
+static float timings[256] = {0,};
+
+#define BPP_4ASSOCIATED   14
+
+static inline int _do_lut (uint32_t *lut,
+                           int   source_bpp,
+                           int   dest_bpp,
+                           const void *__restrict__ source,
+                           void *__restrict__ destination,
+                           long n)
+{
+        if (source_bpp == BPP_4ASSOCIATED  && dest_bpp == 4)
+        {
+          uint32_t *src = (uint32_t*)source;
+          uint32_t *dst = (uint32_t*)destination;
+          while (n--)
+          {
+             uint32_t col = *src++;
+             uint8_t *rgba=(uint8_t*)&col;
+             uint8_t oalpha = rgba[3];
+             if (oalpha==0)
+             {
+               *dst++ = 0;
+             }
+             else
+             {
+               uint32_t col_opaque = col;
+               uint8_t *rgbaB=(uint8_t*)&col_opaque;
+               uint32_t ralpha = 0;
+               ralpha = (256*255)/oalpha;
+               rgbaB[0] = (rgba[0]*ralpha)>>8;
+               rgbaB[1] = (rgba[1]*ralpha)>>8;
+               rgbaB[2] = (rgba[2]*ralpha)>>8;
+               rgbaB[3] = 0;
+               *dst++ = lut[col_opaque] | (oalpha<<24);
+             }
+          }
+        }
+        else if (source_bpp == 4 && dest_bpp == 16)
+        {
+          uint32_t *src = (uint32_t*)source;
+          uint32_t *dst = (uint32_t*)destination;
+          while (n--)
+          {
+             uint32_t col = *src++;
+             uint32_t lut_offset = col & 0xffffff;
+             float alpha = (col>>24)/255.0f;
+
+             *dst++ = lut[lut_offset*4+0];
+             *dst++ = lut[lut_offset*4+1];
+             *dst++ = lut[lut_offset*4+2];
+             ((float*)(dst))[0] = alpha;
+             dst++;
+          }
+        }
+        else if (source_bpp == 4 && dest_bpp == 8)
+        {
+          uint32_t *src = (uint32_t*)source;
+          uint16_t *dst = (uint16_t*)destination;
+          uint16_t *lut16 = (uint16_t*)lut;
+          while (n--)
+          {
+             uint32_t col = *src++;
+             uint32_t lut_offset = col & 0xffffff;
+             uint16_t alpha = (col>>24) * 257; /* 0xff -> 0xffff, covers the full u16 range */
+             /* the LUT holds 8 bytes per entry, i.e. four uint16_t values */
+             dst[0] = lut16[lut_offset*4+0];
+             dst[1] = lut16[lut_offset*4+1];
+             dst[2] = lut16[lut_offset*4+2];
+             dst[3] = alpha;
+             dst+=4;
+          }
+        }
+        else if (source_bpp == 2 && dest_bpp == 16)
+        {
+          uint16_t *src = (uint16_t*)source;
+          uint32_t *dst = (uint32_t*)destination;
+          while (n--)
+          {
+             uint32_t col = *src++;
+             *dst++ = lut[col*4+0];
+             *dst++ = lut[col*4+1];
+             *dst++ = lut[col*4+2];
+             *dst++ = lut[col*4+3];
+          }
+        }
+        else if (source_bpp == 4 && dest_bpp == 4)
+        {
+          uint32_t *src = (uint32_t*)source;
+          uint32_t *dst = (uint32_t*)destination;
+          while (n--)
+          {
+             uint32_t col = *src++;
+             *dst = (col & 0xff000000) | lut[col & 0xffffff];
+             dst++;
+          }
+        }
+        else if (source_bpp == 2 && dest_bpp == 4)
+        {
+          uint16_t *src = (uint16_t*)source;
+          uint32_t *dst = (uint32_t*)destination;
+          while (n--)
+          {
+            *dst = lut[*src++];
+            dst++;
+          }
+        }
+        else if (source_bpp == 2 && dest_bpp == 2)
+        {
+          uint16_t *src = (uint16_t*)source;
+          uint16_t *dst = (uint16_t*)destination;
+          uint16_t *lut16 = (uint16_t*)lut;
+          while (n--)
+          {
+             *dst = lut16[*src++];
+             dst++;
+          }
+        }
+        else if (source_bpp == 1 && dest_bpp == 4)
+        {
+          uint8_t *src = (uint8_t*)source;
+          uint32_t *dst = (uint32_t*)destination;
+          while (n--)
+          {
+             *dst = lut[*src++];
+             dst++;
+          }
+        }
+        else if (source_bpp == 3 && dest_bpp == 3)
+        {
+          uint8_t *src = (uint8_t*)source;
+          uint8_t *dst = (uint8_t*)destination;
+          while (n--)
+          {
+             uint32_t col = src[0]*256*256+src[1]*256+src[2];
+             uint32_t val = lut[col];
+             dst[2]=(val >> 16) & 0xff;
+             dst[1]=(val >> 8) & 0xff;
+             dst[0]=val & 0xff;
+             dst+=3;
+             src+=3;
+          }
+        }
+        else if (source_bpp == 3 && dest_bpp == 4)
+        {
+          uint8_t *src = (uint8_t*)source;
+          uint32_t *dst = (uint32_t*)destination;
+          while (n--)
+          {
+             *dst = lut[src[0]*256*256+src[1]*256+src[2]];
+             dst++;
+             src+=3;
+          }
+        }
+        else
+        {
+          return 0;
+        }
+        return 1;
+}
+
+void babl_test_lut (uint32_t *lut,
+             int   source_bpp,
+             int   dest_bpp,
+             void *__restrict__ source,
+             void *__restrict__ dest,
+             long count);
+void babl_test_lut (uint32_t *lut,
+             int   source_bpp,
+             int   dest_bpp,
+             void *__restrict__ source,
+             void *__restrict__ dest,
+             long count)
+{
+   _do_lut (lut, source_bpp, dest_bpp, source, dest, count);
+}
+
+static inline float lut_timing_for (int source_bpp, int dest_bpp)
+{
+  return timings[source_bpp * 16 + dest_bpp];
+}
+
+/* Time babl_test_lut() for each supported bpp pair and store the results in
+ * timings[]; also reads BABL_LUT_INFO and BABL_LUT_UNUSED_LIMIT. */
+static void measure_timings(void)
+{
+   int num_pixels = babl_get_num_path_test_pixels () * 1000;
+   int pairs[][2]={{4,4},{BPP_4ASSOCIATED,4},{4,8},{3,4},{3,3},{2,4},{2,2},{1,4},{2,16},{4,16}};
+   const char *env;
+   uint32_t *lut = malloc (256 * 256 * 256 * 16);
+   uint32_t *src = malloc (num_pixels * 16);
+   uint32_t *dst = malloc (num_pixels * 16);
+   if ((env = getenv ("BABL_LUT_INFO")))
+      lut_info_level = atoi (env);
+   if ((env = getenv ("BABL_LUT_UNUSED_LIMIT")))
+      lut_unused_minutes_limit = atof (env);
+   LUT_LOG("BABL_LUT_UNUSED_LIMIT=%.1f\n", lut_unused_minutes_limit);
+   if (!lut || !src || !dst)
+   { /* out of memory: rate every LUT as too slow so that none gets used */
+     for (size_t i = 0; i < sizeof (timings)/sizeof(timings[0]); i++)
+       timings[i] = 1e30f;
+     goto done;
+   }
+   memset (lut, 11, 256 * 256 * 256 *16);
+   memset (src, 12, num_pixels * 16);
+   LUT_LOG("measuring lut timings          \n");
+   for (size_t p = 0; p < sizeof (pairs)/sizeof(pairs[0]);p++)
+   {
+     int source_bpp = pairs[p][0];
+     int dest_bpp = pairs[p][1];
+     long start,end;
+     start = babl_ticks ();
+     babl_test_lut (lut, source_bpp, dest_bpp, src, dst, num_pixels);
+     end = babl_ticks ();
+     timings[source_bpp * 16 + dest_bpp] = (end-start)/1000.0;
+     LUT_LOG ("   %ibpp to %ibpp: %.2f\n", source_bpp, dest_bpp,
+              timings[source_bpp * 16 + dest_bpp]);
+   }
+done:
+   free (lut);
+   free (src);
+   free (dst);
+}
+
+static inline void
+process_conversion_path (BablList   *path,
+                         const void *source_buffer,
+                         int         source_bpp,
+                         void       *destination_buffer,
+                         int         dest_bpp,
+                         long        n);
+
+static inline int babl_fish_lut_process_maybe (const Babl *babl,
+                                               const char *source,
+                                               char *destination,
+                                               long        n,
+                                               void       *data)
+{
+     int source_bpp = babl->fish_path.source_bpp;
+     int dest_bpp = babl->fish_path.dest_bpp;
+     uint32_t *lut = (uint32_t*)babl->fish_path.u8_lut;
+ 
+
+     if (BABL_UNLIKELY(!lut && babl->fish.pixels >= 128 * 256))
+     {
+       LUT_LOG("generating LUT for %s to %s\n",
+               babl_get_name (babl->conversion.source),
+               babl_get_name (babl->conversion.destination));
+       if (source_bpp ==4 && dest_bpp == 4)
+       {
+         lut = malloc (256 * 256 * 256 * 4);
+         for (int o = 0; o < 256 * 256 * 256; o++)
+           lut[o] = o | 0xff000000;
+         process_conversion_path (babl->fish_path.conversion_list,
+                                  lut, 4,
+                                  lut, 4,
+                                  256*256*256);
+         for (int o = 0; o < 256 * 256 * 256; o++)
+           lut[o] = lut[o] & 0x00ffffff;
+
+       }
+       else if (source_bpp == 4 && dest_bpp == 16)
+       {
+         uint32_t *temp_lut = malloc (256 * 256 * 256 * 4);
+         lut = malloc (256 * 256 * 256 * 16);
+         for (int o = 0; o < 256 * 256 * 256; o++)
+           temp_lut[o] = o | 0xff000000;
+         process_conversion_path (babl->fish_path.conversion_list,
+                                  temp_lut, 4,
+                                  lut, 16,
+                                  256*256*256);
+         free (temp_lut);
+       }
+       else if (source_bpp == 4 && dest_bpp == 8)
+       {
+         uint32_t *temp_lut = malloc (256 * 256 * 256 * 4);
+         lut = malloc (256 * 256 * 256 * 8);
+         for (int o = 0; o < 256 * 256 * 256; o++)
+           temp_lut[o] = o | 0xff000000;
+         process_conversion_path (babl->fish_path.conversion_list,
+                                  temp_lut, 4,
+                                  lut, 8,
+                                  256*256*256);
+         free (temp_lut);
+       }
+       else if (source_bpp == 3 && dest_bpp == 3)
+       {
+         uint8_t *temp_lut = malloc (256 * 256 * 256 * 3);
+         uint8_t *temp_lut2 = malloc (256 * 256 * 256 * 3);
+         int o = 0;
+         lut = malloc (256 * 256 * 256 * 4);
+         for (int r = 0; r < 256; r++)
+         for (int g = 0; g < 256; g++)
+         for (int b = 0; b < 256; b++, o++)
+         {
+           temp_lut[o*3+0]=r;
+           temp_lut[o*3+1]=g;
+           temp_lut[o*3+2]=b;
+         }
+         process_conversion_path (babl->fish_path.conversion_list,
+                                  temp_lut, 3,
+                                  temp_lut2, 3,
+                                  256*256*256);
+         babl_process (babl_fish (babl_format ("R'G'B' u8"), babl_format ("R'G'B'A u8")),
+                       temp_lut2, lut, 256*256*256);
+         for (int o = 0; o < 256 * 256 * 256; o++)
+           lut[o] = lut[o] & 0x00ffffff;
+         free (temp_lut);
+         free (temp_lut2);
+       }
+       else if (source_bpp == 3 && dest_bpp == 4)
+       {
+         uint8_t *temp_lut = malloc (256 * 256 * 256 * 3);
+         int o = 0;
+         lut = malloc (256 * 256 * 256 * 4);
+         for (int r = 0; r < 256; r++)
+         for (int g = 0; g < 256; g++)
+         for (int b = 0; b < 256; b++, o++)
+         {
+           temp_lut[o*3+0]=r;
+           temp_lut[o*3+1]=g;
+           temp_lut[o*3+2]=b;
+         }
+         process_conversion_path (babl->fish_path.conversion_list,
+                                  temp_lut, 3,
+                                  lut, 4,
+                                  256*256*256);
+         free (temp_lut);
+       }
+       else if (source_bpp == 2 && dest_bpp == 2)
+       {
+         uint16_t *temp_lut = malloc (256 * 256 * 2);
+         lut = malloc (256 * 256 * 4);
+         for (int o = 0; o < 256*256; o++)
+         {
+           temp_lut[o]=o;
+         }
+         process_conversion_path (babl->fish_path.conversion_list,
+                                  temp_lut, 2,
+                                  lut, 2,
+                                  256*256);
+         free (temp_lut);
+       }
+       else if (source_bpp == 2 && dest_bpp == 4)
+       {
+         uint16_t *temp_lut = malloc (256 * 256 * 2);
+         lut = malloc (256 * 256 * 4);
+         for (int o = 0; o < 256*256; o++)
+         {
+           temp_lut[o]=o;
+         }
+         process_conversion_path (babl->fish_path.conversion_list,
+                                  temp_lut, 2,
+                                  lut, 4,
+                                  256*256);
+         free (temp_lut);
+       }
+       else if (source_bpp == 2 && dest_bpp == 16)
+       {
+         uint16_t *temp_lut = malloc (256 * 256 * 2);
+         lut = malloc (256 * 256 * 16);
+         for (int o = 0; o < 256*256; o++)
+         {
+           temp_lut[o]=o;
+         }
+         process_conversion_path (babl->fish_path.conversion_list,
+                                  temp_lut, 2,
+                                  lut, 16,
+                                  256*256);
+         free (temp_lut);
+       }
+       else if (source_bpp == 1 && dest_bpp == 4)
+       {
+         uint8_t *temp_lut = malloc (256);
+         lut = malloc (256 * 4);
+         for (int o = 0; o < 256; o++)
+         {
+           temp_lut[o]=o;
+         }
+         process_conversion_path (babl->fish_path.conversion_list,
+                                  temp_lut, 1,
+                                  lut, 4,
+                                  256);
+         free (temp_lut);
+       }
+
+       if (babl->fish_path.u8_lut == NULL)
+       {
+         (BABL(babl)->fish_path.u8_lut) = lut;
+         // XXX need memory barrier?
+         if ((BABL(babl)->fish_path.u8_lut) != lut)
+         {
+           free (lut);
+           lut = babl->fish_path.u8_lut;
+         }
+       }
+       else
+       {
+         free (lut);
+         lut = babl->fish_path.u8_lut;
+       }
+     }
+
+     if (lut)
+     {
+       if (source_bpp == 4 && 
+           ((babl->conversion.source->format.model->flags &
+           BABL_MODEL_FLAG_ASSOCIATED)!=0))
+         source_bpp = BPP_4ASSOCIATED;
+
+       if (_do_lut (lut, source_bpp, dest_bpp, source, destination, n))
+       {
+         BABL(babl)->fish_path.last_lut_use = babl_ticks ();
+         return 1;
+       }
+     }
+     return 0;
+}
+
+
+
 #define MAX_BUFFER_SIZE            512
-#define ITERATIONS                 4
 
 int   babl_in_fish_path = 0;
 
@@ -100,7 +615,6 @@ _babl_fish_create_name (char       *buf,
 static int max_path_length (void);
 
 static int debug_conversions = 0;
-int _babl_instrument = 0;
 
 double
 _babl_legal_error (void)
@@ -123,11 +637,19 @@ _babl_legal_error (void)
   else
     debug_conversions = 0;
 
-  env = getenv ("BABL_INSTRUMENT");
+  env = getenv ("BABL_LUT");
   if (env && env[0] != '\0')
-    _babl_instrument = 1;
+    enable_lut = atoi(getenv("BABL_LUT"));
   else
-    _babl_instrument = 0;
+    enable_lut = 1;
+
+  { 
+    const uint32_t u32 = 1;
+    if ( *((char*)&u32) == 0)
+    {  /* disable use of LUTs if we are running on big endian */
+       enable_lut = 0;
+    }
+  }
 
   return error;
 }
@@ -145,9 +667,9 @@ max_path_length (void)
   if (env)
     max_length = atoi (env);
   else
-    max_length = 2; /* reducing this number makes finding short fishes much
+    max_length = 3; /* reducing this number makes finding short fishes much
                        faster - even if we lose out on some of the fast
-                       bigger fish, the fishes we can get with a max_length of 2
+                       bigger fish, the fishes we can get with a max_length of 3
                        is actually 5, since we deepen the search to that
                        depth if none are found within two steps in the
                        initial search.
@@ -317,6 +839,9 @@ int
 _babl_fish_path_destroy (void *data)
 {
   Babl *babl=data;
+  if (babl->fish_path.u8_lut)
+    free (babl->fish_path.u8_lut);
+  babl->fish_path.u8_lut = NULL;
   if (babl->fish_path.conversion_list)
     babl_free (babl->fish_path.conversion_list);
   babl->fish_path.conversion_list = NULL;
@@ -406,6 +931,7 @@ alias_conversion (Babl *babl,
                 babl_remodel_with_space (
                       (void*)conv->destination, (void*)space),
                 "linear", conv->function.linear,
+                "data",   conv->data,
                 NULL);
           break;
         case BABL_CONVERSION_PLANAR:
@@ -415,6 +941,7 @@ alias_conversion (Babl *babl,
                 babl_remodel_with_space (
                       (void*)conv->destination, (void*)space),
                 "planar", conv->function.planar,
+                "data",   conv->data,
                 NULL);
           break;
         case BABL_CONVERSION_PLANE:
@@ -424,6 +951,7 @@ alias_conversion (Babl *babl,
                 babl_remodel_with_space (
                       (void*)conv->destination, (void*)space),
                 "plane", conv->function.plane,
+                "data",  conv->data,
                 NULL);
           break;
         default:
@@ -469,6 +997,92 @@ _babl_fish_prepare_bpp (Babl *babl)
        default:
          babl_log ("-eeek{%i}\n", babl_dest->instance.class_type - BABL_MAGIC);
      }
+
+  if (enable_lut)
+  {
+  int         source_bpp  = babl->fish_path.source_bpp;
+  int         dest_bpp    = babl->fish_path.dest_bpp;
+  const Babl *source_type = babl_format_get_type (babl_source,
+                                                  babl_format_get_n_components (babl_source) - 1);
+  const Babl *dest_type   = babl_format_get_type (babl_dest,
+                                                  babl_format_get_n_components (babl_dest) - 1);
+
+  int src_not_associated = ((babl->conversion.source->format.model->flags &
+          BABL_MODEL_FLAG_ASSOCIATED)==0);
+  int dest_not_associated = ((babl->conversion.destination->format.model->flags &
+          BABL_MODEL_FLAG_ASSOCIATED)==0);
+  if (
+      (babl->conversion.source->format.type[0]->bits < 32)       
+
+      && (  (   source_bpp == 2
+             && dest_bpp   == 16)
+
+          ||(   source_bpp  == 4
+             && dest_bpp    == 16
+             && source_type == babl_type_from_id (BABL_U8)
+             && dest_type   == babl_type_from_id (BABL_FLOAT)
+             && src_not_associated
+             && dest_not_associated)
+
+          ||(   source_bpp == 4
+             && dest_bpp   == 4
+             && dest_type  == source_type
+             && dest_not_associated)
+
+          ||(   source_bpp  == 4
+             && dest_bpp    == 8
+             && source_type == babl_type_from_id (BABL_U8)
+             && dest_type   == babl_type_from_id (BABL_U16)
+             && src_not_associated
+             && dest_not_associated)
+
+          ||(   source_bpp == 3
+             && dest_bpp   == 4)
+
+          ||(   source_bpp == 2
+             && dest_bpp   == 4)
+
+          ||(   source_bpp == 2
+             && dest_bpp   == 2)
+
+          ||(   source_bpp == 1
+             && dest_bpp   == 4)
+
+          ||(   source_bpp == 3
+             && dest_bpp   == 3)
+      )
+     )
+  {
+     // As long as the highest 8 bits (alpha) of a 4 byte input are ignored
+     // and the color model is not associated, a 24 bit LUT provides
+     // exact data.
+     // Note that alpha can only be copied over from the source when the
+     // types match expectations - the source_bpp/dest_bpp pairs currently
+     // have a built-in expectation of which type the alpha is filled in as.
+     {
+       static int measured_timings = 0;
+       float scaling = 10.0;
+       if (!measured_timings) measure_timings ();
+       measured_timings = 1;
+       LUT_LOG ("%sLUT for %s to %s   %.2f%s%.2f\n",
+
+       ((lut_timing_for (source_bpp, dest_bpp) * scaling) <
+                           babl->fish_path.cost)?"possible ":"no ",
+
+                        babl_get_name (babl->conversion.source),
+                        babl_get_name (babl->conversion.destination),
+                        (lut_timing_for (source_bpp, dest_bpp) * scaling),
+       ((lut_timing_for (source_bpp, dest_bpp) * scaling) <
+                           babl->fish_path.cost)?" < ":" > ",
+                        babl->fish_path.cost);
+       if ((lut_timing_for (source_bpp, dest_bpp) * scaling) <
+                           babl->fish_path.cost)
+       {
+         babl->fish_path.is_u8_color_conv = 1;
+       }
+     }
+  }
+  }
 }
 
 void
@@ -549,6 +1163,7 @@ babl_fish_path2 (const Babl *source,
     static const Babl *run_once[512]={NULL};
     int i;
     int done = 0;
+
     for (i = 0; run_once[i]; i++)
     {
       if (run_once[i] == source->format.space)
@@ -579,7 +1194,6 @@ babl_fish_path2 (const Babl *source,
     {
       babl_conversion_class_for_each (show_item, (void*)source->format.space);
     }
-
   }
 
   babl = babl_calloc (1, sizeof (BablFishPath) +
@@ -598,10 +1212,11 @@ babl_fish_path2 (const Babl *source,
   babl->fish_path.conversion_list = babl_list_init_with_size (BABL_HARD_MAX_PATH_LENGTH);
 
 
+
   {
     PathContext pc;
     int start_depth = max_path_length ();
-    int end_depth = start_depth + 2 + ((destination->format.space != sRGB)?1:0);
+    int end_depth = start_depth + 1 + ((destination->format.space != sRGB)?1:0);
     end_depth = MIN(end_depth, BABL_HARD_MAX_PATH_LENGTH);
 
     pc.current_path = babl_list_init_with_size (BABL_HARD_MAX_PATH_LENGTH);
@@ -649,6 +1264,7 @@ babl_fish_path2 (const Babl *source,
     }
 
   _babl_fish_prepare_bpp (babl);
+
   _babl_fish_rig_dispatch (babl);
   /* Since there is not an already registered instance by the required
    * name, inserting newly created class into database.
@@ -701,6 +1317,18 @@ babl_fish_path_process (const Babl *babl,
                         long        n,
                         void       *data)
 {
+  BABL(babl)->fish.pixels += n;
+  if (babl->fish_path.is_u8_color_conv)
+  {
+     if (babl_fish_lut_process_maybe (babl,
+                                      source, destination, n,
+                                      data))
+       return;
+  }
+  else
+  {
+    babl_conv_counter+=n;
+  }
   process_conversion_path (babl->fish_path.conversion_list,
                            source,
                            babl->fish_path.source_bpp,
@@ -787,8 +1415,6 @@ _babl_process (const Babl *cbabl,
 {
   Babl *babl = (void*)cbabl;
   babl->fish.dispatch (babl, source, destination, n, *babl->fish.data);
-  if (_babl_instrument)
-    babl->fish.pixels += n;
   return n;
 }
 
@@ -820,8 +1446,6 @@ babl_process_rows (const Babl *fish,
   if (n <= 0)
     return 0;
 
-  if (_babl_instrument)
-    babl->fish.pixels += n * rows;
   for (row = 0; row < rows; row++)
     {
       babl->fish.dispatch (babl, (void*)src, (void*)dst, n, *babl->fish.data);
@@ -835,7 +1459,7 @@ babl_process_rows (const Babl *fish,
 #include <stdint.h>
 
 #define BABL_ALIGN 16
-static void inline *align_16 (unsigned char *ret)
+static inline void *align_16 (unsigned char *ret)
 {
   int offset = BABL_ALIGN - ((uintptr_t) ret) % BABL_ALIGN;
   ret = ret + offset;
diff --git a/babl/babl-fish-reference.c b/babl/babl-fish-reference.c
index a62f32a..2725e3e 100644
--- a/babl/babl-fish-reference.c
+++ b/babl/babl-fish-reference.c
@@ -358,7 +358,6 @@ ncomponent_convert_from_double (BablFormat *destination_fmt,
   src_img->stride[0] = 0;
 
   dst_img->data[0]  = destination_buf;
-  dst_img->type[0]  = (BablType *) babl_type_from_id (BABL_DOUBLE);
   dst_img->pitch[0] = destination_fmt->type[0]->bits/8;
   dst_img->stride[0] = 0;
 
@@ -480,7 +479,6 @@ ncomponent_convert_from_float (BablFormat *source_fmt,
   src_img->stride[0] = 0;
 
   dst_img->data[0]  = destination_buf;
-  dst_img->type[0]  = (BablType *) babl_type_from_id (BABL_FLOAT);
   dst_img->pitch[0] = destination_fmt->type[0]->bits/8;
   dst_img->stride[0] = 0;
 
diff --git a/babl/babl-fish.c b/babl/babl-fish.c
index ce22b6b..06d7961 100644
--- a/babl/babl-fish.c
+++ b/babl/babl-fish.c
@@ -253,6 +253,7 @@ babl_fish (const void *source,
          * we will search through the fish database for reference fish
          * to handle the memcpy */
         babl_hash_table_find (id_htable, hashval, find_memcpy_fish, (void *) &ffish);
+        babl_mutex_lock (babl_fish_mutex);
       }
     else
       {
@@ -359,4 +360,10 @@ babl_fish (const void *source,
   }
 }
 
+/* Returns the dispatch function pointer stored in the given fish. */
+BablFishProcess babl_fish_get_process (const Babl *babl)
+{
+  return babl->fish.dispatch;
+}
+
 BABL_CLASS_MINIMAL_IMPLEMENT (fish);
diff --git a/babl/babl-fish.h b/babl/babl-fish.h
index 35382f0..0ad9101 100644
--- a/babl/babl-fish.h
+++ b/babl/babl-fish.h
@@ -69,6 +69,9 @@ typedef struct
   double     cost;   /* number of  ticks *10 + chain_length */
   int        source_bpp;
   int        dest_bpp;
+  unsigned int is_u8_color_conv:1; // set when a LUT is judged cheaper than running the path
+  uint32_t  *u8_lut;       // lookup table, built on demand; NULL while absent
+  long       last_lut_use; // babl_ticks() value of the last successful LUT use
   BablList  *conversion_list;
 } BablFishPath;
 
diff --git a/babl/babl-format.c b/babl/babl-format.c
index 2e4e3d7..cbcc880 100644
--- a/babl/babl-format.c
+++ b/babl/babl-format.c
@@ -140,8 +140,9 @@ format_new_from_format_with_space (const Babl *format,
 {
   Babl *ret;
   char new_name[256];
-  snprintf (new_name, sizeof (new_name), "%s-%s", babl_get_name ((void*)format),
+  snprintf (new_name, sizeof (new_name)-1, "%s-%s", babl_get_name ((void*)format),
                                                   babl_get_name ((void*)space));
+  new_name[255]=0;
   ret = babl_db_find (babl_format_db(), new_name);
   if (ret)
     return ret;
@@ -346,7 +347,7 @@ babl_format_new (const void *first_arg,
   int            components = 0;
   BablModel     *model      = NULL;
   const Babl    *space      = babl_space ("sRGB");
-  const char    *doc        = NULL;
+  char          *doc        = NULL;
   BablComponent *component [BABL_MAX_COMPONENTS];
   BablSampling  *sampling  [BABL_MAX_COMPONENTS];
   const BablType*type      [BABL_MAX_COMPONENTS];
@@ -468,16 +469,18 @@ babl_format_new (const void *first_arg,
   va_end (varg);
 
   if (!name)
-    name = create_name (model, components, component, type);
+    {
+      name = create_name (model, components, component, type);
 
-  if (space != babl_space ("sRGB"))
-  {
-    char *new_name = babl_malloc (strlen (name) +
-                                  strlen (babl_get_name ((Babl*)space)) + 1);
-    sprintf (new_name, "%s-%s", name, babl_get_name ((Babl*)space));
-    babl_free (name);
-    name = new_name;
-  }
+      if (space != babl_space ("sRGB"))
+        {
+          char *new_name = babl_malloc (strlen (name) + /* + 2: the '-' and the NUL */
+                                        strlen (babl_get_name ((Babl*)space)) + 2);
+          sprintf (new_name, "%s-%s", name, babl_get_name ((Babl*)space));
+          babl_free (name);
+          name = new_name;
+        }
+    }
 
   if (!model)
     {
@@ -508,6 +511,7 @@ babl_format_new (const void *first_arg,
                     "with different content!", name);
 
       babl_free (name);
+      babl_free (doc);
       return babl;
     }
 
diff --git a/babl/babl-icc.c b/babl/babl-icc.c
index a21888f..38e382a 100644
--- a/babl/babl-icc.c
+++ b/babl/babl-icc.c
@@ -322,6 +322,11 @@ read_sign (ICC *state,
            int  offset)
 {
   sign_t ret;
+  if (offset < 0 || offset > state->length - 4)
+  { /* out of range: hand back an all-zero (empty) signature */
+    for (int i = 0; i < 5; i ++) ret.str[i]=0;
+    return ret;
+  }
   ret.str[0]=icc_read (u8, offset);
   ret.str[1]=icc_read (u8, offset + 1);
   ret.str[2]=icc_read (u8, offset + 2);
@@ -356,10 +361,23 @@ icc_tag (ICC        *state,
      sign_t sign = icc_read (sign, TAG_COUNT_OFF + 4 + 12 * t);
      if (!strcmp (sign.str, tag))
      {
+        int off = icc_read (u32, TAG_COUNT_OFF + 4 + 12* t + 4);
+        int len = icc_read (u32, TAG_COUNT_OFF + 4 + 12* t + 4*2);
+        /* reject negative values; compare without computing off + len */
+        if (off < 0 || len < 0 || len > state->length || off > state->length - len)
+        {
+          if (offset)
+            *offset = 0;
+          if (el_length)
+            *el_length = 0;
+           return 0; // broken input
+        }
+
         if (offset)
-          *offset = icc_read (u32, TAG_COUNT_OFF + 4 + 12* t + 4);
+          *offset = off;
         if (el_length)
-          *el_length = icc_read (u32, TAG_COUNT_OFF + 4 + 12* t + 4*2);
+          *el_length = len;
+
         return 1;
      }
   }
@@ -384,18 +402,36 @@ babl_trc_from_icc (ICC         *state,
               g = icc_read (s15f16, offset + 12 + 4 * 0);
               return babl_trc_gamma (g);
               break;
+            case 1:
+              {
+                float a,b,c;
+                g = icc_read (s15f16, offset + 12 + 4 * 0);
+                a = icc_read (s15f16, offset + 12 + 4 * 1);
+                b = icc_read (s15f16, offset + 12 + 4 * 2);
+                c = 0;
+                return babl_trc_formula_cie (g, a, b, c);
+              }
+            case 2:
+              {
+                float a,b,c;
+                g = icc_read (s15f16, offset + 12 + 4 * 0);
+                a = icc_read (s15f16, offset + 12 + 4 * 1);
+                b = icc_read (s15f16, offset + 12 + 4 * 2);
+                c = icc_read (s15f16, offset + 12 + 4 * 3);
+                return babl_trc_formula_cie (g, a, b, c);
+              }
             case 3:
               {
-                float a,b,c,d;
+                float a,b,c,d,e,f;
                 g = icc_read (s15f16, offset + 12 + 4 * 0);
                 a = icc_read (s15f16, offset + 12 + 4 * 1);
                 b = icc_read (s15f16, offset + 12 + 4 * 2);
                 c = icc_read (s15f16, offset + 12 + 4 * 3);
                 d = icc_read (s15f16, offset + 12 + 4 * 4);
-                //fprintf (stderr, "%f %f %f %f %f\n", g, a, b, c, d);
-                return babl_trc_formula_srgb (g, a, b, c, d);
+                e = 0.0f;
+                f = 0.0f;
+                return babl_trc_formula_srgb (g, a, b, c, d, e, f);
               }
-              break;
             case 4:
               {
                 float a,b,c,d,e,f;
@@ -406,15 +442,8 @@ babl_trc_from_icc (ICC         *state,
                 d = icc_read (s15f16, offset + 12 + 4 * 4);
                 e = icc_read (s15f16, offset + 12 + 4 * 5);
                 f = icc_read (s15f16, offset + 12 + 4 * 6);
-                fprintf (stderr, "%f %f %f %f %f %f %f\n",
-                              g, a, b, c, d, e, f);
-            {
-              fprintf (stderr, "unhandled parametric sRGB formula TRC type %i\n", function_type);
-              *error = "unhandled sRGB formula like TRC";
-              return babl_trc_gamma (2.2);
-            }
-                              }
-              break;
+                return babl_trc_formula_srgb (g, a, b, c, d, e, f);
+              }
             default:
               *error = "unhandled parametric TRC";
               fprintf (stderr, "unhandled parametric TRC type %i\n", function_type);
@@ -539,6 +568,8 @@ switch (trc->type)
       break;
     }
   case BABL_TRC_FORMULA_SRGB:
+  // fall through
+  case BABL_TRC_FORMULA_CIE:
     {
       int lut_size = 512;
       if (flags == BABL_ICC_COMPACT_TRC_LUT)
@@ -676,7 +707,6 @@ babl_space_to_icc_rgb (const Babl  *babl,
         icc_write (u8, state->o + 12 + i, description[i]);
     }
 
-
     icc_write (u32, 0, state->no + 0);
     length = state->no + 0;
   }
@@ -941,6 +971,8 @@ babl_space_from_icc (const char   *icc_data,
 
   sign_t profile_class, color_space, pcs;
 
+  babl_mutex_lock (babl_space_mutex);
+
   if (!error) error = &int_err;
   *error = NULL;
 
@@ -956,12 +988,23 @@ babl_space_from_icc (const char   *icc_data,
     if (!strcmp (color_space.str, "CMYK"))
     {
        ret = _babl_space_for_lcms (icc_data, icc_length);
+       if (!ret)
+       {
+         babl_mutex_unlock (babl_space_mutex);
+         return NULL;
+       }
        if (ret->space.icc_type == BablICCTypeCMYK)
+       {
+         babl_mutex_unlock (babl_space_mutex);
          return ret;
+       }
        ret->space.icc_length = icc_length;
        ret->space.icc_profile = malloc (icc_length);
        if (!ret->space.icc_profile)
+       {
+         babl_mutex_unlock (babl_space_mutex);
          return NULL;
+       }
        memcpy (ret->space.icc_profile, icc_data, icc_length);
 
 #ifdef HAVE_LCMS
@@ -991,25 +1034,24 @@ babl_space_from_icc (const char   *icc_data,
                                                     //  INTENT_PERCEPTUAL,0);//intent & 7, 0);
        cmsCloseProfile (ret->space.cmyk.lcms_profile); // XXX keep it open in case of CMYK to CMYK transforms needed?
 #endif
+       ret->space.icc_type = BablICCTypeCMYK;
+       babl_mutex_unlock (babl_space_mutex);
        return ret;
     }
 
-
-
-
-    if (strcmp (color_space.str, "RGB ")
-        && strcmp (color_space.str, "GRAY")
-    )
+    if (!(!strcmp (color_space.str, "RGB ")||
+          !strcmp (color_space.str, "GRAY")))
     {
       *error = "not defining RGB, CMYK or GRAY space..";
     }
     else
-     {
-       if (strcmp (profile_class.str, "mntr"))
-         *error = "not a monitor-class profile";
-       if (!strcmp (color_space.str, "GRAY"))
-         is_gray = 1;
-     }
+    {
+      if (!(!strcmp (profile_class.str, "mntr")||
+            !strcmp (profile_class.str, "scnr")))
+         *error = "not a display or input-class profile";
+      if (!strcmp (color_space.str, "GRAY"))
+        is_gray = 1;
+    }
   }
 
   if (!*error)
@@ -1051,10 +1093,10 @@ babl_space_from_icc (const char   *icc_data,
       }
       break;
     case BABL_ICC_INTENT_ABSOLUTE_COLORIMETRIC:
-      *error = "absolute colormetric not implemented";
+      *error = "absolute colorimetric not implemented";
       break;
     case BABL_ICC_INTENT_SATURATION:
-      *error = "absolute stauration not supported";
+      *error = "saturation not supported";
       break;
   }
 
@@ -1097,6 +1139,7 @@ babl_space_from_icc (const char   *icc_data,
   {
 
     babl_free (state);
+    babl_mutex_unlock (babl_space_mutex);
     return NULL;
   }
 
@@ -1114,6 +1157,7 @@ babl_space_from_icc (const char   *icc_data,
     ret->space.icc_profile = malloc (icc_length);
     memcpy (ret->space.icc_profile, icc_data, icc_length);
     babl_free (state);
+    babl_mutex_unlock (babl_space_mutex);
     return ret;
 
 
@@ -1159,6 +1203,7 @@ babl_space_from_icc (const char   *icc_data,
            *error = "Inconsistent ICC profile detected, profile contains both cLUTs and a matrix with swapped primaries, this likely means it is an intentionally inconsistent Argyll profile is in use; this profile is only capable of high accuracy rendering and does not permit acceleration for interactive previews.";
            fprintf (stderr, "babl ICC warning: %s\n", *error);
            babl_free (state);
+           babl_mutex_unlock (babl_space_mutex);
            return NULL;
         }
       }
@@ -1168,6 +1213,7 @@ babl_space_from_icc (const char   *icc_data,
      if (ret)
      {
         babl_free (state);
+        babl_mutex_unlock (babl_space_mutex);
         return ret;
      }
 
@@ -1180,9 +1226,10 @@ babl_space_from_icc (const char   *icc_data,
                 trc_red, trc_green, trc_blue);
 
        babl_free (state);
-       ret->space.icc_length = icc_length;
+       ret->space.icc_length  = icc_length;
        ret->space.icc_profile = malloc (icc_length);
        memcpy (ret->space.icc_profile, icc_data, icc_length);
+       babl_mutex_unlock (babl_space_mutex);
        return ret;
      }
   }
@@ -1200,11 +1247,13 @@ babl_space_from_icc (const char   *icc_data,
      if (phosporant != 0)
      {
        *error = "unhandled phosporants, please report bug against babl with profile";
+       babl_mutex_unlock (babl_space_mutex);
        return NULL;
      }
      if (channels != 3)
      {
        *error = "unexpected non 3 count of channels";
+       babl_mutex_unlock (babl_space_mutex);
        return NULL;
      }
 
@@ -1234,6 +1283,7 @@ babl_space_from_icc (const char   *icc_data,
        ret->space.icc_profile = malloc (icc_length);
        memcpy (ret->space.icc_profile, icc_data, icc_length);
 
+       babl_mutex_unlock (babl_space_mutex);
        return ret;
      }
   }
@@ -1241,6 +1291,7 @@ babl_space_from_icc (const char   *icc_data,
   }
 
   babl_free (state);
+  babl_mutex_unlock (babl_space_mutex);
   return NULL;
 }
 
@@ -1467,8 +1518,11 @@ ConvertUTF16toUTF8 (const UTF16   **sourceStart,
 	}
 	switch (bytesToWrite) { /* note: everything falls through. */
 	    case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+	    // fall through
 	    case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+	    // fall through
 	    case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+	    // fall through
 	    case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
 	}
 	target += bytesToWrite;
diff --git a/babl/babl-internal.c b/babl/babl-internal.c
index f7939a1..072127a 100644
--- a/babl/babl-internal.c
+++ b/babl/babl-internal.c
@@ -84,6 +84,8 @@ BablMutex *babl_format_mutex;
 BablMutex *babl_debug_mutex;
 #endif
 BablMutex *babl_reference_mutex;
+BablMutex *babl_space_mutex;
+BablMutex *babl_remodel_mutex;
 
 void
 babl_internal_init (void)
@@ -93,6 +95,8 @@ babl_internal_init (void)
   babl_fish_mutex = babl_mutex_new ();
   babl_format_mutex = babl_mutex_new ();
   babl_reference_mutex = babl_mutex_new ();
+  babl_space_mutex = babl_mutex_new ();
+  babl_remodel_mutex = babl_mutex_new ();
 #if BABL_DEBUG_MEM
   babl_debug_mutex = babl_mutex_new ();
 #endif
diff --git a/babl/babl-internal.h b/babl/babl-internal.h
index b3c9785..38c4db5 100644
--- a/babl/babl-internal.h
+++ b/babl/babl-internal.h
@@ -249,6 +249,8 @@ extern int   babl_in_fish_path;
 extern BablMutex *babl_format_mutex;
 extern BablMutex *babl_fish_mutex;
 extern BablMutex *babl_reference_mutex;
+extern BablMutex *babl_space_mutex;
+extern BablMutex *babl_remodel_mutex;
 
 #define BABL_DEBUG_MEM 0
 #if BABL_DEBUG_MEM
@@ -362,13 +364,16 @@ void babl_store_db (void);
 int _babl_max_path_len (void);
 
 
-const Babl *
-babl_trc_new (const char *name,
+extern const Babl *
+(*babl_trc_new) (const char *name,
               BablTRCType type,
               double      gamma,
               int         n_lut,
               float      *lut);
 
+extern const Babl *
+(*babl_trc_lookup_by_name) (const char *name);
+
 void babl_space_to_xyz   (const Babl *space, const double *rgb, double *xyz);
 void babl_space_from_xyz (const Babl *space, const double *xyz, double *rgb);
 
@@ -383,9 +388,11 @@ const char *
 babl_conversion_create_name (Babl *source, Babl *destination, int type,
                              int allow_collision);
 
-void _babl_space_add_universal_rgb (const Babl *space);
+extern void (*_babl_space_add_universal_rgb) (const Babl *space);
+const Babl *
+babl_trc_formula_srgb (double gamma, double a, double b, double c, double d, double e, double f);
 const Babl *
-babl_trc_formula_srgb (double gamma, double a, double b, double c, double d);
+babl_trc_formula_cie (double gamma, double a, double b, double c);
 
 
 const Babl *babl_space_match_trc_matrix (const Babl *trc_red,
@@ -424,8 +431,6 @@ void babl_space_to_xyz   (const Babl *space, const double *rgb, double *xyz);
  */
 void babl_space_from_xyz (const Babl *space, const double *xyz, double *rgb);
 
-extern int _babl_instrument;
-
 static inline void
 babl_conversion_process (const Babl *babl,
                          const char *source,
@@ -433,8 +438,6 @@ babl_conversion_process (const Babl *babl,
                          long        n)
 {
   BablConversion *conversion = (BablConversion *) babl;
-  if (_babl_instrument)
-    conversion->pixels += n;
   conversion->dispatch (babl, source, destination, n, conversion->data);
 }
 
@@ -465,4 +468,7 @@ char *babl_space_to_icc (const Babl  *space,
 Babl *
 _babl_space_for_lcms (const char *icc_data, int icc_length); // XXX pass profile for dedup?
 
+void
+babl_trc_class_init (void);
+
 #endif
diff --git a/babl/babl-introspect.c b/babl/babl-introspect.c
index 6230f92..00168cc 100644
--- a/babl/babl-introspect.c
+++ b/babl/babl-introspect.c
@@ -68,7 +68,7 @@ babl_introspect (Babl *babl)
   babl_conversion_class_for_each (each_introspect, NULL);
   babl_log ("");
   babl_log ("trcs:");
-  babl_trc_class_for_each (each_introspect, NULL);
+  //babl_trc_class_for_each (each_introspect, NULL);
   babl_log ("");
   babl_log ("spaces:");
   babl_space_class_for_each (each_introspect, NULL);
diff --git a/babl/babl-matrix.h b/babl/babl-matrix.h
index 2107b31..714debc 100644
--- a/babl/babl-matrix.h
+++ b/babl/babl-matrix.h
@@ -9,7 +9,7 @@ static inline void babl_matrix_mul_matrix (const double *matA_,
                                            const double *matB_,
                                            double *out)
 {
-  int i, j;
+  unsigned int i, j;
   double matA[9];
   double matB[9];
   double t1, t2, t3;
@@ -36,7 +36,7 @@ static inline void babl_matrix_mul_matrixf (const float *matA_,
                                             const float *matB_,
                                             float *out)
 {
-  int i, j;
+  unsigned int i, j;
   float matA[9];
   float matB[9];
   float t1, t2, t3;
@@ -60,7 +60,7 @@ static inline void babl_matrix_mul_matrixf (const float *matA_,
 
 static inline void babl_matrix_to_float (const double *in, float *out)
 {
-  int i;
+  unsigned int i;
   for (i = 0; i < 9; i ++)
     out[i] = in[i];
 }
@@ -141,9 +141,9 @@ static inline void babl_matrix_mul_vectorff (const float *mat, const float *v_in
 }
 
 static inline void babl_matrix_mul_vectorff_buf3 (const float *mat, const float *v_in, float *v_out,
-                                                  int samples)
+                                                  unsigned int samples)
 {
-  int i;
+  unsigned int i;
   const float m_0_0 = m(mat, 0, 0);
   const float m_0_1 = m(mat, 0, 1);
   const float m_0_2 = m(mat, 0, 2);
@@ -166,7 +166,7 @@ static inline void babl_matrix_mul_vectorff_buf3 (const float *mat, const float
 }
 
 static inline void babl_matrix_mul_vectorff_buf4 (const float *mat, const float *v_in, float *v_out,
-                                                  int samples)
+                                                  unsigned int samples)
 {
   const float m_0_0 = m(mat, 0, 0);
   const float m_0_1 = m(mat, 0, 1);
@@ -177,7 +177,7 @@ static inline void babl_matrix_mul_vectorff_buf4 (const float *mat, const float
   const float m_2_0 = m(mat, 2, 0);
   const float m_2_1 = m(mat, 2, 1);
   const float m_2_2 = m(mat, 2, 2);
-  int i;
+  unsigned int i;
   for (i = 0; i < samples; i ++)
   {
     float a = v_in[0], b = v_in[1], c = v_in[2];
@@ -192,9 +192,9 @@ static inline void babl_matrix_mul_vectorff_buf4 (const float *mat, const float
 }
 
 static inline void babl_matrix_mul_vector_buf4 (const double *mat, const double *v_in, double *v_out,
-                                                int samples)
+                                                unsigned int samples)
 {
-  int i;
+  unsigned int i;
   const double m_0_0 = m(mat, 0, 0);
   const double m_0_1 = m(mat, 0, 1);
   const double m_0_2 = m(mat, 0, 2);
diff --git a/babl/babl-memory.c b/babl/babl-memory.c
index 5c9214e..b7d83ef 100644
--- a/babl/babl-memory.c
+++ b/babl/babl-memory.c
@@ -318,8 +318,8 @@ babl_strcat (char       *dest,
              const char *src)
 {
   char *ret;
-  int   src_len;
-  int   dst_len;
+  size_t src_len;
+  size_t dst_len;
 
   if (NULL == src)
     return dest;
diff --git a/babl/babl-model.c b/babl/babl-model.c
index 44481ac..a2ec53a 100644
--- a/babl/babl-model.c
+++ b/babl/babl-model.c
@@ -382,8 +382,9 @@ babl_model_is_symmetric (const Babl *cbabl)
         for (j = 0; j < 4; j++)
         {
           float tolerance = TOLERANCE;
+          /* scale the tolerance to adapt to value ranges outside 0-1 */
           if (fabs(clipped[i*4+j]) > 1.0)
-            tolerance = fabs(clipped[i*4+j]) * 0.01;
+             tolerance = fabs(clipped[i*4+j]) * TOLERANCE;
           if (fabs (clipped[i *4 + j] - transformed[i * 4 + j]) > tolerance)
             {
               if (!log)
@@ -420,11 +421,9 @@ babl_model_is_symmetric (const Babl *cbabl)
 
 BABL_CLASS_IMPLEMENT (model)
 
-/* XXX: probably better to do like with babl_format, add a -suffix and
- *      insert in normal database than to have this static cache list
- */
-static const Babl *babl_remodels[512]={NULL,};
-int          babl_n_remodels = 0;
+static Babl **babl_remodels = NULL;
+static int    babl_remodel_size = 0;
+static int    babl_n_remodels = 0;
 
 const Babl *
 babl_remodel_with_space (const Babl *model, 
@@ -453,24 +452,37 @@ babl_remodel_with_space (const Babl *model,
 
   assert (BABL_IS_BABL (model));
 
+  babl_mutex_lock (babl_remodel_mutex);
   /* get back to the sRGB model if we are in a COW clone of it  */
   if (model->model.model)
     model = (void*)model->model.model;
 
   assert (BABL_IS_BABL (model));
+  if (babl_remodel_size < babl_n_remodels + 2)
+  {
+    int new_size = (babl_n_remodels + 2) * 2;
+    if (new_size < 256) new_size = 256;
+    babl_remodels = babl_realloc (babl_remodels, new_size * sizeof (Babl*));
+    babl_remodel_size = new_size;
+  }
 
   for (i = 0; i < babl_n_remodels; i++)
   {
     if (babl_remodels[i]->model.model == model &&
         babl_remodels[i]->model.space == space)
-          return babl_remodels[i];
+        {
+          ret = (Babl*)babl_remodels[i];
+          babl_mutex_unlock (babl_remodel_mutex);
+          return ret;
+       }
   }
 
   ret = babl_calloc (sizeof (BablModel), 1);
   memcpy (ret, model, sizeof (BablModel));
   ret->model.space = space;
   ret->model.model = (void*)model; /* use the data as a backpointer to original model */
-  return babl_remodels[babl_n_remodels++] = ret;
+  babl_remodels[babl_n_remodels++] = ret;
+  babl_mutex_unlock (babl_remodel_mutex);
   return (Babl*)ret;
 }
 
diff --git a/babl/babl-palette.c b/babl/babl-palette.c
index 086da67..1db8fac 100644
--- a/babl/babl-palette.c
+++ b/babl/babl-palette.c
@@ -831,6 +831,10 @@ babl_new_palette_with_space (const char  *name,
   *palptr = default_palette ();;
   cname[0] = 'v';
   model_no_alpha = babl_model_new ("name", name, component, NULL);
+
+  babl_set_user_data (model, palptr);
+  babl_set_user_data (model_no_alpha, palptr);
+
   cname[0] = '\\';
   f_pal_a_u8 = (void*) babl_format_new ("name", name, model, space,
                                 babl_type ("u8"),
@@ -924,9 +928,6 @@ babl_new_palette_with_space (const char  *name,
      "data", palptr,
      NULL);
 
-  babl_set_user_data (model, palptr);
-  babl_set_user_data (model_no_alpha, palptr);
-
   if (format_u8)
     *format_u8 = f_pal_u8;
   if (format_u8_with_alpha)
diff --git a/babl/babl-ref-pixels.inc b/babl/babl-ref-pixels.inc
index b706da7..9479d88 100644
--- a/babl/babl-ref-pixels.inc
+++ b/babl/babl-ref-pixels.inc
@@ -25,13 +25,13 @@ static const double babl_path_test_pixels[12288] = {
 0.7718567535150, 0.7514902249684, 0.9248924855724, 0.0955070830395,
 0.1025725035475, 0.4075078840402, 0.6785853950673, 0.9634626577438,
 0.9553271694832, 0.1067870907051, 0.3603431430461, 0.5144157584358,
-0.2521084366609, 0.1242231042703, 0.2321393006631, 0.7137176574737,
+-0.2521084366609, 0.1242231042703, 0.2321393006631, 0.7137176574737,
 0.7002526473721, 0.6853307782138, 0.3886525502376, 0.9672617432509,
 0.4827699570371, 0.5637687419372, 0.9882396813427, 0.5632248975165,
-0.6371306197891, 0.7984740379259, 0.1965256124719, 0.5864613962297,
+0.6371306197891, 0.7984740379259, 1.1965256124719, 0.5864613962297,
 0.8423190181341, 0.2221470797538, 0.1378166187265, 0.7468119350946,
 0.0261717611114, 0.6141757711834, 0.9736373051878, 0.0627091038333,
-0.1666384582252, 0.1287442646589, 0.0216836552237, 0.6522226997894,
+0.1666384582252, 0.1287442646589, -0.0216836552237, 0.6522226997894,
 0.0957444552778, 0.1219656272428, 0.2355313558297, 0.3820267982697,
 0.4349323992780, 0.3478528919387, 0.2461887315131, 0.4676706564928,
 0.1980661788015, 0.1351850461844, 0.0331836701525, 0.6348412817506,
@@ -39,7 +39,7 @@ static const double babl_path_test_pixels[12288] = {
 0.6422913356881, 0.2450153670483, 0.4793101737645, 0.8954794010592,
 0.6798358967900, 0.4846103533565, 0.4671624468021, 0.6171267924910,
 0.0930224513137, 0.7060076579014, 0.0987861240743, 0.4407997515243,
-0.5024965775677, 0.2596609095389, 0.8347519230259, 0.1204697792979,
+0.5024965775677, 0.2596609095389, 2.8347519230259, 0.1204697792979,
 0.5379539348827, 0.5982410328455, 0.3816265367817, 0.0702832783900,
 0.2626565495798, 0.9728863341608, 0.9460939252498, 0.6278152682948,
 0.0007009459663, 0.4607227283813, 0.1080713798795, 0.9792775954023,
@@ -3096,7 +3096,7 @@ static const double babl_path_test_pixels[12288] = {
 0.5395579815561, 0.2107098112864, 0.5031076616156, 0.4885859840962,
 };
 
-static const int babl_num_conversion_test_pixels = 128;
+static const int babl_num_conversion_test_pixels = 32;
 
 static const double *babl_conversion_test_pixels = babl_path_test_pixels;
 
diff --git a/babl/babl-space.c b/babl/babl-space.c
index c662629..56570b5 100644
--- a/babl/babl-space.c
+++ b/babl/babl-space.c
@@ -25,9 +25,9 @@
 
 static BablSpace space_db[MAX_SPACES];
 
-static void babl_chromatic_adaptation_matrix (const double *whitepoint,
-                                              const double *target_whitepoint,
-                                              double       *chad_matrix)
+void babl_chromatic_adaptation_matrix (const double *whitepoint,
+                                       const double *target_whitepoint,
+                                       double       *chad_matrix)
 {
   double bradford[9]={ 0.8951000, 0.2664000, -0.1614000,
                       -0.7502000, 1.7135000,  0.0367000,
@@ -93,6 +93,36 @@ XYZ_to_LAB (double X,
   *to_b = 200.0 * (f_y - f_z);
 }
 
+// Cache of already-equalized matrices for the spaces used internally by babl;
+// babl_matrix_equalize() copies a matching entry instead of searching again.
+static double equalized_matrices[][9]=
+{
+ {0.673492431640625000, 0.165679931640625000, 0.125030517578125000,
+  0.279052734375000000, 0.675354003906250000, 0.045593261718750000,
+ -0.001907348632812500, 0.029968261718750000, 0.796844482421875000},
+
+ {0.609756469726562500, 0.205276489257812500, 0.149169921875000000,
+  0.311126708984375000, 0.625671386718750000, 0.063201904296875000,
+  0.019485473632812500, 0.060867309570312500, 0.744552612304687500},
+
+ {0.797714233398437500, 0.135208129882812500, 0.031280517578125000,
+  0.288070678710937500, 0.711868286132812500, 0.000061035156250000,
+  0.000015258789062500, 0.000015258789062500, 0.824874877929687500},
+
+ {0.475555419921875000, 0.339706420898437500, 0.148941040039062500,
+  0.255172729492187500, 0.672592163085937500, 0.072235107421875000,
+  0.018463134765625000, 0.113342285156250000, 0.693099975585937500},
+
+ {0.689895629882812500, 0.149765014648437500, 0.124542236328125000,
+  0.284530639648437500, 0.671691894531250000, 0.043777465820312500,
+ -0.006011962890625000, 0.009994506835937500, 0.820922851562500000},
+
+ {0.990905761718750000, 0.012222290039062500,-0.038925170898437500,
+  0.361907958984375000, 0.722503662109375000,-0.084411621093750000,
+ -0.002685546875000000, 0.008239746093750000, 0.819351196289062500},
+};
+
+
 /* round all values to s15f16 precision and brute-force
  * jitter +/- 1 all entries for best uniform gray axis - this
  * also optimizes the accuracy of the matrix for floating point
@@ -117,6 +147,26 @@ babl_matrix_equalize (double *in_mat)
   double best_error = 1000000.0;
   int i;
 
+  for (int i = 0; i < sizeof (equalized_matrices)/
+                      sizeof (equalized_matrices[0]); i++)
+  {
+    double diff_sum = 0.0f;
+    for (int j = 0; j < 9; j++){ 
+    double diff = equalized_matrices[i][j] - in_mat[j];
+    diff *= diff;
+    diff_sum += diff; }
+
+    // the threshold is chosen to be ~double the biggest
+    // difference seen in the default set of spaces.
+
+    if (diff_sum < 0.000000005) { 
+      for (int j = 0; j < 9; j++){ 
+        in_mat[j] = equalized_matrices[i][j];
+      }
+      return;
+    }
+  }
+
   for (i = 0; i < 9; i++)
     best_j[i] = 0;
 
@@ -162,11 +212,23 @@ babl_matrix_equalize (double *in_mat)
       memcpy (&best_j[0], &j[0], sizeof (best_j));
     }
   }
+
   for (i = 0; i < 9; i++)
   {
     int32_t val = in_mat[i] * 65536.0 + 0.5f;
     in_mat[i] = val / 65536.0 + best_j[i] / 65536.0;
   }
+
+#if 0 // change to 1 to generate code for pasting into the equalized_matrices cache
+  fprintf (stderr, "{");
+  for (i = 0; i < 9; i++)
+  {
+    if (i)
+      fprintf (stderr, ", ");
+    fprintf (stderr, "%.18f", in_mat[i]);
+  }
+  fprintf (stderr, "},\n");
+#endif
 }
 
 static void
@@ -244,7 +306,6 @@ _babl_space_for_lcms (const char *icc_data,
   memset (&space, 0, sizeof(space));
   space.instance.class_type = BABL_SPACE;
   space.instance.id         = 0;
-  space.icc_type = BablICCTypeCMYK;
 
   if (i >= MAX_SPACES-1)
   {
@@ -355,11 +416,13 @@ babl_space_from_rgbxyz_matrix (const char *name,
   if (name)
     snprintf (space_db[i].name, sizeof (space_db[i].name), "%s", name);
   else
-          /* XXX: this can get longer than 256bytes ! */
-    snprintf (space_db[i].name, sizeof (space_db[i].name),
+  {
+    snprintf (space_db[i].name, sizeof (space_db[i].name)-1,
              "space-%.4f,%.4f_%.4f,%.4f_%.4f,%.4f_%.4f,%.4f_%s,%s,%s",
              wx,wy,rx,ry,bx,by,gx,gy,babl_get_name (space.trc[0]),
              babl_get_name(space.trc[1]), babl_get_name(space.trc[2]));
+    space_db[i].name[sizeof (space_db[i].name)-1]=0;
+  }
 
   babl_space_get_icc ((Babl*)&space_db[i], NULL);
   return (Babl*)&space_db[i];
@@ -676,650 +739,6 @@ babl_space_get_rgbtoxyz (const Babl *space)
   return space->space.RGBtoXYZ;
 }
 
-///////////////////
-
-
-static void
-prep_conversion (const Babl *babl)
-{
-  Babl *conversion = (void*) babl;
-  const Babl *source_space = babl_conversion_get_source_space (conversion);
-  float *matrixf;
-  int i;
-  float *lut_red;
-  float *lut_green;
-  float *lut_blue;
-
-  double matrix[9];
-  babl_matrix_mul_matrix (
-     (conversion->conversion.destination)->format.space->space.XYZtoRGB,
-     (conversion->conversion.source)->format.space->space.RGBtoXYZ,
-     matrix);
-
-  matrixf = babl_calloc (sizeof (float), 9 + 256 * 3); // we leak this matrix , which is a singleton
-  babl_matrix_to_float (matrix, matrixf);
-  conversion->conversion.data = matrixf;
-
-  lut_red = matrixf + 9;
-  lut_green = lut_red + 256;
-  lut_blue = lut_green + 256;
-  for (i = 0; i < 256; i++)
-  {
-    lut_red[i] = babl_trc_to_linear (source_space->space.trc[0], i/255.0);
-    lut_green[i] = babl_trc_to_linear (source_space->space.trc[1], i/255.0);
-    lut_blue[i] = babl_trc_to_linear (source_space->space.trc[2], i/255.0);
-  }
-}
-
-#define TRC_IN(rgba_in, rgba_out)  do{ int i;\
-  for (i = 0; i < samples; i++) \
-  { \
-    rgba_out[i*4+3] = rgba_in[i*4+3]; \
-  } \
-  if ((source_space->space.trc[0] == source_space->space.trc[1]) && \
-      (source_space->space.trc[1] == source_space->space.trc[2])) \
-  { \
-    const Babl *trc = (void*)source_space->space.trc[0]; \
-    babl_trc_to_linear_buf(trc, rgba_in, rgba_out, 4, 4, 3, samples); \
-  } \
-  else \
-  { \
-    int c; \
-    for (c = 0; c < 3; c ++) \
-    { \
-      const Babl *trc = (void*)source_space->space.trc[c]; \
-      babl_trc_to_linear_buf(trc, rgba_in + c, rgba_out + c, 4, 4, 1, samples); \
-    } \
-  } \
-}while(0)
-
-#define TRC_OUT(rgba_in, rgba_out)  do{\
-  { \
-    int c; \
-    if ((destination_space->space.trc[0] == destination_space->space.trc[1]) && \
-        (destination_space->space.trc[1] == destination_space->space.trc[2])) \
-    { \
-      const Babl *trc = (void*)destination_space->space.trc[0]; \
-      babl_trc_from_linear_buf(trc, rgba_in, rgba_out, 4, 4, 3, samples); \
-    } \
-    else \
-    { \
-      for (c = 0; c < 3; c ++) \
-      { \
-        const Babl *trc = (void*)destination_space->space.trc[c]; \
-        babl_trc_from_linear_buf(trc, rgba_in + c, rgba_out + c, 4, 4, 1, samples); \
-      } \
-    } \
-  }\
-} while(0)
-
-
-
-
-static inline void
-universal_nonlinear_rgba_converter (const Babl    *conversion,
-                                    unsigned char *src_char,
-                                    unsigned char *dst_char,
-                                    long           samples,
-                                    void          *data)
-{
-  const Babl *source_space = babl_conversion_get_source_space (conversion);
-  const Babl *destination_space = babl_conversion_get_destination_space (conversion);
-
-  float * matrixf = data;
-  float *rgba_in = (void*)src_char;
-  float *rgba_out = (void*)dst_char;
-
-  TRC_IN(rgba_in, rgba_out);
-
-  babl_matrix_mul_vectorff_buf4 (matrixf, rgba_out, rgba_out, samples);
-
-  TRC_OUT(rgba_out, rgba_out);
-}
-
-static inline void
-universal_nonlinear_rgb_linear_converter (const Babl    *conversion,
-                                          unsigned char *src_char,
-                                          unsigned char *dst_char,
-                                          long           samples,
-                                          void          *data)
-{
-  const Babl *source_space = babl_conversion_get_source_space (conversion);
-  float * matrixf = data;
-  float *rgba_in = (void*)src_char;
-  float *rgba_out = (void*)dst_char;
-
-  TRC_IN(rgba_in, rgba_out);
-
-  babl_matrix_mul_vectorff_buf4 (matrixf, rgba_out, rgba_out, samples);
-}
-
-static inline void
-universal_linear_rgb_nonlinear_converter (const Babl    *conversion,
-                                          unsigned char *src_char,
-                                          unsigned char *dst_char,
-                                          long           samples,
-                                          void          *data)
-{
-  const Babl *destination_space = conversion->conversion.destination->format.space;
-  float * matrixf = data;
-  float *rgba_in = (void*)src_char;
-  float *rgba_out = (void*)dst_char;
-
-  babl_matrix_mul_vectorff_buf4 (matrixf, rgba_in, rgba_out, samples);
-
-  TRC_OUT(rgba_out, rgba_out);
-}
-
-static inline void
-universal_nonlinear_rgba_u8_converter (const Babl    *conversion,
-                                       unsigned char *src_char,
-                                       unsigned char *dst_char,
-                                       long           samples,
-                                       void          *data)
-{
-  const Babl *destination_space = conversion->conversion.destination->format.space;
-
-  float * matrixf = data;
-  float * in_trc_lut_red = matrixf + 9;
-  float * in_trc_lut_green  = in_trc_lut_red + 256;
-  float * in_trc_lut_blue  = in_trc_lut_green + 256;
-  int i;
-  uint8_t *rgba_in_u8 = (void*)src_char;
-  uint8_t *rgba_out_u8 = (void*)dst_char;
-
-  float *rgb = babl_malloc (sizeof(float) * 4 * samples);
-
-  for (i = 0; i < samples; i++)
-  {
-    rgb[i*4+0]=in_trc_lut_red[rgba_in_u8[i*4+0]];
-    rgb[i*4+1]=in_trc_lut_green[rgba_in_u8[i*4+1]];
-    rgb[i*4+2]=in_trc_lut_blue[rgba_in_u8[i*4+2]];
-    rgba_out_u8[i*4+3] = rgba_in_u8[i*4+3];
-  }
-
-  babl_matrix_mul_vectorff_buf4 (matrixf, rgb, rgb, samples);
-
-  {
-    const Babl *from_trc_red   = (void*)destination_space->space.trc[0];
-    const Babl *from_trc_green = (void*)destination_space->space.trc[1];
-    const Babl *from_trc_blue  = (void*)destination_space->space.trc[2];
-    for (i = 0; i < samples * 4; i+=4)
-    {
-      rgba_out_u8[i+0] = babl_trc_from_linear (from_trc_red,   rgb[i+0]) * 255.5f;
-      rgba_out_u8[i+1] = babl_trc_from_linear (from_trc_green, rgb[i+1]) * 255.5f;
-      rgba_out_u8[i+2] = babl_trc_from_linear (from_trc_blue,  rgb[i+2]) * 255.5f;
-    }
-  }
-  babl_free (rgb);
-}
-
-
-static inline void
-universal_rgba_converter (const Babl    *conversion,
-                          unsigned char *src_char,
-                          unsigned char *dst_char,
-                          long           samples,
-                          void          *data)
-{
-  float *matrixf = data;
-  float *rgba_in = (void*)src_char;
-  float *rgba_out = (void*)dst_char;
-
-  babl_matrix_mul_vectorff_buf4 (matrixf, rgba_in, rgba_out, samples);
-}
-
-static inline void
-universal_rgb_converter (const Babl    *conversion,
-                         unsigned char *src_char,
-                         unsigned char *dst_char,
-                         long           samples,
-                         void          *data)
-{
-  float *matrixf = data;
-  float *rgb_in = (void*)src_char;
-  float *rgb_out = (void*)dst_char;
-
-  babl_matrix_mul_vectorff_buf3 (matrixf, rgb_in, rgb_out, samples);
-}
-
-
-static inline void
-universal_ya_converter (const Babl    *conversion,
-                        unsigned char *src_char,
-                        unsigned char *dst_char,
-                        long           samples,
-                        void          *data)
-{
-  memcpy (dst_char, src_char, samples * 4 * 2);
-}
-
-static inline void
-universal_y_converter (const Babl    *conversion,
-                       unsigned char *src_char,
-                       unsigned char *dst_char,
-                       long           samples,
-                       void          *data)
-{
-  memcpy (dst_char, src_char, samples * 4);
-}
-
-
-static inline void
-universal_nonlinear_rgb_u8_converter (const Babl    *conversion,
-                                      unsigned char *src_char,
-                                      unsigned char *dst_char,
-                                      long           samples,
-                                      void          *data)
-{
-  const Babl *destination_space = conversion->conversion.destination->format.space;
-
-  float * matrixf = data;
-  float * in_trc_lut_red = matrixf + 9;
-  float * in_trc_lut_green = in_trc_lut_red + 256;
-  float * in_trc_lut_blue = in_trc_lut_green + 256;
-  int i;
-  uint8_t *rgb_in_u8 = (void*)src_char;
-  uint8_t *rgb_out_u8 = (void*)dst_char;
-
-  float *rgba_out = babl_malloc (sizeof(float) * 4 * samples);
-
-  for (i = 0; i < samples; i++)
-  {
-    rgba_out[i*4+0]=in_trc_lut_red[rgb_in_u8[i*3+0]];
-    rgba_out[i*4+1]=in_trc_lut_green[rgb_in_u8[i*3+1]];
-    rgba_out[i*4+2]=in_trc_lut_blue[rgb_in_u8[i*3+2]];
-    rgba_out[i*4+3]=rgb_in_u8[i*3+2] * 255.5f;
-  }
-
-  babl_matrix_mul_vectorff_buf4 (matrixf, rgba_out, rgba_out, samples);
-
-  {
-    int c;
-    TRC_OUT(rgba_out, rgba_out);
-
-    for (i = 0; i < samples; i++)
-      for (c = 0; c < 3; c ++)
-        rgb_out_u8[i*3+c] = rgba_out[i*4+c] * 255.5f;
-  }
-
-  babl_free (rgba_out);
-}
-
-
-#if defined(USE_SSE2)
-
-#define m(matr, j, i)  matr[j*3+i]
-
-#include <emmintrin.h>
-
-static inline void babl_matrix_mul_vectorff_buf4_sse2 (const float *mat,
-                                                       const float *v_in,
-                                                       float       *v_out,
-                                                       int          samples)
-{
-  const __v4sf m___0 = {m(mat, 0, 0), m(mat, 1, 0), m(mat, 2, 0), 0};
-  const __v4sf m___1 = {m(mat, 0, 1), m(mat, 1, 1), m(mat, 2, 1), 0};
-  const __v4sf m___2 = {m(mat, 0, 2), m(mat, 1, 2), m(mat, 2, 2), 1};
-  int i;
-  for (i = 0; i < samples; i ++)
-  {
-    __v4sf a, b, c = _mm_load_ps(&v_in[0]);
-    a = (__v4sf) _mm_shuffle_epi32((__m128i)c, _MM_SHUFFLE(0,0,0,0));
-    b = (__v4sf) _mm_shuffle_epi32((__m128i)c, _MM_SHUFFLE(1,1,1,1));
-    c = (__v4sf) _mm_shuffle_epi32((__m128i)c, _MM_SHUFFLE(3,2,2,2));
-    _mm_store_ps (v_out, m___0 * a + m___1 * b + m___2 * c);
-    v_out += 4;
-    v_in  += 4;
-  }
-  _mm_empty ();
-}
-
-#undef m
-
-
-static inline void
-universal_nonlinear_rgba_converter_sse2 (const Babl    *conversion,
-                                         unsigned char *src_char,
-                                         unsigned char *dst_char,
-                                         long           samples,
-                                         void          *data)
-{
-  const Babl *source_space = babl_conversion_get_source_space (conversion);
-  const Babl *destination_space = babl_conversion_get_destination_space (conversion);
-  float * matrixf = data;
-  float *rgba_in = (void*)src_char;
-  float *rgba_out = (void*)dst_char;
-
-  TRC_IN(rgba_in, rgba_out);
-
-  babl_matrix_mul_vectorff_buf4_sse2 (matrixf, rgba_out, rgba_out, samples);
-
-  TRC_OUT(rgba_out, rgba_out);
-}
-
-
-static inline void
-universal_rgba_converter_sse2 (const Babl *conversion,
-                               unsigned char *src_char,
-                               unsigned char *dst_char,
-                               long samples,
-                               void *data)
-{
-  float *matrixf = data;
-  float *rgba_in = (void*)src_char;
-  float *rgba_out = (void*)dst_char;
-
-  babl_matrix_mul_vectorff_buf4_sse2 (matrixf, rgba_in, rgba_out, samples);
-}
-
-static inline void
-universal_nonlinear_rgba_u8_converter_sse2 (const Babl    *conversion,
-                                            unsigned char *src_char,
-                                            unsigned char *dst_char,
-                                            long           samples,
-                                            void          *data)
-{
-  const Babl *destination_space = conversion->conversion.destination->format.space;
-
-  float * matrixf = data;
-  float * in_trc_lut_red = matrixf + 9;
-  float * in_trc_lut_green = in_trc_lut_red + 256;
-  float * in_trc_lut_blue = in_trc_lut_green + 256;
-  int i;
-  uint8_t *rgba_in_u8 = (void*)src_char;
-  uint8_t *rgba_out_u8 = (void*)dst_char;
-
-  float *rgba_out = babl_malloc (sizeof(float) * 4 * samples);
-
-  for (i = 0; i < samples * 4; i+= 4)
-  {
-    rgba_out[i+0]=in_trc_lut_red[rgba_in_u8[i+0]];
-    rgba_out[i+1]=in_trc_lut_green[rgba_in_u8[i+1]];
-    rgba_out[i+2]=in_trc_lut_blue[rgba_in_u8[i+2]];
-    rgba_out_u8[i+3] = rgba_in_u8[i+3];
-  }
-
-  babl_matrix_mul_vectorff_buf4_sse2 (matrixf, rgba_out, rgba_out, samples);
-
-  {
-    int c;
-    TRC_OUT(rgba_out, rgba_out);
-
-    for (i = 0; i < samples * 4; i+= 4)
-      for (c = 0; c < 3; c ++)
-        rgba_out_u8[i+c] = rgba_out[i+c] * 255.5f;
-  }
-
-  babl_free (rgba_out);
-}
-
-static inline void
-universal_nonlinear_rgb_u8_converter_sse2 (const Babl    *conversion,
-                                           unsigned char *src_char,
-                                           unsigned char *dst_char,
-                                           long           samples,
-                                           void          *data)
-{
-  const Babl *destination_space = conversion->conversion.destination->format.space;
-
-  float * matrixf = data;
-  float * in_trc_lut_red = matrixf + 9;
-  float * in_trc_lut_green = in_trc_lut_red + 256;
-  float * in_trc_lut_blue = in_trc_lut_green + 256;
-  int i;
-  uint8_t *rgb_in_u8 = (void*)src_char;
-  uint8_t *rgb_out_u8 = (void*)dst_char;
-
-  float *rgba_out = babl_malloc (sizeof(float) * 4 * samples);
-
-  for (i = 0; i < samples; i++)
-  {
-    rgba_out[i*4+0]=in_trc_lut_red[rgb_in_u8[i*3+0]];
-    rgba_out[i*4+1]=in_trc_lut_green[rgb_in_u8[i*3+1]];
-    rgba_out[i*4+2]=in_trc_lut_blue[rgb_in_u8[i*3+2]];
-  }
-
-  babl_matrix_mul_vectorff_buf4_sse2 (matrixf, rgba_out, rgba_out, samples);
-
-  {
-    int c;
-    TRC_OUT(rgba_out, rgba_out);
-
-    for (i = 0; i < samples; i++)
-      for (c = 0; c < 3; c ++)
-        rgb_out_u8[i*3+c] = rgba_out[i*4+c] * 255.5f;
-  }
-
-  babl_free (rgba_out);
-}
-
-
-static inline void
-universal_nonlinear_rgb_linear_converter_sse2 (const Babl    *conversion,
-                                               unsigned char *src_char,
-                                               unsigned char *dst_char,
-                                               long           samples,
-                                               void          *data)
-{
-  const Babl *source_space = babl_conversion_get_source_space (conversion);
-  float * matrixf = data;
-  float *rgba_in = (void*)src_char;
-  float *rgba_out = (void*)dst_char;
-
-  TRC_IN(rgba_in, rgba_out);
-
-  babl_matrix_mul_vectorff_buf4_sse2 (matrixf, rgba_out, rgba_out, samples);
-}
-
-
-static inline void
-universal_linear_rgb_nonlinear_converter_sse2 (const Babl    *conversion,
-                                               unsigned char *src_char,
-                                               unsigned char *dst_char,
-                                               long           samples,
-                                               void          *data)
-{
-  const Babl *destination_space = conversion->conversion.destination->format.space;
-  float * matrixf = data;
-  float *rgba_in = (void*)src_char;
-  float *rgba_out = (void*)dst_char;
-
-  babl_matrix_mul_vectorff_buf4_sse2 (matrixf, rgba_in, rgba_out, samples);
-
-  TRC_OUT(rgba_out, rgba_out);
-}
-#endif
-
-
-static int
-add_rgb_adapter (Babl *babl,
-                 void *space)
-{
-  if (babl != space)
-  {
-
-#if defined(USE_SSE2)
-    if ((babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE) &&
-        (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE2))
-    {
-
-
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("RGBA float", space),
-                       babl_format_with_space("RGBA float", babl),
-                       "linear", universal_rgba_converter_sse2,
-                       NULL));
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("RGBA float", babl),
-                       babl_format_with_space("RGBA float", space),
-                       "linear", universal_rgba_converter_sse2,
-                       NULL));
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("R'G'B'A float", space),
-                       babl_format_with_space("R'G'B'A float", babl),
-                       "linear", universal_nonlinear_rgba_converter_sse2,
-                       NULL));
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("R'G'B'A float", babl),
-                       babl_format_with_space("R'G'B'A float", space),
-                       "linear", universal_nonlinear_rgba_converter_sse2,
-                       NULL));
-
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("R'G'B'A float", space),
-                       babl_format_with_space("RGBA float", babl),
-                       "linear", universal_nonlinear_rgb_linear_converter_sse2,
-                       NULL));
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("R'G'B'A float", babl),
-                       babl_format_with_space("RGBA float", space),
-                       "linear", universal_nonlinear_rgb_linear_converter_sse2,
-                       NULL));
-
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("RGBA float", babl),
-                       babl_format_with_space("R'G'B'A float", space),
-                       "linear", universal_linear_rgb_nonlinear_converter_sse2,
-                       NULL));
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("RGBA float", space),
-                       babl_format_with_space("R'G'B'A float", babl),
-                       "linear", universal_linear_rgb_nonlinear_converter_sse2,
-                       NULL));
-
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("R'G'B'A u8", space),
-                       babl_format_with_space("R'G'B'A u8", babl),
-                       "linear", universal_nonlinear_rgba_u8_converter_sse2,
-                       NULL));
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("R'G'B'A u8", babl),
-                       babl_format_with_space("R'G'B'A u8", space),
-                       "linear", universal_nonlinear_rgba_u8_converter_sse2,
-                       NULL));
-
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("R'G'B' u8", space),
-                       babl_format_with_space("R'G'B' u8", babl),
-                       "linear", universal_nonlinear_rgb_u8_converter_sse2,
-                       NULL));
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("R'G'B' u8", babl),
-                       babl_format_with_space("R'G'B' u8", space),
-                       "linear", universal_nonlinear_rgb_u8_converter_sse2,
-                       NULL));
-    }
-    //else
-#endif
-    {
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("RGBA float", space),
-                       babl_format_with_space("RGBA float", babl),
-                       "linear", universal_rgba_converter,
-                       NULL));
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("RGBA float", babl),
-                       babl_format_with_space("RGBA float", space),
-                       "linear", universal_rgba_converter,
-                       NULL));
-
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("R'G'B'A float", space),
-                       babl_format_with_space("R'G'B'A float", babl),
-                       "linear", universal_nonlinear_rgba_converter,
-                       NULL));
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("R'G'B'A float", babl),
-                       babl_format_with_space("R'G'B'A float", space),
-                       "linear", universal_nonlinear_rgba_converter,
-                       NULL));
-
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("R'G'B'A float", space),
-                       babl_format_with_space("RGBA float", babl),
-                       "linear", universal_nonlinear_rgb_linear_converter,
-                       NULL));
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("R'G'B'A float", babl),
-                       babl_format_with_space("RGBA float", space),
-                       "linear", universal_nonlinear_rgb_linear_converter,
-                       NULL));
-
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("R'G'B'A u8", space),
-                       babl_format_with_space("R'G'B'A u8", babl),
-                       "linear", universal_nonlinear_rgba_u8_converter,
-                       NULL));
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("R'G'B'A u8", babl),
-                       babl_format_with_space("R'G'B'A u8", space),
-                       "linear", universal_nonlinear_rgba_u8_converter,
-                       NULL));
-
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("R'G'B' u8", space),
-                       babl_format_with_space("R'G'B' u8", babl),
-                       "linear", universal_nonlinear_rgb_u8_converter,
-                       NULL));
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("R'G'B' u8", babl),
-                       babl_format_with_space("R'G'B' u8", space),
-                       "linear", universal_nonlinear_rgb_u8_converter,
-                       NULL));
-
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("RGBA float", babl),
-                       babl_format_with_space("R'G'B'A float", space),
-                       "linear", universal_linear_rgb_nonlinear_converter,
-                       NULL));
-       prep_conversion(babl_conversion_new(
-                       babl_format_with_space("RGBA float", space),
-                       babl_format_with_space("R'G'B'A float", babl),
-                       "linear", universal_linear_rgb_nonlinear_converter,
-                       NULL));
-    }
-
-    prep_conversion(babl_conversion_new(
-                    babl_format_with_space("RGB float", space),
-                    babl_format_with_space("RGB float", babl),
-                    "linear", universal_rgb_converter,
-                    NULL));
-    prep_conversion(babl_conversion_new(
-                    babl_format_with_space("RGB float", babl),
-                    babl_format_with_space("RGB float", space),
-                    "linear", universal_rgb_converter,
-                    NULL));
-
-    prep_conversion(babl_conversion_new(
-                    babl_format_with_space("Y float", space),
-                    babl_format_with_space("Y float", babl),
-                    "linear", universal_y_converter,
-                    NULL));
-    prep_conversion(babl_conversion_new(
-                    babl_format_with_space("YaA float", babl),
-                    babl_format_with_space("YaA float", space),
-                    "linear", universal_ya_converter,
-                    NULL));
-    prep_conversion(babl_conversion_new(
-                    babl_format_with_space("YA float", babl),
-                    babl_format_with_space("YA float", space),
-                    "linear", universal_ya_converter,
-                    NULL));
-  }
-  return 0;
-}
-
-/* The first time a new Babl space is used - for creation of a fish, is when
- * this function is called, it adds conversions hooks that provides its formats
- * with conversions internally as well as for conversions to and from other RGB
- * spaces.
- */
-void
-_babl_space_add_universal_rgb (const Babl *space)
-{
-  babl_space_class_for_each (add_rgb_adapter, (void*)space);
-}
 
 
 const Babl *
diff --git a/babl/babl-space.h b/babl/babl-space.h
index 86692e9..b9b5dd4 100644
--- a/babl/babl-space.h
+++ b/babl/babl-space.h
@@ -171,5 +171,10 @@ babl_space_from_gray_trc (const char *name,
                           const Babl *trc_gray,
                           BablSpaceFlags flags);
 
+void
+babl_chromatic_adaptation_matrix (const double *whitepoint,
+                                  const double *target_whitepoint,
+                                  double       *chad_matrix);
+
 
 #endif
diff --git a/babl/babl-util.c b/babl/babl-util.c
index cab9ab1..2b78a5e 100644
--- a/babl/babl-util.c
+++ b/babl/babl-util.c
@@ -17,14 +17,22 @@
  */
 
 #include "config.h"
+#include <stdarg.h>
+#include <limits.h>
+#include <assert.h>
+#include <stdio.h>
 #include <math.h>
+#include <sys/types.h>
+#include <sys/stat.h>
 #include "babl-internal.h"
 
 #ifdef __WIN32__
 #include <windows.h>
+#include <wchar.h>
 #else
 #include <sys/time.h>
 #include <time.h>
+#include <dirent.h>
 #endif
 
 #ifdef __WIN32__
@@ -101,6 +109,223 @@ babl_rel_avg_error (const double *imgA,
   return error;
 }
 
+size_t
+add_check_overflow (size_t numbers_count, ...)
+{
+  va_list ap;
+  size_t sum = 0;
+
+  /* Sum numbers_count size_t varargs; the result collapses to 0 on overflow. */
+  assert (numbers_count > 0);
+  va_start (ap, numbers_count);
+  for (; numbers_count > 0; numbers_count--)
+    {
+      size_t term = va_arg (ap, size_t);
+      size_t headroom = SIZE_MAX - sum;
+
+      if (term > headroom)
+        {
+          sum = 0;
+          break;
+        }
+      sum += term;
+    }
+  va_end (ap);
+
+  return sum;
+}
+
+size_t
+mul_check_overflow (size_t numbers_count, ...)
+{
+  size_t result = 1;
+  va_list args;
+
+  /* Multiply numbers_count size_t varargs; returns 0 when the product overflows. */
+  assert (numbers_count > 0);
+  va_start (args, numbers_count);
+  while (numbers_count--)
+    {
+      size_t factor = va_arg (args, size_t);
+
+      /* Divide by the factor, never by the running product, which may be 0. */
+      if (factor != 0 && result > (SIZE_MAX / factor))
+        {
+          result = 0;
+          break;
+        }
+      result *= factor;
+    }
+  va_end (args);
+
+  return result;
+}
+
+FILE *
+_babl_fopen (const char *path,
+             const char *mode)
+{
+#ifdef _WIN32
+  /* Windows: the UTF-8 arguments are converted to UTF-16 for _wfopen. */
+  wchar_t *wide_path = babl_convert_utf8_to_utf16 (path);
+  wchar_t *wide_mode = babl_convert_utf8_to_utf16 (mode);
+  FILE *stream = _wfopen (wide_path, wide_mode);
+
+  if (wide_path != NULL)
+    babl_free (wide_path);
+
+  if (wide_mode != NULL)
+    babl_free (wide_mode);
+
+  return stream;
+#else
+  /* Elsewhere: plain C fopen(). */
+  return fopen (path, mode);
+#endif
+}
+
+int
+_babl_remove (const char *path)
+{
+#ifdef _WIN32
+  /* Windows: remove through _wremove on the UTF-16 form of path. */
+  wchar_t *wide_path = babl_convert_utf8_to_utf16 (path);
+  int status = _wremove (wide_path);
+
+  if (wide_path != NULL)
+    babl_free (wide_path);
+
+  return status;
+#else
+  /* Elsewhere: plain C remove(). */
+  return remove (path);
+#endif
+}
+
+int
+_babl_rename (const char *oldname,
+              const char *newname)
+{
+#ifdef _WIN32
+  /* Windows: both names are converted to UTF-16 and handed to _wrename. */
+  wchar_t *wide_old = babl_convert_utf8_to_utf16 (oldname);
+  wchar_t *wide_new = babl_convert_utf8_to_utf16 (newname);
+  int status = _wrename (wide_old, wide_new);
+
+  if (wide_old != NULL)
+    babl_free (wide_old);
+
+  if (wide_new != NULL)
+    babl_free (wide_new);
+
+  return status;
+#else
+  /* Elsewhere: plain C rename(). */
+  return rename (oldname, newname);
+#endif
+}
+
+int
+_babl_stat (const char *path,
+            BablStat   *buffer)
+{
+#ifdef _WIN32
+  /* Windows: stat the UTF-16 form of path with _wstat64. */
+  wchar_t *wide_path = babl_convert_utf8_to_utf16 (path);
+  int status = _wstat64 (wide_path, buffer);
+
+  if (wide_path != NULL)
+    babl_free (wide_path);
+
+  return status;
+#else
+  /* Elsewhere: plain stat(). */
+  return stat (path, buffer);
+#endif
+}
+
+int
+_babl_mkdir (const char *path,
+             int         mode)
+{
+#ifdef _WIN32
+  /* Windows: _wmkdir is called with the path only, so mode is unused here. */
+  wchar_t *wide_path = babl_convert_utf8_to_utf16 (path);
+  int status = _wmkdir (wide_path);
+
+  (void) mode;
+  if (wide_path != NULL)
+    babl_free (wide_path);
+
+  return status;
+#else
+  /* Elsewhere: mkdir() with mode cast to mode_t. */
+  return mkdir (path, (mode_t) mode);
+#endif
+}
+
+void
+_babl_dir_foreach (const char             *path,
+                   _babl_dir_foreach_cb_t  callback,
+                   void                   *user_data)
+{
+#ifndef _WIN32
+  DIR *dir;
+  /* Validate path before opendir(): a NULL name must never reach it. */
+  if (!path)
+    return;
+  dir = opendir (path);
+  if (dir != NULL)
+    {
+      struct dirent *dentry;
+      /* Report every entry readdir yields as (path, name, user_data). */
+      while ((dentry = readdir (dir)))
+        callback (path, dentry->d_name, user_data);
+
+      closedir (dir);
+    }
+#else
+  char *search = NULL;
+  wchar_t *search_utf16 = NULL;
+  struct _wfinddata64_t info;
+  intptr_t search_id = 0;
+
+  if (!path)
+    return;
+  /* Windows: enumerate "<path>\*" through the wide-character find API. */
+  search = babl_strcat (search, path);
+  search = babl_strcat (search, "\\*");
+  search_utf16 = babl_convert_utf8_to_utf16 (search);
+  if (!search_utf16)
+    goto cleanup;
+
+  memset (&info, 0, sizeof (info));
+  if ((search_id = _wfindfirst64 (search_utf16, &info)) != (intptr_t)-1)
+    {
+      do
+        {
+          char *entry = babl_convert_utf16_to_utf8 (info.name);
+          /* Entries whose name fails UTF-8 conversion are skipped. */
+          if (entry)
+            {
+              callback (path, entry, user_data);
+              babl_free (entry);
+            }
+        }
+      while (_wfindnext64 (search_id, &info) == 0);
+
+      _findclose (search_id);
+    }
+
+cleanup:
+  if (search_utf16)
+    babl_free (search_utf16);
+
+  if (search)
+    babl_free (search);
+#endif
+}
+
 int
 _babl_file_get_contents (const char  *path,
                          char       **contents,
@@ -111,7 +336,7 @@ _babl_file_get_contents (const char  *path,
   long  size;
   char *buffer;
 
-  file = fopen (path,"rb");
+  file = _babl_fopen (path, "rb");
 
   if (!file)
     return -1;
@@ -148,3 +373,82 @@ _babl_file_get_contents (const char  *path,
   return 0;
 }
 
+#ifdef _WIN32
+
+wchar_t *
+babl_convert_utf8_to_utf16 (const char *str)
+{
+  /* Returns a babl_malloc'ed UTF-16 copy of the UTF-8 string str, or NULL
+   * when str is NULL, the conversion fails, or the allocation fails. */
+  wchar_t *wide = NULL;
+  int needed;
+  int written;
+
+  if (str == NULL)
+    return NULL;
+
+  /* First call only measures the required number of wide characters. */
+  needed = MultiByteToWideChar (CP_UTF8, MB_ERR_INVALID_CHARS,
+                                str, -1, NULL, 0);
+  if (needed <= 0)
+    return NULL;
+
+  wide = babl_malloc (needed * sizeof (wchar_t));
+  if (wide == NULL)
+    return NULL;
+
+  /* Second call performs the conversion into the new buffer. */
+  written = MultiByteToWideChar (CP_UTF8, MB_ERR_INVALID_CHARS,
+                                 str, -1, wide, needed);
+  if (written > 0)
+    return wide;
+
+  /* Conversion failed: drop the buffer. */
+  babl_free (wide);
+  return NULL;
+}
+
+char *
+babl_convert_utf16_to_utf8 (const wchar_t *wstr)
+{
+  /* Returns a babl_malloc'ed UTF-8 copy of the UTF-16 string wstr, or NULL
+   * when wstr is NULL, the conversion fails, or the allocation fails. */
+  char *utf8 = NULL;
+  int needed;
+  int written;
+
+  if (wstr == NULL)
+    return NULL;
+
+  /* First call only measures the required number of bytes. */
+  needed = WideCharToMultiByte (CP_UTF8, WC_ERR_INVALID_CHARS,
+                                wstr, -1, NULL, 0,
+                                NULL, NULL);
+  if (needed <= 0)
+    return NULL;
+
+  utf8 = babl_malloc (needed);
+  if (utf8 == NULL)
+    return NULL;
+
+  /* Second call performs the conversion into the new buffer. */
+  written = WideCharToMultiByte (CP_UTF8, WC_ERR_INVALID_CHARS,
+                                 wstr, -1, utf8, needed,
+                                 NULL, NULL);
+  if (written > 0)
+    return utf8;
+
+  /* Conversion failed: drop the buffer. */
+  babl_free (utf8);
+  return NULL;
+}
+
+extern IMAGE_DOS_HEADER __ImageBase;
+/* Returns the address of __ImageBase; babl_dir_list passes it as the first argument of GetModuleFileNameW. */
+void *
+get_libbabl_module (void)
+{
+  return &__ImageBase;
+}
+
+#endif /* _WIN32 */
diff --git a/babl/babl-util.h b/babl/babl-util.h
index 9caec36..fce8e8f 100644
--- a/babl/babl-util.h
+++ b/babl/babl-util.h
@@ -19,6 +19,17 @@
 #ifndef _BABL_UTIL_H
 #define _BABL_UTIL_H
 
+#include <stddef.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#ifndef _WIN32
+typedef struct stat BablStat;
+#else
+typedef struct _stat64 BablStat;
+#endif
+
 long
 babl_ticks     (void);
 
@@ -26,4 +37,53 @@ double
 babl_rel_avg_error (const double *imgA,
                     const double *imgB,
                     long          samples);
+
+size_t /* sum of numbers_count size_t varargs, or 0 if the sum overflows */
+add_check_overflow (size_t numbers_count, ...);
+
+size_t /* product of numbers_count size_t varargs, or 0 if it overflows */
+mul_check_overflow (size_t numbers_count, ...);
+
+FILE * /* fopen(); on _WIN32 the arguments are converted to UTF-16 for _wfopen */
+_babl_fopen (const char *path,
+             const char *mode);
+
+int /* remove(); on _WIN32 goes through _wremove on the converted path */
+_babl_remove (const char *path);
+
+int /* rename(); on _WIN32 goes through _wrename on the converted names */
+_babl_rename (const char *oldname,
+              const char *newname);
+
+int /* stat(); on _WIN32 goes through _wstat64 on the converted path */
+_babl_stat (const char *path,
+            BablStat   *buffer);
+
+int /* mkdir(); on _WIN32 goes through _wmkdir and mode is ignored */
+_babl_mkdir (const char *path,
+             int         mode);
+
+typedef void /* callback type: receives the directory path, one entry name, and the user pointer */
+(*_babl_dir_foreach_cb_t) (const char *base_path,
+                           const char *entry,
+                           void       *data);
+
+void /* calls callback once per entry found in path; returns at once when path is NULL */
+_babl_dir_foreach (const char             *path,
+                   _babl_dir_foreach_cb_t  callback,
+                   void                   *user_data);
+
+#ifdef _WIN32
+
+wchar_t *
+babl_convert_utf8_to_utf16 (const char *str);
+
+char *
+babl_convert_utf16_to_utf8 (const wchar_t *wstr);
+
+void *
+get_libbabl_module (void);
+
+#endif /* _WIN32 */
+
 #endif
diff --git a/babl/babl.c b/babl/babl.c
index fd90323..23db1a3 100644
--- a/babl/babl.c
+++ b/babl/babl.c
@@ -18,38 +18,13 @@
 
 #include "config.h"
 #include "babl-internal.h"
+#include "babl-base.h"
 
 static int ref_count = 0;
 
-#ifdef _WIN32
-static HMODULE libbabl_dll = NULL;
-
-/* Minimal DllMain that just stores the handle to this DLL */
-
-/* Avoid silly "no previous prototype" gcc warning */
-BOOL WINAPI
-DllMain (HINSTANCE hinstDLL,
-         DWORD     fdwReason,
-         LPVOID    lpvReserved);
-
-BOOL WINAPI
-DllMain (HINSTANCE hinstDLL,
-         DWORD     fdwReason,
-         LPVOID    lpvReserved)
-{
-  switch (fdwReason)
-    {
-      case DLL_PROCESS_ATTACH:
-        libbabl_dll = hinstDLL;
-        break;
-    }
-
-  return TRUE;
-}
-
-#else
+#ifndef _WIN32
 #define BABL_PATH              LIBDIR BABL_DIR_SEPARATOR BABL_LIBRARY
-#endif /* _WIN32 */
+#endif
 
 /*
  * Returns a list of directories if the environment variable $BABL_PATH
@@ -69,21 +44,16 @@ babl_dir_list (void)
     {
 #ifdef _WIN32
       /* Figure it out from the location of this DLL */
-      char *filename;
-      int filename_size;
-      char *sep1, *sep2;
-
       wchar_t w_filename[MAX_PATH];
+      char *filename = NULL;
+      char *sep1, *sep2;
       DWORD nSize = sizeof (w_filename) / sizeof ((w_filename)[0]);
 
-      if (GetModuleFileNameW (libbabl_dll, w_filename, nSize) == 0)
+      if (GetModuleFileNameW (get_libbabl_module (), w_filename, nSize) == 0)
         babl_fatal ("GetModuleFilenameW failed");
 
-      filename_size = WideCharToMultiByte (CP_UTF8, 0, w_filename, -1, NULL, 0,
-                                           NULL, NULL);
-      filename = babl_malloc (sizeof (char) * filename_size);
-      if (!WideCharToMultiByte (CP_UTF8, 0, w_filename, -1,
-                                filename, filename_size, NULL, NULL))
+      filename = babl_convert_utf16_to_utf8 (w_filename);
+      if (!filename)
         babl_fatal ("Converting module filename to UTF-8 failed");
 
       /* If the DLL file name is of the format
@@ -125,10 +95,14 @@ babl_dir_list (void)
   return ret;
 }
 
+
+static const char **simd_init (void);
 void
 babl_init (void)
 {
+  const char **exclusion_pattern;
   babl_cpu_accel_set_use (1);
+  exclusion_pattern = simd_init ();
 
   if (ref_count++ == 0)
     {
@@ -139,6 +113,7 @@ babl_init (void)
       babl_type_db ();
       babl_trc_class_init ();
       babl_space_class_init ();
+      _babl_legal_error ();
       babl_component_db ();
       babl_model_db ();
       babl_format_db ();
@@ -151,10 +126,11 @@ babl_init (void)
       babl_sanity ();
 
       dir_list = babl_dir_list ();
-      babl_extension_load_dir_list (dir_list);
+      babl_extension_load_dir_list (dir_list, exclusion_pattern);
       babl_free (dir_list);
 
-      babl_init_db ();
+      if (!getenv ("BABL_INHIBIT_CACHE"))
+        babl_init_db ();
     }
 }
 
@@ -190,3 +166,123 @@ babl_model_is (const Babl *babl,
   return babl && ((babl)==babl_model_with_space(model, babl));
 }
 
+
+#include "babl-cpuaccel.h"
+void (*babl_base_init)  (void) = babl_base_init_generic; /* defaults to the generic build; simd_init may re-point it */
+
+const Babl * babl_trc_lookup_by_name_generic (const char *name);
+
+
+const Babl *
+babl_trc_new_generic (const char *name,
+                      BablTRCType type,
+                      double      gamma,
+                      int         n_lut,
+                      float      *lut);
+
+void _babl_space_add_universal_rgb_generic (const Babl *space);
+void (*_babl_space_add_universal_rgb) (const Babl *space) =
+  _babl_space_add_universal_rgb_generic;
+/* Like babl_base_init, the pointers below start at the *_generic functions and may be reassigned by simd_init. */
+const Babl *
+(*babl_trc_lookup_by_name) (const char *name) = babl_trc_lookup_by_name_generic;
+const Babl *
+(*babl_trc_new) (const char *name,
+              BablTRCType type,
+              double      gamma,
+              int         n_lut,
+              float      *lut) = babl_trc_new_generic;
+
+#ifdef ARCH_X86_64
+void babl_base_init_x86_64_v2 (void);
+void babl_base_init_x86_64_v3 (void);
+void _babl_space_add_universal_rgb_x86_64_v2 (const Babl *space);
+void _babl_space_add_universal_rgb_x86_64_v3 (const Babl *space);
+
+const Babl *
+babl_trc_lookup_by_name_x86_64_v2 (const char *name);
+const Babl *
+babl_trc_lookup_by_name_x86_64_v3 (const char *name);
+
+const Babl *
+babl_trc_new_x86_64_v2 (const char *name,
+                        BablTRCType type,
+                        double      gamma,
+                        int         n_lut,
+                        float      *lut);
+const Babl *
+babl_trc_new_x86_64_v3 (const char *name,
+                        BablTRCType type,
+                        double      gamma,
+                        int         n_lut,
+                        float      *lut);
+
+#endif
+#ifdef ARCH_ARM
+void babl_base_init_arm_neon (void);
+void _babl_space_add_universal_rgb_arm_neon (const Babl *space);
+
+const Babl *
+babl_trc_lookup_by_name_arm_neon (const char *name);
+
+const Babl *
+babl_trc_new_arm_neon (const char *name,
+                       BablTRCType type,
+                       double      gamma,
+                       int         n_lut,
+                       float      *lut);
+
+#endif
+
+static const char **simd_init (void) /* selects per-CPU implementations; returns the NULL-terminated list babl_init passes to babl_extension_load_dir_list */
+{
+  static const char *exclude[] = {"neon-", "x86-64-v3", "x86-64-v2", NULL}; /* returned only when neither ARCH_X86_64 nor ARCH_ARM is defined */
+#ifdef ARCH_X86_64
+  BablCpuAccelFlags accel = babl_cpu_accel_get_support ();
+  if ((accel & BABL_CPU_ACCEL_X86_64_V3) == BABL_CPU_ACCEL_X86_64_V3)
+  {
+    static const char *exclude[] = {NULL}; /* shadows the outer list: nothing is excluded */
+    babl_base_init = babl_base_init_x86_64_v2; /* NOTE: v2, not v3, is    */
+                                               /* intentional here: the   */
+                                               /* author observed it to   */
+                                               /* perform better in benchmarks */
+    babl_trc_new = babl_trc_new_x86_64_v2; /* the TRC hooks also point at the v2 build */
+    babl_trc_lookup_by_name = babl_trc_lookup_by_name_x86_64_v2;
+    _babl_space_add_universal_rgb = _babl_space_add_universal_rgb_x86_64_v3; /* the only v3 hook in this branch */
+    return exclude;
+  }
+  else if ((accel & BABL_CPU_ACCEL_X86_64_V2) == BABL_CPU_ACCEL_X86_64_V2)
+  {
+    static const char *exclude[] = {"x86-64-v3-", NULL}; /* v2-capable CPU: exclude the v3 pattern only */
+    babl_base_init = babl_base_init_x86_64_v2;
+    babl_trc_new = babl_trc_new_x86_64_v2;
+    babl_trc_lookup_by_name = babl_trc_lookup_by_name_x86_64_v2;
+    _babl_space_add_universal_rgb = _babl_space_add_universal_rgb_x86_64_v2;
+    return exclude;
+  }
+  else
+  {
+    static const char *exclude[] = {"x86-64-v3-", "x86-64-v2-", NULL}; /* generic hooks stay in place; both patterns excluded */
+    return exclude;
+  }
+#endif
+#ifdef ARCH_ARM
+  BablCpuAccelFlags accel = babl_cpu_accel_get_support ();
+  if ((accel & BABL_CPU_ACCEL_ARM_NEON) == BABL_CPU_ACCEL_ARM_NEON)
+  {
+    static const char *exclude[] = {NULL}; /* NEON available: nothing is excluded */
+    babl_base_init = babl_base_init_arm_neon;
+    babl_trc_new = babl_trc_new_arm_neon;
+    babl_trc_lookup_by_name = babl_trc_lookup_by_name_arm_neon;
+    _babl_space_add_universal_rgb = _babl_space_add_universal_rgb_arm_neon;
+    return exclude;
+  }
+  else
+  {
+    static const char *exclude[] = {"arm-neon-", NULL}; /* no NEON: generic hooks stay in place */
+    return exclude;
+  }
+#endif
+  return exclude; /* reached only when no ARCH_* branch above returned */
+}
+
diff --git a/babl/babl.h b/babl/babl.h
index 4032faa..a9b47a3 100644
--- a/babl/babl.h
+++ b/babl/babl.h
@@ -636,6 +636,11 @@ void babl_space_get (const Babl *space,
  * @blue_luminance: (out) (optional): Location for the blue luminance factor.
  *
  * Retrieve the relevant RGB luminance constants for a babl space.
+ *
+ * Note: these luminance coefficients should only ever be used on linear data.
+ * If your input @space is non-linear, you should convert your pixel values to
+ * the linearized variant of @space before making any computation with these
+ * coefficients. See #83.
  */
 void
 babl_space_get_rgb_luminance (const Babl *space,
@@ -713,6 +718,29 @@ const char * babl_format_get_encoding (const Babl *babl);
 int babl_space_is_cmyk (const Babl *space);
 int babl_space_is_gray (const Babl *space);
 
+typedef void (*BablFishProcess) (const Babl *babl, const char *src, char *dst, long n, void *data);
+/**
+ * babl_fish_get_process: (skip)
+ *
+ * get the dispatch function of a fish, this allows faster use of a fish
+ * in a loop than the more indirect method of babl_process, this also avoids
+ * base-level instrumentation.
+ */
+BablFishProcess babl_fish_get_process (const Babl *babl);
+
+
+/**
+ * babl_gc: (skip)
+ *
+ * Do a babl fish garbage collection cycle, should only be called
+ * from the main thread with no concurrent babl processing in other
+ * threads in parallel.
+ *
+ * Since: babl-0.1.98
+ */
+void babl_gc (void);
+
+
 /* values below this are stored associated with this value, it should also be
  * used as a generic alpha zero epsilon in GEGL to keep the threshold effects
  * on one known value.
diff --git a/babl/babl/meson.build b/babl/babl/meson.build
new file mode 100644
index 0000000..461625a
--- /dev/null
+++ b/babl/babl/meson.build
@@ -0,0 +1,9 @@
+# Copy each public header listed in babl_headers, under its own file name,
+# into this directory of the build tree; needed for subproject builds.
+foreach _hdr : babl_headers
+  configure_file(
+    input: _hdr,
+    output: '@PLAINNAME@',
+    copy: true,
+  )
+endforeach
diff --git a/babl/base/babl-base.c b/babl/base/babl-base.c
index 1d93341..8b9cdde 100644
--- a/babl/base/babl-base.c
+++ b/babl/base/babl-base.c
@@ -25,19 +25,19 @@ static void types (void);
 static void models (void);
 
 void
-babl_base_init (void)
+BABL_SIMD_SUFFIX(babl_base_init) (void)
 {
   babl_hmpf_on_name_lookups++;
 
   types ();
   models ();
-  babl_formats_init ();
+  BABL_SIMD_SUFFIX (babl_formats_init) ();
 
   babl_hmpf_on_name_lookups--;
 }
 
 void
-babl_base_destroy (void)
+BABL_SIMD_SUFFIX(babl_base_destroy) (void)
 {
   /* done by the destruction of the elemental babl clases */
 }
@@ -50,12 +50,12 @@ babl_base_destroy (void)
 static void
 types (void)
 {
-  babl_base_type_float ();
-  babl_base_type_u15 ();
-  babl_base_type_half ();
-  babl_base_type_u8 ();
-  babl_base_type_u16 ();
-  babl_base_type_u32 ();
+  BABL_SIMD_SUFFIX (babl_base_type_float) ();
+  BABL_SIMD_SUFFIX (babl_base_type_u15) ();
+  BABL_SIMD_SUFFIX (babl_base_type_half) ();
+  BABL_SIMD_SUFFIX (babl_base_type_u8) ();
+  BABL_SIMD_SUFFIX (babl_base_type_u16) ();
+  BABL_SIMD_SUFFIX (babl_base_type_u32) ();
 }
 
 /*
@@ -67,9 +67,9 @@ static void
 models (void)
 {
   babl_hmpf_on_name_lookups--;
-  babl_base_model_rgb ();
-  babl_base_model_gray ();
-  babl_base_model_cmyk ();
+  BABL_SIMD_SUFFIX (babl_base_model_rgb) ();
+  BABL_SIMD_SUFFIX (babl_base_model_gray) ();
+  BABL_SIMD_SUFFIX (babl_base_model_cmyk) ();
   babl_hmpf_on_name_lookups++;
-  babl_base_model_ycbcr ();
+  BABL_SIMD_SUFFIX (babl_base_model_ycbcr) ();
 }
diff --git a/babl/base/babl-base.h b/babl/base/babl-base.h
index 64f1667..291697b 100644
--- a/babl/base/babl-base.h
+++ b/babl/base/babl-base.h
@@ -19,22 +19,36 @@
 #ifndef _BABL_BASE_H
 #define _BABL_BASE_H
 
+#ifdef ARM_NEON /* BABL_SIMD_SUFFIX(sym) appends the suffix of the SIMD variant being built */
+#define BABL_SIMD_SUFFIX(symbol) symbol##_arm_neon
+#else
+#ifdef X86_64_V2
+#define BABL_SIMD_SUFFIX(symbol) symbol##_x86_64_v2
+#else 
+#ifdef X86_64_V3
+#define BABL_SIMD_SUFFIX(symbol) symbol##_x86_64_v3
+#else
+#define BABL_SIMD_SUFFIX(symbol) symbol##_generic
+#endif /* X86_64_V3 */
+#endif /* X86_64_V2 */
+#endif /* ARM_NEON */
+
+extern void (*babl_base_init)    (void);
 
-void babl_base_init (void);
-void babl_base_destroy (void);
-void babl_formats_init (void);
+void BABL_SIMD_SUFFIX(babl_base_init)    (void);
+void BABL_SIMD_SUFFIX(babl_base_destroy) (void);
+void BABL_SIMD_SUFFIX(babl_formats_init) (void);
 
-void babl_base_type_half   (void);
-void babl_base_type_float  (void);
-void babl_base_type_u8     (void);
-void babl_base_type_u16    (void);
-void babl_base_type_u15    (void);
-void babl_base_type_u32    (void);
+void BABL_SIMD_SUFFIX(babl_base_type_half) (void);
+void BABL_SIMD_SUFFIX(babl_base_type_float)  (void);
+void BABL_SIMD_SUFFIX(babl_base_type_u8)     (void);
+void BABL_SIMD_SUFFIX(babl_base_type_u16)    (void);
+void BABL_SIMD_SUFFIX(babl_base_type_u15)    (void);
+void BABL_SIMD_SUFFIX(babl_base_type_u32)    (void);
 
-void babl_base_model_pal   (void);
-void babl_base_model_rgb   (void);
-void babl_base_model_cmyk  (void);
-void babl_base_model_gray  (void);
-void babl_base_model_ycbcr (void);
+void BABL_SIMD_SUFFIX(babl_base_model_rgb)   (void);
+void BABL_SIMD_SUFFIX(babl_base_model_cmyk)  (void);
+void BABL_SIMD_SUFFIX(babl_base_model_gray)  (void);
+void BABL_SIMD_SUFFIX(babl_base_model_ycbcr) (void);
 
 #endif
diff --git a/babl/base/babl-rgb-converter.c b/babl/base/babl-rgb-converter.c
new file mode 100644
index 0000000..e0ba7c3
--- /dev/null
+++ b/babl/base/babl-rgb-converter.c
@@ -0,0 +1,536 @@
+#include "config.h"
+#include "babl-internal.h"
+#include "base/util.h"
+#include "babl-trc.h"
+#include "babl-base.h"
+
+/* Attach per-conversion data: the combined float matrix followed by three
+ * 256-entry tables mapping u8 levels through the source space's TRCs. */
+static void
+prep_conversion (const Babl *babl)
+{
+  Babl       *conv      = (void*) babl;
+  const Babl *src_space = babl_conversion_get_source_space (conv);
+  double      combined[9];
+  float      *block;
+  float      *lut[3];
+
+  /* destination XYZtoRGB multiplied with source RGBtoXYZ */
+  babl_matrix_mul_matrix (
+     (conv->conversion.destination)->format.space->space.XYZtoRGB,
+     (conv->conversion.source)->format.space->space.RGBtoXYZ,
+     combined);
+
+  /* one allocation: 9 matrix floats + 3 tables; a singleton that is never freed */
+  block = babl_calloc (sizeof (float), 9 + 256 * 3);
+  babl_matrix_to_float (combined, block);
+  conv->conversion.data = block;
+
+  lut[0] = block + 9;
+  lut[1] = lut[0] + 256;
+  lut[2] = lut[1] + 256;
+  for (unsigned int level = 0; level < 256; level++)
+  {
+    for (unsigned int ch = 0; ch < 3; ch++)
+      lut[ch][level] = babl_trc_to_linear (src_space->space.trc[ch], level/255.0);
+  }
+}
+
+#define TRC_IN(rgba_in, rgba_out)  do{ int i; /* needs samples and source_space in the expanding scope */ \
+  for (i = 0; i < samples; i++) /* element 3 of every pixel is copied through unchanged */ \
+  { \
+    rgba_out[i*4+3] = rgba_in[i*4+3]; \
+  } \
+  if ((source_space->space.trc[0] == source_space->space.trc[1]) && \
+      (source_space->space.trc[1] == source_space->space.trc[2])) \
+  { /* all three TRC pointers equal: a single call with 3 components */ \
+    const Babl *trc = (void*)source_space->space.trc[0]; \
+    babl_trc_to_linear_buf(trc, rgba_in, rgba_out, 4, 4, 3, samples); \
+  } \
+  else \
+  { /* differing TRCs: one call per channel, 1 component each */ \
+    unsigned int c; \
+    for (c = 0; c < 3; c ++) \
+    { \
+      const Babl *trc = (void*)source_space->space.trc[c]; \
+      babl_trc_to_linear_buf(trc, rgba_in + c, rgba_out + c, 4, 4, 1, samples); \
+    } \
+  } \
+}while(0)
+
+#define TRC_OUT(rgba_in, rgba_out)  do{ /* needs samples and destination_space in the expanding scope */ \
+  { \
+    if ((destination_space->space.trc[0] == destination_space->space.trc[1]) && \
+        (destination_space->space.trc[1] == destination_space->space.trc[2])) \
+    { /* all three TRC pointers equal: a single call with 3 components */ \
+      const Babl *trc = (void*)destination_space->space.trc[0]; \
+      babl_trc_from_linear_buf(trc, rgba_in, rgba_out, 4, 4, 3, samples); \
+    } \
+    else \
+    { /* differing TRCs: one call per channel, 1 component each */ \
+      unsigned int c; \
+      for (c = 0; c < 3; c ++) \
+      { \
+        const Babl *trc = (void*)destination_space->space.trc[c]; \
+        babl_trc_from_linear_buf(trc, rgba_in + c, rgba_out + c, 4, 4, 1, samples); \
+      } \
+    } \
+  }\
+} while(0)
+
+
+/* Registered below for R'G'B'A float -> R'G'B'A float between two spaces:
+ * source TRCs to linear, the matrix passed in data, then destination TRCs. */
+static inline void
+universal_nonlinear_rgba_converter (const Babl    *conversion,
+                                    unsigned char *__restrict__ src_char,
+                                    unsigned char *__restrict__ dst_char,
+                                    long           samples,
+                                    void          *data)
+{
+  const Babl *source_space      = babl_conversion_get_source_space (conversion);
+  const Babl *destination_space = babl_conversion_get_destination_space (conversion);
+  float      *src_px            = (void*)src_char;
+  float      *dst_px            = (void*)dst_char;
+  float      *space_matrix      = data;
+
+  TRC_IN(src_px, dst_px);
+  /* every stage after TRC_IN works in place on the destination buffer */
+  babl_matrix_mul_vectorff_buf4 (space_matrix, dst_px, dst_px, samples);
+  TRC_OUT(dst_px, dst_px);
+}
+
+/* R'G'B'A float -> RGBA float: source TRCs to linear, then the matrix in data. */
+static inline void
+universal_nonlinear_rgb_linear_converter (const Babl    *conversion,
+                                          unsigned char *__restrict__ src_char,
+                                          unsigned char *__restrict__ dst_char,
+                                          long           samples,
+                                          void          *data)
+{
+  const Babl *source_space = babl_conversion_get_source_space (conversion);
+  float      *src_px       = (void*)src_char;
+  float      *dst_px       = (void*)dst_char;
+  float      *space_matrix = data;
+
+  TRC_IN(src_px, dst_px);
+  babl_matrix_mul_vectorff_buf4 (space_matrix, dst_px, dst_px, samples);
+}
+
+/* RGBA float -> R'G'B'A float: the matrix in data, then destination TRCs in place. */
+static inline void
+universal_linear_rgb_nonlinear_converter (const Babl    *conversion,
+                                          unsigned char *__restrict__ src_char,
+                                          unsigned char *__restrict__ dst_char,
+                                          long           samples,
+                                          void          *data)
+{
+  const Babl *destination_space = conversion->conversion.destination->format.space;
+  float      *src_px            = (void*)src_char;
+  float      *dst_px            = (void*)dst_char;
+  float      *space_matrix      = data;
+
+  babl_matrix_mul_vectorff_buf4 (space_matrix, src_px, dst_px, samples);
+  TRC_OUT(dst_px, dst_px);
+}
+
+/* RGBA float -> RGBA float: applies the matrix in data with babl_matrix_mul_vectorff_buf4. */
+static inline void
+universal_rgba_converter (const Babl    *conversion,
+                          unsigned char *__restrict__ src_char,
+                          unsigned char *__restrict__ dst_char,
+                          long           samples,
+                          void          *data)
+{
+  float *linear_in    = (void*)src_char;
+  float *linear_out   = (void*)dst_char;
+  float *space_matrix = data;
+  babl_matrix_mul_vectorff_buf4 (space_matrix, linear_in, linear_out, samples);
+}
+
+/* RGB float -> RGB float: applies the matrix in data with babl_matrix_mul_vectorff_buf3. */
+static inline void
+universal_rgb_converter (const Babl    *conversion,
+                         unsigned char *__restrict__ src_char,
+                         unsigned char *__restrict__ dst_char,
+                         long           samples,
+                         void          *data)
+{
+  float *linear_in    = (void*)src_char;
+  float *linear_out   = (void*)dst_char;
+  float *space_matrix = data;
+  babl_matrix_mul_vectorff_buf3 (space_matrix, linear_in, linear_out, samples);
+}
+
+/* Plain copy of 4 * 2 bytes per sample; registered below for "YA float" and "YaA float". */
+static inline void
+universal_ya_converter (const Babl    *conversion,
+                        unsigned char *__restrict__ src_char,
+                        unsigned char *__restrict__ dst_char,
+                        long           samples,
+                        void          *data)
+{
+  memcpy (dst_char, src_char, samples * 4 * 2);
+}
+
+/* Plain copy of 4 bytes per sample; registered below for "Y float". */
+static inline void
+universal_y_converter (const Babl    *conversion,
+                       unsigned char *__restrict__ src_char,
+                       unsigned char *__restrict__ dst_char,
+                       long           samples,
+                       void          *data)
+{
+  memcpy (dst_char, src_char, samples * 4);
+}
+
+/* R'G'B' u8 -> R'G'B' u8 between spaces: LUT each byte to linear float, apply
+ * the matrix, encode with the destination TRCs, then round and clamp to u8. */
+static inline void
+universal_nonlinear_rgb_u8_converter (const Babl    *conversion,
+                                      unsigned char *__restrict__ src_char,
+                                      unsigned char *__restrict__ dst_char,
+                                      long           samples,
+                                      void          *data)
+{
+  const Babl *destination_space = conversion->conversion.destination->format.space;
+  float * matrixf = data;
+  float * in_trc_lut_red = matrixf + 9;
+  float * in_trc_lut_green = in_trc_lut_red + 256;
+  float * in_trc_lut_blue = in_trc_lut_green + 256;
+  uint8_t *rgb_in_u8 = (void*)src_char;
+  uint8_t *rgb_out_u8 = (void*)dst_char;
+  if (samples <= 0) return; /* a zero-length VLA would be undefined */
+  float rgba_out[4*samples]; /* NOTE(review): VLA -- confirm callers bound samples */
+
+  for (long i = 0; i < samples; i++)
+  {
+    rgba_out[i*4+0]=in_trc_lut_red[rgb_in_u8[i*3+0]];
+    rgba_out[i*4+1]=in_trc_lut_green[rgb_in_u8[i*3+1]];
+    rgba_out[i*4+2]=in_trc_lut_blue[rgb_in_u8[i*3+2]];
+    rgba_out[i*4+3]=1.0f; /* source has no alpha and this lane is never written out */
+  }
+
+  babl_matrix_mul_vectorff_buf4 (matrixf, rgba_out, rgba_out, samples);
+  TRC_OUT(rgba_out, rgba_out);
+  /* round to nearest (as the SSE2 variant does) and clamp: converting an
+   * out-of-range or NaN float to uint8_t is undefined behaviour */
+  for (long i = 0; i < samples; i++)
+    for (unsigned int c = 0; c < 3; c ++)
+    {
+      float v = rgba_out[i*4+c] * 255.0f + 0.5f;
+      rgb_out_u8[i*3+c] = v >= 255.0f ? 255 : (v > 0.0f ? (uint8_t) v : 0);
+    }
+}
+
+
+#if defined(USE_SSE2)
+
+#define m(matr, j, i)  matr[j*3+i]
+
+#include <emmintrin.h>
+/* Per 4-float pixel: out[0..2] = row-major 3x3 mat times in[0..2]; lane 3 carries in[3] (x1). */
+static inline void babl_matrix_mul_vectorff_buf4_sse2 (const float *mat,
+                                                       const float *v_in,
+                                                       float       *v_out,
+                                                       unsigned int samples)
+{
+  const __v4sf m___0 = {m(mat, 0, 0), m(mat, 1, 0), m(mat, 2, 0), 0}; /* matrix column 0 */
+  const __v4sf m___1 = {m(mat, 0, 1), m(mat, 1, 1), m(mat, 2, 1), 0}; /* matrix column 1 */
+  const __v4sf m___2 = {m(mat, 0, 2), m(mat, 1, 2), m(mat, 2, 2), 1}; /* matrix column 2; the 1 keeps lane 3 */
+  unsigned int i;
+  for (i = 0; i < samples; i ++)
+  {
+    __v4sf a, b, c = _mm_load_ps(&v_in[0]); /* NOTE(review): aligned load, v_in must be 16-byte aligned */
+    a = (__v4sf) _mm_shuffle_epi32((__m128i)c, _MM_SHUFFLE(0,0,0,0)); /* in[0] in every lane */
+    b = (__v4sf) _mm_shuffle_epi32((__m128i)c, _MM_SHUFFLE(1,1,1,1)); /* in[1] in every lane */
+    c = (__v4sf) _mm_shuffle_epi32((__m128i)c, _MM_SHUFFLE(3,2,2,2)); /* {in[2], in[2], in[2], in[3]} */
+    _mm_store_ps (v_out, m___0 * a + m___1 * b + m___2 * c); /* NOTE(review): aligned store, same requirement on v_out */
+    v_out += 4;
+    v_in  += 4;
+  }
+  _mm_empty (); /* EMMS; NOTE(review): no MMX instructions are visible in this function */
+}
+
+#undef m
+
+/* SSE2 counterpart of universal_nonlinear_rgba_converter: source TRCs to
+ * linear, SSE2 matrix multiply in place, then destination TRCs. */
+static inline void
+universal_nonlinear_rgba_converter_sse2 (const Babl    *conversion,
+                                         unsigned char *__restrict__ src_char,
+                                         unsigned char *__restrict__ dst_char,
+                                         long           samples,
+                                         void          *data)
+{
+  const Babl *source_space      = babl_conversion_get_source_space (conversion);
+  const Babl *destination_space = babl_conversion_get_destination_space (conversion);
+  float      *space_matrix      = data;
+  float      *src_px            = (void*)src_char;
+  float      *dst_px            = (void*)dst_char;
+
+  TRC_IN(src_px, dst_px);
+  babl_matrix_mul_vectorff_buf4_sse2 (space_matrix, dst_px, dst_px, samples);
+  TRC_OUT(dst_px, dst_px);
+}
+
+
+/* RGBA float -> RGBA float: applies the matrix in data with the SSE2 multiply. */
+static inline void
+universal_rgba_converter_sse2 (const Babl    *conversion,
+                               unsigned char *__restrict__ src_char,
+                               unsigned char *__restrict__ dst_char,
+                               long           samples,
+                               void          *data)
+{
+  float *space_matrix = data;
+  float *linear_in    = (void*)src_char;
+  float *linear_out   = (void*)dst_char;
+  babl_matrix_mul_vectorff_buf4_sse2 (space_matrix, linear_in, linear_out, samples);
+}
+
+/* SSE2 path for R'G'B' u8 -> R'G'B' u8 between spaces: LUT to linear float,
+ * SSE2 matrix multiply, destination TRCs, then round and clamp to u8. */
+static inline void
+universal_nonlinear_rgb_u8_converter_sse2 (const Babl    *conversion,
+                                           unsigned char *__restrict__ src_char,
+                                           unsigned char *__restrict__ dst_char,
+                                           long           samples,
+                                           void          *data)
+{
+  const Babl *destination_space = conversion->conversion.destination->format.space;
+  float * matrixf = data;
+  float * in_trc_lut_red = matrixf + 9;
+  float * in_trc_lut_green = in_trc_lut_red + 256;
+  float * in_trc_lut_blue = in_trc_lut_green + 256;
+  uint8_t *rgb_in_u8 = (void*)src_char;
+  uint8_t *rgb_out_u8 = (void*)dst_char;
+  if (samples <= 0) return; /* a zero-length VLA would be undefined */
+  // The alignment is necessary for SIMD intrinsics in babl_matrix_mul_vectorff_buf4_sse2()
+  float __attribute__ ((aligned (16))) rgba_out[4*samples];
+
+  for (long i = 0; i < samples; i++)
+  {
+    rgba_out[i*4+0]=in_trc_lut_red[rgb_in_u8[i*3+0]];
+    rgba_out[i*4+1]=in_trc_lut_green[rgb_in_u8[i*3+1]];
+    rgba_out[i*4+2]=in_trc_lut_blue[rgb_in_u8[i*3+2]];
+    rgba_out[i*4+3]=1.0f; /* the multiply loads all four lanes; do not leave this one uninitialised */
+  }
+  babl_matrix_mul_vectorff_buf4_sse2 (matrixf, rgba_out, rgba_out, samples);
+  TRC_OUT(rgba_out, rgba_out);
+  /* clamp: converting an out-of-range or NaN float to uint8_t is undefined behaviour */
+  for (long i = 0; i < samples; i++)
+    for (unsigned c = 0; c < 3; c ++)
+    {
+      float v = rgba_out[i*4+c] * 255 + 0.5f;
+      rgb_out_u8[i*3+c] = v >= 255.0f ? 255 : (v > 0.0f ? (uint8_t) v : 0);
+    }
+}
+
+
+/* R'G'B'A float -> RGBA float: source TRCs to linear, then the SSE2 matrix multiply. */
+static inline void
+universal_nonlinear_rgb_linear_converter_sse2 (const Babl    *conversion,
+                                               unsigned char *__restrict__ src_char,
+                                               unsigned char *__restrict__ dst_char,
+                                               long           samples,
+                                               void          *data)
+{
+  const Babl *source_space = babl_conversion_get_source_space (conversion);
+  float      *space_matrix = data;
+  float      *src_px       = (void*)src_char;
+  float      *dst_px       = (void*)dst_char;
+
+  TRC_IN(src_px, dst_px);
+  babl_matrix_mul_vectorff_buf4_sse2 (space_matrix, dst_px, dst_px, samples);
+}
+
+
+/* RGBA float -> R'G'B'A float: SSE2 matrix multiply, then destination TRCs in place. */
+static inline void
+universal_linear_rgb_nonlinear_converter_sse2 (const Babl    *conversion,
+                                               unsigned char *__restrict__ src_char,
+                                               unsigned char *__restrict__ dst_char,
+                                               long           samples,
+                                               void          *data)
+{
+  const Babl *destination_space = conversion->conversion.destination->format.space;
+  float      *space_matrix      = data;
+  float      *src_px            = (void*)src_char;
+  float      *dst_px            = (void*)dst_char;
+
+  babl_matrix_mul_vectorff_buf4_sse2 (space_matrix, src_px, dst_px, samples);
+  TRC_OUT(dst_px, dst_px);
+}
+#endif
+/* babl_space_class_for_each() callback: unless babl and space are the same
+ * object, registers "linear" conversions between their formats; returns 0. */
+static int
+add_rgb_adapter (Babl *babl,
+                 void *space)
+{
+  if (babl != space)
+  {
+    /* with USE_SSE2: SSE2 converters when the CPU reports both SSE and SSE2, else the plain ones */
+#if defined(USE_SSE2)
+    if ((babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE) &&
+        (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE2))
+    {
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("RGBA float", space),
+                       babl_format_with_space("RGBA float", babl),
+                       "linear", universal_rgba_converter_sse2,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("RGBA float", babl),
+                       babl_format_with_space("RGBA float", space),
+                       "linear", universal_rgba_converter_sse2,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B'A float", space),
+                       babl_format_with_space("R'G'B'A float", babl),
+                       "linear", universal_nonlinear_rgba_converter_sse2,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B'A float", babl),
+                       babl_format_with_space("R'G'B'A float", space),
+                       "linear", universal_nonlinear_rgba_converter_sse2,
+                       NULL));
+
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B'A float", space),
+                       babl_format_with_space("RGBA float", babl),
+                       "linear", universal_nonlinear_rgb_linear_converter_sse2,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B'A float", babl),
+                       babl_format_with_space("RGBA float", space),
+                       "linear", universal_nonlinear_rgb_linear_converter_sse2,
+                       NULL));
+
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("RGBA float", babl),
+                       babl_format_with_space("R'G'B'A float", space),
+                       "linear", universal_linear_rgb_nonlinear_converter_sse2,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("RGBA float", space),
+                       babl_format_with_space("R'G'B'A float", babl),
+                       "linear", universal_linear_rgb_nonlinear_converter_sse2,
+                       NULL));
+
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B' u8", space),
+                       babl_format_with_space("R'G'B' u8", babl),
+                       "linear", universal_nonlinear_rgb_u8_converter_sse2,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B' u8", babl),
+                       babl_format_with_space("R'G'B' u8", space),
+                       "linear", universal_nonlinear_rgb_u8_converter_sse2,
+                       NULL));
+    }
+    else
+#endif
+    {
+#if 1
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("RGBA float", space),
+                       babl_format_with_space("RGBA float", babl),
+                       "linear", universal_rgba_converter,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("RGBA float", babl),
+                       babl_format_with_space("RGBA float", space),
+                       "linear", universal_rgba_converter,
+                       NULL));
+
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B'A float", space),
+                       babl_format_with_space("R'G'B'A float", babl),
+                       "linear", universal_nonlinear_rgba_converter,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B'A float", babl),
+                       babl_format_with_space("R'G'B'A float", space),
+                       "linear", universal_nonlinear_rgba_converter,
+                       NULL));
+#endif
+#if 1
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B'A float", space),
+                       babl_format_with_space("RGBA float", babl),
+                       "linear", universal_nonlinear_rgb_linear_converter,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B'A float", babl),
+                       babl_format_with_space("RGBA float", space),
+                       "linear", universal_nonlinear_rgb_linear_converter,
+                       NULL));
+#endif
+
+#if 1
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B' u8", space),
+                       babl_format_with_space("R'G'B' u8", babl),
+                       "linear", universal_nonlinear_rgb_u8_converter,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("R'G'B' u8", babl),
+                       babl_format_with_space("R'G'B' u8", space),
+                       "linear", universal_nonlinear_rgb_u8_converter,
+                       NULL));
+
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("RGBA float", babl),
+                       babl_format_with_space("R'G'B'A float", space),
+                       "linear", universal_linear_rgb_nonlinear_converter,
+                       NULL));
+       prep_conversion(babl_conversion_new(
+                       babl_format_with_space("RGBA float", space),
+                       babl_format_with_space("R'G'B'A float", babl),
+                       "linear", universal_linear_rgb_nonlinear_converter,
+                       NULL));
+#endif
+    } /* the registrations below are made regardless of SSE2 support */
+    prep_conversion(babl_conversion_new(
+                    babl_format_with_space("RGB float", space),
+                    babl_format_with_space("RGB float", babl),
+                    "linear", universal_rgb_converter,
+                    NULL));
+    prep_conversion(babl_conversion_new(
+                    babl_format_with_space("RGB float", babl),
+                    babl_format_with_space("RGB float", space),
+                    "linear", universal_rgb_converter,
+                    NULL));
+    prep_conversion(babl_conversion_new(
+                    babl_format_with_space("Y float", space),
+                    babl_format_with_space("Y float", babl),
+                    "linear", universal_y_converter,
+                    NULL)); /* NOTE(review): "Y float" is registered space -> babl only */
+    prep_conversion(babl_conversion_new(
+                    babl_format_with_space("YaA float", babl),
+                    babl_format_with_space("YaA float", space),
+                    "linear", universal_ya_converter,
+                    NULL)); /* NOTE(review): "YaA float" and "YA float" are babl -> space only */
+    prep_conversion(babl_conversion_new(
+                    babl_format_with_space("YA float", babl),
+                    babl_format_with_space("YA float", space),
+                    "linear", universal_ya_converter,
+                    NULL));
+  }
+  return 0;
+}
+
+/* Called the first time a new babl space is used to create a fish.  Adds the
+ * conversion hooks that give the space's formats conversions among
+ * themselves as well as to and from other RGB spaces, by passing
+ * add_rgb_adapter to babl_space_class_for_each.
+ */
+void
+BABL_SIMD_SUFFIX(_babl_space_add_universal_rgb) (const Babl *space);
+void
+BABL_SIMD_SUFFIX(_babl_space_add_universal_rgb) (const Babl *space)
+{
+  babl_space_class_for_each (add_rgb_adapter, (void*)space);
+}
diff --git a/babl/babl-trc.c b/babl/base/babl-trc.c
index e76bb92..09beb07 100644
--- a/babl/babl-trc.c
+++ b/babl/base/babl-trc.c
@@ -26,6 +26,7 @@
 
 #include "config.h"
 #include "babl-internal.h"
+#include "babl-base.h"
 #include "base/util.h"
 
 static BablTRC trc_db[MAX_TRCS];
@@ -51,13 +52,13 @@ babl_trc_lut_from_linear (const Babl *trc_,
   if (entry >= trc->lut_size -1)
   {
     entry = trc->lut_size - 1;
-    diff = 0.0;
+    diff = 0.0f;
   }
   else if (entry < 0) entry = 0;
 
-  if (diff > 0.0)
+  if (diff > 0.0f)
   {
-    ret = trc->inv_lut[entry] * (1.0 - diff) + trc->inv_lut[entry+1] * diff;
+    ret = trc->inv_lut[entry] * (1.0f - diff) + trc->inv_lut[entry+1] * diff;
   }
   else
   {
@@ -80,9 +81,9 @@ babl_trc_lut_to_linear (const Babl *trc_,
   if (entry >= trc->lut_size) entry = trc->lut_size - 1;
   else if (entry < 0) entry = 0;
 
-  if (diff > 0.0 && entry < trc->lut_size - 1)
+  if (diff > 0.0f && entry < trc->lut_size - 1)
   {
-    ret = trc->lut[entry] * (1.0 - diff) + trc->lut[entry+1] * diff;
+    ret = trc->lut[entry] * (1.0f - diff) + trc->lut[entry+1] * diff;
   }
   else
   {
@@ -127,32 +128,48 @@ _babl_trc_gamma_from_linear (const Babl *trc_,
 
 static inline void 
 _babl_trc_gamma_to_linear_buf (const Babl  *trc_, 
-                               const float *in, 
-                               float       *out, 
+                               const float *__restrict__ in, 
+                               float       *__restrict__ out, 
                                int          in_gap, 
                                int          out_gap, 
                                int          components, 
                                int          count)
 {
-  int i, c;
-  for (i = 0; i < count; i ++)
-    for (c = 0; c < components; c ++)
-      out[out_gap * i + c] = _babl_trc_gamma_to_linear (trc_, in[in_gap *i + c]);
+  if (in_gap == out_gap && in_gap == 4 && components == 3)
+  {
+    for (int i = 0; i < count; i ++)
+      for (int c = 0; c < 3; c ++)
+        out[4 * i + c] = _babl_trc_gamma_to_linear (trc_, in[4 *i + c]);
+  }
+  else
+  {
+    for (int i = 0; i < count; i ++)
+      for (int c = 0; c < components; c ++)
+        out[out_gap * i + c] = _babl_trc_gamma_to_linear (trc_, in[in_gap *i + c]);
+  }
 }
 
 static inline void 
 _babl_trc_gamma_from_linear_buf (const Babl  *trc_, 
-                                 const float *in, 
-                                 float       *out, 
+                                 const float *__restrict__ in, 
+                                 float       *__restrict__ out, 
                                  int          in_gap, 
                                  int          out_gap, 
                                  int          components, 
                                  int          count)
 {
-  int i, c;
-  for (i = 0; i < count; i ++)
-    for (c = 0; c < components; c ++)
-      out[out_gap * i + c] = _babl_trc_gamma_from_linear (trc_, in[in_gap *i + c]);
+  if (in_gap == out_gap && in_gap == 4 && components == 3)
+  {
+    for (int i = 0; i < count; i ++)
+      for (int c = 0; c < 3; c ++)
+        out[4 * i + c] = _babl_trc_gamma_from_linear (trc_, in[4 *i + c]);
+  }
+  else
+  {
+    for (int i = 0; i < count; i ++)
+      for (int c = 0; c < components; c ++)
+        out[out_gap * i + c] = _babl_trc_gamma_from_linear (trc_, in[in_gap *i + c]);
+  }
 }
 
 static inline float 
@@ -165,17 +182,20 @@ _babl_trc_formula_srgb_from_linear (const Babl *trc_,
   float b = trc->lut[2];
   float c = trc->lut[3];
   float d = trc->lut[4];
-  if (x > c * d)  // XXX: verify that this math is the correct inverse
+  float e = trc->lut[5];
+  float f = trc->lut[6];
+
+  if (x - f > c * d)  // XXX: verify that this math is the correct inverse
   {
-    float v = _babl_trc_gamma_from_linear ((Babl *) trc, x);
+    float v = _babl_trc_gamma_from_linear ((Babl *) trc, x - f);
     v = (v-b)/a;
-    if (v < 0.0 || v >= 0.0)
+    if (v < 0.0f || v >= 0.0f)
       return v;
-    return 0.0;
+    return 0.0f;
   }
-  if (c > 0.0)
-    return x / c;
-  return 0.0;
+  if (c > 0.0f)
+    return (x - e) / c;
+  return 0.0f;
 }
 
 static inline float 
@@ -188,15 +208,55 @@ _babl_trc_formula_srgb_to_linear (const Babl *trc_,
   float b = trc->lut[2];
   float c = trc->lut[3];
   float d = trc->lut[4];
+  float e = trc->lut[5];
+  float f = trc->lut[6];
+
+  if (x >= d)  // OPT can be reduced to be branchless
+  {
+    return _babl_trc_gamma_to_linear ((Babl *) trc, a * x + b) + e;
+  }
+  return c * x + f;
+}
+static inline float 
+_babl_trc_formula_cie_from_linear (const Babl *trc_, 
+                                   float       value)
+{
+  BablTRC *trc = (void*)trc_;
+  float x= value;
+  float a = trc->lut[1];
+  float b = trc->lut[2];
+  float c = trc->lut[3];
 
-  if (x >= d)
+  if (x > c)
   {
-    return _babl_trc_gamma_to_linear ((Babl *) trc, a * x + b);
+    float v = _babl_trc_gamma_from_linear ((Babl *) trc, x - c);
+    v = (v-b)/a;
+    if (v < 0.0f || v >= 0.0f)
+      return v;
   }
-  return c * x;
+  return 0.0f;
 }
 
 static inline float 
+_babl_trc_formula_cie_to_linear (const Babl *trc_, 
+                                 float       value)
+{
+  BablTRC *trc = (void*)trc_;
+  float x= value;
+  float a = trc->lut[1];
+  float b = trc->lut[2];
+  float c = trc->lut[3];
+
+  if (x >= -b / a)
+  {
+    return _babl_trc_gamma_to_linear ((Babl *) trc, a * x + b) + c;
+  }
+  return c;
+}
+
+
+
+static inline float 
 _babl_trc_srgb_to_linear (const Babl *trc_, 
                           float       value)
 {
@@ -219,10 +279,18 @@ _babl_trc_srgb_to_linear_buf (const Babl  *trc_,
                               int          components, 
                               int          count)
 {
-  int i, c;
-  for (i = 0; i < count; i ++)
-    for (c = 0; c < components; c++)
+  if (in_gap == out_gap && in_gap == 4 && components == 3)
+  {
+  for (int i = 0; i < count; i ++)
+    for (int c = 0; c < 3; c++)
+      out[4 * i + c] = babl_gamma_2_2_to_linearf (in[4 * i + c]);
+  }
+  else
+  {
+  for (int i = 0; i < count; i ++)
+    for (int c = 0; c < components; c++)
       out[out_gap * i + c] = babl_gamma_2_2_to_linearf (in[in_gap * i + c]);
+  }
 }
 
 static inline void 
@@ -234,61 +302,97 @@ _babl_trc_srgb_from_linear_buf (const Babl  *trc_,
                                 int          components,
                                 int          count)
 {
-  int i, c;
-  for (i = 0; i < count; i ++)
-    for (c = 0; c < components; c++)
-      out[out_gap * i + c] = babl_linear_to_gamma_2_2f (in[in_gap * i + c]);
+  if (in_gap == out_gap && in_gap == 4 && components == 3)
+  {
+      for (int i = 0; i < count; i ++)
+       for (int c = 0; c < 3; c++)
+         out[4 * i + c] = babl_linear_to_gamma_2_2f (in[4 * i + c]);
+  }
+  else
+  {
+     for (int i = 0; i < count; i ++)
+       for (int c = 0; c < components; c++)
+         out[out_gap * i + c] = babl_linear_to_gamma_2_2f (in[in_gap * i + c]);
+  }
 }
 
 static inline void 
 _babl_trc_to_linear_buf_generic (const Babl  *trc_, 
-                                 const float *in, 
-                                 float       *out, 
+                                 const float *__restrict__ in, 
+                                 float       *__restrict__ out, 
                                  int          in_gap, 
                                  int          out_gap, 
                                  int          components, 
                                  int          count)
 {
-  int i, c;
   BablTRC *trc = (void*)trc_;
-  for (i = 0; i < count; i ++)
-    for (c = 0; c < components; c ++)
-      out[out_gap * i + c] = trc->fun_to_linear (trc_, in[in_gap * i + c]);
+  if (in_gap == out_gap && in_gap == 4 && components == 3)
+  {
+    for (int i = 0; i < count; i ++)
+      for (int c = 0; c < 3; c ++)
+        out[4 * i + c] = trc->fun_to_linear (trc_, in[4 * i + c]);
+  }
+  else
+  {
+    for (int i = 0; i < count; i ++)
+      for (int c = 0; c < components; c ++)
+        out[out_gap * i + c] = trc->fun_to_linear (trc_, in[in_gap * i + c]);
+  }
 }
 
 static inline void 
 _babl_trc_from_linear_buf_generic (const Babl  *trc_,
-                                   const float *in, 
-                                   float       *out,
+                                   const float *__restrict__ in, 
+                                   float       *__restrict__ out,
                                    int          in_gap, 
                                    int          out_gap,
                                    int          components,
                                    int          count)
 {
-  int i, c;
   BablTRC *trc = (void*)trc_;
-  for (i = 0; i < count; i ++)
-    for (c = 0; c < components; c ++)
-      out[out_gap * i + c] = trc->fun_from_linear (trc_, in[in_gap * i + c]);
+  if (in_gap == out_gap && in_gap == 4 && components == 3)
+  {
+    for (int i = 0; i < count; i ++)
+      for (int c = 0; c < 3; c ++)
+        out[4 * i + c] = trc->fun_from_linear (trc_, in[4 * i + c]);
+  }
+  else
+  {
+    for (int i = 0; i < count; i ++)
+      for (int c = 0; c < components; c ++)
+        out[out_gap * i + c] = trc->fun_from_linear (trc_, in[in_gap * i + c]);
+  }
 }
 
+
+
 static inline void _babl_trc_linear_buf (const Babl  *trc_,
-                                         const float *in, 
-                                         float       *out,
+                                         const float *__restrict__ in, 
+                                         float       *__restrict__ out,
                                          int          in_gap, 
                                          int          out_gap,
                                          int          components,
                                          int          count)
 {
-  int i, c;
-  for (i = 0; i < count; i ++)
-    for (c = 0; c < components; c ++)
-      out[i * out_gap + c] = in[i * in_gap + c];
+  if (in_gap == out_gap && in_gap == 4 && components == 3)
+  {
+     for (int i = 0; i < count; i ++)
+       for (int c = 0; c < 3; c ++)
+         out[i * 4 + c] = in[i * 4 + c];
+  }
+  else
+  {
+    for (int i = 0; i < count; i ++)
+      for (int c = 0; c < components; c ++)
+        out[i * out_gap + c] = in[i * in_gap + c];
+  }
 }
 
+const Babl *
+BABL_SIMD_SUFFIX (babl_trc_lookup_by_name) (const char *name);
 
 const Babl *
-babl_trc (const char *name)
+BABL_SIMD_SUFFIX (babl_trc_lookup_by_name) (const char *name)
 {
   int i;
   for (i = 0; trc_db[i].instance.class_type; i++)
@@ -301,7 +405,14 @@ babl_trc (const char *name)
 }
 
 const Babl *
-babl_trc_new (const char *name,
+BABL_SIMD_SUFFIX (babl_trc_new) (const char *name,
+              BablTRCType type,
+              double      gamma,
+              int         n_lut,
+              float      *lut);
+
+const Babl *
+BABL_SIMD_SUFFIX (babl_trc_new) (const char *name,
               BablTRCType type,
               double      gamma,
               int         n_lut,
@@ -368,7 +479,7 @@ babl_trc_new (const char *name,
       for (k = 0; k < 16; k++)
       {
         double guess = (min + max) / 2;
-        float reversed_index = babl_trc_lut_to_linear (BABL(&trc_db[i]), guess) * (n_lut-1.0);
+        float reversed_index = babl_trc_lut_to_linear (BABL(&trc_db[i]), guess) * (n_lut-1.0f);
 
         if (reversed_index < j)
         {
@@ -416,11 +527,38 @@ babl_trc_new (const char *name,
                                          trc_db[i].poly_gamma_from_linear_x1,
                                          POLY_GAMMA_DEGREE, POLY_GAMMA_SCALE);
       break;
+    case BABL_TRC_FORMULA_CIE:
+      trc_db[i].lut = babl_calloc (sizeof (float), 4);
+      {
+        int j;
+        for (j = 0; j < 4; j++)
+          trc_db[i].lut[j] = lut[j];
+      }
+      trc_db[i].fun_to_linear = _babl_trc_formula_cie_to_linear;
+      trc_db[i].fun_from_linear = _babl_trc_formula_cie_from_linear;
+
+      trc_db[i].poly_gamma_to_linear_x0 = lut[4];
+      trc_db[i].poly_gamma_to_linear_x1 = POLY_GAMMA_X1;
+      babl_polynomial_approximate_gamma (&trc_db[i].poly_gamma_to_linear,
+                                         trc_db[i].gamma,
+                                         trc_db[i].poly_gamma_to_linear_x0,
+                                         trc_db[i].poly_gamma_to_linear_x1,
+                                         POLY_GAMMA_DEGREE, POLY_GAMMA_SCALE);
+
+      trc_db[i].poly_gamma_from_linear_x0 = lut[3] * lut[4];
+      trc_db[i].poly_gamma_from_linear_x1 = POLY_GAMMA_X1;
+      babl_polynomial_approximate_gamma (&trc_db[i].poly_gamma_from_linear,
+                                         trc_db[i].rgamma,
+                                         trc_db[i].poly_gamma_from_linear_x0,
+                                         trc_db[i].poly_gamma_from_linear_x1,
+                                         POLY_GAMMA_DEGREE, POLY_GAMMA_SCALE);
+      break;
+
     case BABL_TRC_FORMULA_SRGB:
-      trc_db[i].lut = babl_calloc (sizeof (float), 5);
+      trc_db[i].lut = babl_calloc (sizeof (float), 7);
       {
         int j;
-        for (j = 0; j < 5; j++)
+        for (j = 0; j < 7; j++)
           trc_db[i].lut[j] = lut[j];
       }
       trc_db[i].fun_to_linear = _babl_trc_formula_srgb_to_linear;
@@ -456,17 +594,13 @@ babl_trc_new (const char *name,
   return (Babl*)&trc_db[i];
 }
 
-const Babl * 
-babl_trc_lut (const char *name, 
-              int         n, 
-              float      *entries)
-{
-  return babl_trc_new (name, BABL_TRC_LUT, 0, n, entries);
-}
+void
+BABL_SIMD_SUFFIX(babl_trc_class_for_each) (BablEachFunction each_fun,
+                                           void            *user_data);
 
 void
-babl_trc_class_for_each (BablEachFunction each_fun,
-                         void            *user_data)
+BABL_SIMD_SUFFIX(babl_trc_class_for_each) (BablEachFunction each_fun,
+                                           void            *user_data)
 {
   int i=0;
   for (i = 0; trc_db[i].instance.class_type; i++)
@@ -474,141 +608,3 @@ babl_trc_class_for_each (BablEachFunction each_fun,
       return;
 }
 
-const Babl *
-babl_trc_formula_srgb (double g, 
-                       double a, 
-                       double b, 
-                       double c, 
-                       double d)
-{
-  char name[128];
-  int i;
-  float params[5]={g, a, b, c, d};
-
-  if (fabs (g - 2.400) < 0.01 &&
-      fabs (a - 0.947) < 0.01 &&
-      fabs (b - 0.052) < 0.01 &&
-      fabs (c - 0.077) < 0.01 &&
-      fabs (d - 0.040) < 0.01)
-    return babl_trc ("sRGB");
-
-  snprintf (name, sizeof (name), "%.6f %.6f %.4f %.4f %.4f", g, a, b, c, d);
-  for (i = 0; name[i]; i++)
-    if (name[i] == ',') name[i] = '.';
-  while (name[strlen(name)-1]=='0')
-    name[strlen(name)-1]='\0';
-  return babl_trc_new (name, BABL_TRC_FORMULA_SRGB, g, 0, params);
-}
-
-const Babl *
-babl_trc_gamma (double gamma)
-{
-  char name[32];
-  int i;
-  if (fabs (gamma - 1.0) < 0.01)
-     return babl_trc_new ("linear", BABL_TRC_LINEAR, 1.0, 0, NULL);
-
-  snprintf (name, sizeof (name), "%.6f", gamma);
-  for (i = 0; name[i]; i++)
-    if (name[i] == ',') name[i] = '.';
-  while (name[strlen(name)-1]=='0')
-    name[strlen(name)-1]='\0';
-  return babl_trc_new (name, BABL_TRC_FORMULA_GAMMA, gamma, 0, NULL);
-}
-
-void
-babl_trc_class_init (void)
-{
-  babl_trc_new ("sRGB",  BABL_TRC_SRGB, 2.2, 0, NULL);
-  babl_trc_gamma (2.2);
-  babl_trc_gamma (1.8);
-  babl_trc_gamma (1.0);
-  babl_trc_new ("linear", BABL_TRC_LINEAR, 1.0, 0, NULL);
-}
-
-#if 0
-float 
-babl_trc_from_linear (const Babl *trc_, 
-                      float       value)
-{
-  return babl_trc_from_linear (trc_, value);
-}
-
-float 
-babl_trc_to_linear (const Babl *trc_,
-                    float       value)
-{
-  return babl_trc_to_linear (trc_, value);
-}
-#endif
-
-static int
-babl_lut_match_gamma (float *lut, 
-                      int    lut_size, 
-                      float  gamma)
-{
-  int match = 1;
-  int i;
-  if (lut_size > 1024)
-  {
-    for (i = 0; match && i < lut_size; i++)
-    {
-      if (fabs (lut[i] - pow ((i / (lut_size-1.0)), gamma)) > 0.0001)
-        match = 0;
-    }
-  }
-  else
-  {
-    for (i = 0; match && i < lut_size; i++)
-    {
-      if (fabs (lut[i] - pow ((i / (lut_size-1.0)), gamma)) > 0.001)
-        match = 0;
-    }
-  }
-  return match;
-}
-
-const Babl *
-babl_trc_lut_find (float *lut, 
-                   int    lut_size)
-{
-  int i;
-  int match = 1;
-
-  /* look for linear match */
-  for (i = 0; match && i < lut_size; i++)
-    if (fabs (lut[i] - i / (lut_size-1.0)) > 0.015)
-      match = 0;
-  if (match)
-    return babl_trc_gamma (1.0);
-
-  /* look for sRGB match: */
-  match = 1;
-  if (lut_size > 1024)
-  {
-    for (i = 0; match && i < lut_size; i++)
-    {
-      if (fabs (lut[i] - gamma_2_2_to_linear (i / (lut_size-1.0))) > 0.0001)
-        match = 0;
-    }
-  }
-  else
-  {
-    for (i = 0; match && i < lut_size; i++)
-    {
-      if (fabs (lut[i] - gamma_2_2_to_linear (i / (lut_size-1.0))) > 0.001)
-        match = 0;
-    }
-  }
-  if (match)
-    return babl_trc ("sRGB");
-
-  if (babl_lut_match_gamma (lut, lut_size, 2.2))
-    return babl_trc_gamma(2.2);
-
-  if (babl_lut_match_gamma (lut, lut_size, 1.8))
-    return babl_trc_gamma(1.8);
-
-  return NULL;
-}
-
diff --git a/babl/babl-trc.h b/babl/base/babl-trc.h
index 380a736..1901fd2 100644
--- a/babl/babl-trc.h
+++ b/babl/base/babl-trc.h
@@ -30,7 +30,8 @@ typedef enum {BABL_TRC_LINEAR,
               BABL_TRC_FORMULA_GAMMA,
               BABL_TRC_SRGB,
               BABL_TRC_FORMULA_SRGB,
-              BABL_TRC_LUT}
+              BABL_TRC_LUT,
+              BABL_TRC_FORMULA_CIE}
 BablTRCType;
 
 typedef struct
@@ -66,6 +67,8 @@ typedef struct
   float           *lut;
   float           *inv_lut;
   char             name[128];
+  int valid_u8_lut;
+  float u8_lut[256];
 } BablTRC;
 
 static inline void babl_trc_from_linear_buf (const Babl *trc_,
@@ -101,6 +104,7 @@ static inline float babl_trc_to_linear (const Babl *trc_, float value)
 }
 
 void
-babl_trc_class_init (void);
+babl_trc_class_init_generic (void);
+
 
 #endif
diff --git a/babl/base/formats.c b/babl/base/formats.c
index bad9d14..03488c4 100644
--- a/babl/base/formats.c
+++ b/babl/base/formats.c
@@ -25,7 +25,7 @@
 #include "babl-base.h"
 
 void
-babl_formats_init (void)
+BABL_SIMD_SUFFIX (babl_formats_init) (void)
 {
   const Babl *types[]={
     babl_type_from_id (BABL_DOUBLE),
@@ -35,7 +35,7 @@ babl_formats_init (void)
     babl_type_from_id (BABL_U16),
     babl_type_from_id (BABL_U32)
   };
-  for (int i = 0; i < sizeof (types)/sizeof(types[0]);i++)
+  for (size_t i = 0; i < sizeof (types) / sizeof(types[0]); i++)
   {
     const Babl *type = types[i];
 
diff --git a/babl/base/meson.build b/babl/base/meson.build
index a78fd84..e59609c 100644
--- a/babl/base/meson.build
+++ b/babl/base/meson.build
@@ -14,10 +14,43 @@ babl_base_sources = [
   'type-u16.c',
   'type-u32.c',
   'type-u8.c',
+  'babl-trc.c',
+  'babl-rgb-converter.c',
 ]
 
 babl_base = static_library('babl_base',
   babl_base_sources,
   include_directories: [rootInclude, bablInclude],
   dependencies: [math, lcms],
-)
+   c_args: common_c_flags + [sse2_cflags]
+) 
+
+if host_cpu_family == 'x86_64'
+
+  babl_base_x86_64_v2 = static_library('babl_base-x86-64-v2',
+    babl_base_sources,
+    include_directories: [rootInclude, bablInclude],
+    dependencies: [math, lcms],
+    c_args: common_c_flags + x86_64_v2_flags
+  )
+
+  babl_base_x86_64_v3 = static_library('babl_base-x86-64-v3',
+    babl_base_sources,
+    include_directories: [rootInclude, bablInclude],
+    dependencies: [math, lcms],
+    c_args: common_c_flags + x86_64_v3_flags
+  )
+
+endif
+
+
+if host_cpu_family == 'arm'
+
+  babl_base_arm_neon = static_library('babl_base-arm-neon',
+    babl_base_sources,
+    include_directories: [rootInclude, bablInclude],
+    dependencies: [math, lcms],
+    c_args: common_c_flags + arm_neon_flags
+  )
+
+endif
diff --git a/babl/base/model-cmyk.c b/babl/base/model-cmyk.c
index 13fdedf..1fa02be 100644
--- a/babl/base/model-cmyk.c
+++ b/babl/base/model-cmyk.c
@@ -613,7 +613,7 @@ cmy_to_rgba (const Babl *conversion,
 #endif
 
 void
-babl_base_model_cmyk (void)
+BABL_SIMD_SUFFIX (babl_base_model_cmyk) (void)
 {
   babl_component_new ("cyan", NULL);
   babl_component_new ("yellow", NULL);
diff --git a/babl/base/model-gray.c b/babl/base/model-gray.c
index 3862400..7441baa 100644
--- a/babl/base/model-gray.c
+++ b/babl/base/model-gray.c
@@ -31,7 +31,7 @@ static void formats (void);
 static void init_single_precision (void);
 
 void 
-babl_base_model_gray (void)
+BABL_SIMD_SUFFIX (babl_base_model_gray) (void)
 {
   components ();
   models ();
@@ -90,7 +90,6 @@ models (void)
     "linear",
     NULL);
 
-
   babl_model_new (
     "id", BABL_GRAY_ALPHA,
     babl_component_from_id (BABL_GRAY_LINEAR),
diff --git a/babl/base/model-rgb.c b/babl/base/model-rgb.c
index a3064ef..824665a 100644
--- a/babl/base/model-rgb.c
+++ b/babl/base/model-rgb.c
@@ -32,7 +32,7 @@ static void formats (void);
 static void init_single_precision (void);
 
 void
-babl_base_model_rgb (void)
+BABL_SIMD_SUFFIX (babl_base_model_rgb) (void)
 {
   components ();
   models ();
diff --git a/babl/base/model-ycbcr.c b/babl/base/model-ycbcr.c
index 64db6a2..e061298 100644
--- a/babl/base/model-ycbcr.c
+++ b/babl/base/model-ycbcr.c
@@ -34,7 +34,7 @@ static void conversions (void);
 static void formats (void);
 
 void
-babl_base_model_ycbcr (void)
+BABL_SIMD_SUFFIX (babl_base_model_ycbcr) (void)
 {
   components ();
   models ();
diff --git a/babl/base/pow-24.h b/babl/base/pow-24.h
index ecd1282..98e2374 100644
--- a/babl/base/pow-24.h
+++ b/babl/base/pow-24.h
@@ -98,7 +98,7 @@ static inline float babl_frexpf(float x, int *e)
 
         if (!ee) {
                 if (x) {
-                        x = babl_frexpf(x*18446744073709551616.0, e);
+                        x = babl_frexpf(x*18446744073709551616.0f, e);
                         *e -= 64;
                 } else *e = 0;
                 return x;
@@ -130,11 +130,12 @@ static inline float babl_frexpf(float x, int *e)
 static inline float
 init_newtonf (float x, float exponent, float c0, float c1, float c2)
 {
+#define fM_LN2 0.69314718055994530942f
     int iexp = 0;
     float y = babl_frexpf(x, &iexp);
     y = 2*y+(iexp-2);
-    c1 *= M_LN2*exponent;
-    c2 *= M_LN2*M_LN2*exponent*exponent;
+    c1 *= fM_LN2*exponent;
+    c2 *= fM_LN2*fM_LN2*exponent*exponent;
     return y = c0 + c1*y + c2*y*y;
 }
 
diff --git a/babl/base/type-float.c b/babl/base/type-float.c
index 5b03b3f..9517831 100644
--- a/babl/base/type-float.c
+++ b/babl/base/type-float.c
@@ -83,7 +83,7 @@ convert_float_float (const Babl *babl,
 
 
 void
-babl_base_type_float (void)
+BABL_SIMD_SUFFIX (babl_base_type_float) (void)
 {
   babl_type_new (
     "float",
diff --git a/babl/base/type-half.c b/babl/base/type-half.c
index 862d662..a146185 100644
--- a/babl/base/type-half.c
+++ b/babl/base/type-half.c
@@ -395,7 +395,7 @@ convert_half_float (BablConversion *conversion,
 
 
 void
-babl_base_type_half (void)
+BABL_SIMD_SUFFIX (babl_base_type_half) (void)
 {
   babl_type_new (
     "half",
diff --git a/babl/base/type-u15.c b/babl/base/type-u15.c
index ea35453..7224c63 100644
--- a/babl/base/type-u15.c
+++ b/babl/base/type-u15.c
@@ -198,7 +198,7 @@ convert_u15_float_scaled (BablConversion *conversion,
 MAKE_CONVERSIONS_float (u15, 0.0, 1.0, 0, (1<<15))
 
 void
-babl_base_type_u15 (void)
+BABL_SIMD_SUFFIX (babl_base_type_u15) (void)
 {
   babl_hmpf_on_name_lookups--;
   babl_type_new (
diff --git a/babl/base/type-u16.c b/babl/base/type-u16.c
index c5a41dc..e7ab936 100644
--- a/babl/base/type-u16.c
+++ b/babl/base/type-u16.c
@@ -196,7 +196,7 @@ MAKE_CONVERSIONS_float (u16, 0.0, 1.0, 0, UINT16_MAX)
 
 
 void
-babl_base_type_u16 (void)
+BABL_SIMD_SUFFIX (babl_base_type_u16) (void)
 {
   babl_type_new (
     "u16",
diff --git a/babl/base/type-u32.c b/babl/base/type-u32.c
index 48b1506..7d07ff1 100644
--- a/babl/base/type-u32.c
+++ b/babl/base/type-u32.c
@@ -69,7 +69,7 @@ convert_u32_double_scaled (BablConversion *c,
 {
   while (n--)
     {
-      int    u32val = *(uint32_t *) src;
+      uint32_t    u32val = *(uint32_t *) src;
       double dval;
 
       if (u32val < min)
@@ -154,7 +154,7 @@ convert_u32_float_scaled (BablConversion *c,
 {
   while (n--)
     {
-      int    u32val = *(uint32_t *) src;
+      uint32_t u32val = *(uint32_t *) src;
       float dval;
 
       if (u32val < min)
@@ -196,7 +196,7 @@ MAKE_CONVERSIONS_float(u32, 0.0, 1.0, 0, UINT32_MAX)
 
 
 void
-babl_base_type_u32 (void)
+BABL_SIMD_SUFFIX (babl_base_type_u32) (void)
 {
   babl_type_new (
     "u32",
diff --git a/babl/base/type-u8.c b/babl/base/type-u8.c
index d41d5e0..9abbf67 100644
--- a/babl/base/type-u8.c
+++ b/babl/base/type-u8.c
@@ -202,7 +202,7 @@ MAKE_CONVERSIONS_float (u8_chroma, -0.5, 0.5, 16, 240)
 
 
 void
-babl_base_type_u8 (void)
+BABL_SIMD_SUFFIX (babl_base_type_u8) (void)
 {
   babl_type_new (
     "u8",
diff --git a/babl/base/util.h b/babl/base/util.h
index aba9c61..0d50363 100644
--- a/babl/base/util.h
+++ b/babl/base/util.h
@@ -50,23 +50,17 @@
 static inline double
 babl_epsilon_for_zero (double value)
 {
- if (value <=  BABL_ALPHA_FLOOR &&
-     value >= -BABL_ALPHA_FLOOR)
- {
-   return BABL_ALPHA_FLOOR;
- }
- return value;
+   return value * (value >  BABL_ALPHA_FLOOR || value < -BABL_ALPHA_FLOOR) +
+          BABL_ALPHA_FLOOR * (value <=  BABL_ALPHA_FLOOR &&
+                              value >= -BABL_ALPHA_FLOOR);
 }
 
 static inline float
 babl_epsilon_for_zero_float (float value)
 {
- if (value <= BABL_ALPHA_FLOOR_F &&
-     value >= -BABL_ALPHA_FLOOR_F)
- {
-   return BABL_ALPHA_FLOOR_F;
- }
- return value;
+   return value * (value >  BABL_ALPHA_FLOOR_F || value < -BABL_ALPHA_FLOOR_F) +
+          BABL_ALPHA_FLOOR_F * (value <=  BABL_ALPHA_FLOOR_F &&
+                              value >= -BABL_ALPHA_FLOOR_F);
 }
 
 
diff --git a/babl/git-version.h b/babl/git-version.h
deleted file mode 100644
index 5f243e8..0000000
--- a/babl/git-version.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __GIT_VERSION_H__
-#define __GIT_VERSION_H__
-
-#define BABL_GIT_VERSION "BABL_0_1_80"
-
-#endif /* __GIT_VERSION_H__ */
diff --git a/babl/git-version.h.in b/babl/git-version.h.in
deleted file mode 100644
index a9f87e2..0000000
--- a/babl/git-version.h.in
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __GIT_VERSION_H__
-#define __GIT_VERSION_H__
-
-#define BABL_GIT_VERSION "@BABL_GIT_VERSION@"
-
-#endif /* __GIT_VERSION_H__ */
diff --git a/babl/meson.build b/babl/meson.build
index fed8fe9..6e7c57f 100644
--- a/babl/meson.build
+++ b/babl/meson.build
@@ -13,10 +13,10 @@ babl_c_args = [
 
 # symbol maps
 version_script = custom_target('babl.map',
-  input : meson.source_root() / 'export-symbols',
+  input : export_symbols_file,
   output: ['babl.map', 'babl.map.clang'],
   command: [
-  find_program(meson.source_root() / 'gen_babl_map.py'),
+  find_program(gen_babl_map_file),
   '@INPUT@',
   '@OUTPUT0@',
   ],
@@ -54,43 +54,14 @@ if platform_win32
   babl_link_args += no_undefined
 endif
 
+# sources
 babl_version_h = configure_file(
   input:  'babl-version.h.in',
   output: 'babl-version.h',
   configuration: conf,
 )
 
-# If git is available, always check if git-version.h should be
-# updated. If git is not available, don't do anything if git-version.h
-# already exists because then we are probably working with a tarball
-# in which case the git-version.h we ship is correct.
-if git_bin.found() and run_command(
-    git_bin,
-    'rev-parse',
-    '--is-inside-work-tree',
-).returncode() == 0
-  git_version_h = vcs_tag(
-    input : 'git-version.h.in',
-    output: 'git-version.h',
-    replace_string: '@BABL_GIT_VERSION@',
-    command: [ git_bin.path(), 'describe', '--always' ],
-  )
-
-  if env_bin.found()
-    meson.add_dist_script(
-      [ 'ninja', 'babl/git-version.h', ],
-    )
-    meson.add_dist_script(
-      [ 'sh', '-c', ' '.join(
-      [ 'cp', git_version_h.full_path(), '${MESON_DIST_ROOT}/babl' ]
-      )]
-    )
-  endif
-else
-  git_version_h = files('git-version.h')
-endif
-
-babl_sources = [
+babl_sources = files(
   'babl-cache.c',
   'babl-component.c',
   'babl-conversion.c',
@@ -118,36 +89,56 @@ babl_sources = [
   'babl-sampling.c',
   'babl-sanity.c',
   'babl-space.c',
-  'babl-trc.c',
   'babl-type.c',
   'babl-util.c',
   'babl-version.c',
   'babl.c',
+) + [
   babl_version_h,
   git_version_h,
 ]
 
-babl_headers = [
+babl_headers = files(
   'babl-introspect.h',
   'babl-macros.h',
   'babl-types.h',
   'babl.h',
+) + [
   babl_version_h,
 ]
 
 install_headers(babl_headers,
-  subdir: join_paths(lib_name, 'babl')
+  subdir: lib_name / 'babl'
 )
+# copy external headers to babl subdirectory for subproject builds as
+# we don't want to expose the project root folder due to potential
+# name clashes.
+if meson.is_subproject()
+  subdir('babl')
+endif
 
+babl_deps = [math, thread, dl, lcms]
+babl_includes = [rootInclude, bablBaseInclude]
+
+if host_cpu_family == 'x86_64'
+  simd_extra = [babl_base_x86_64_v2, babl_base_x86_64_v3]
+elif host_cpu_family == 'arm'
+  simd_extra = [babl_base_arm_neon]
+else
+  simd_extra = []
+endif
+
+# build library
 babl = library(
   lib_name,
   babl_sources,
-  include_directories: [rootInclude, bablBaseInclude],
+  include_directories: babl_includes,
   c_args: babl_c_args,
   link_whole: babl_base,
   link_args: babl_link_args,
-  dependencies: [math, thread, dl, lcms],
-  link_depends: version_script,
+  link_with: simd_extra,
+  dependencies: babl_deps,
+  link_depends: version_script[0],
   version: so_version,
   install: true,
 )
@@ -165,13 +156,16 @@ if build_gir
     namespace: 'Babl',
     nsversion: api_version,
     header: 'babl.h',
+    export_packages: 'babl-0.1',
     install: true,
   )
 
   if build_vapi
-    gnome.generate_vapi(lib_name,
+    babl_vapi = gnome.generate_vapi(lib_name,
       sources: babl_gir[0],
       install: true,
     )
   endif
+else
+  babl_gir = []
 endif
diff --git a/bin/babl.c b/bin/babl.c
new file mode 100644
index 0000000..4e1a6dc
--- /dev/null
+++ b/bin/babl.c
@@ -0,0 +1,529 @@
+/* babl - dynamically extendable universal pixel conversion tool.
+ * Copyright (C) 2022 Jehan
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, see
+ * <https://www.gnu.org/licenses/>.
+ */
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <babl/babl.h>
+
+
+static const Babl * babl_cli_get_space   (const char    *path,
+                                          BablIccIntent  intent);
+static void         babl_cli_print_usage (FILE          *stream);
+
+
+int
+main (int    argc,
+      char **argv)
+{
+  const Babl    *from_format;
+  const Babl    *from_space       = NULL;
+  const Babl    *to_format;
+  const Babl    *to_space         = NULL;
+  const Babl    *fish;
+  const char    *from             = NULL;
+  const char    *to               = NULL;
+  const char    *from_profile     = NULL;
+  const char    *to_profile       = NULL;
+  BablIccIntent  intent           = BABL_ICC_INTENT_RELATIVE_COLORIMETRIC;
+  char          *source;
+  char          *dest;
+  int            set_from         = 0;
+  int            set_to           = 0;
+  int            set_from_profile = 0;
+  int            set_to_profile   = 0;
+  int            set_intent       = 0;
+  int            brief_output     = 0;
+  int            options_ended    = 0;
+  int            n_components;
+  int            data_index;
+  int            c;
+  int            i;
+
+  babl_init ();
+
+  if (argc == 1)
+    {
+      babl_cli_print_usage (stderr);
+      return 2;
+    }
+
+  /* Looping through arguments to get source and destination formats. */
+  for (i = 1; i < argc; i++)
+    {
+      if (set_from)
+        {
+          from = argv[i];
+          set_from = 0;
+          if (! babl_format_exists (from))
+            {
+              fprintf (stderr, "babl: unknown format: %s\n", from);
+              return 1;
+            }
+        }
+      else if (set_to)
+        {
+          to = argv[i];
+          set_to = 0;
+          if (! babl_format_exists (to))
+            {
+              fprintf (stderr, "babl: unknown format: %s\n", to);
+              return 1;
+            }
+        }
+      else if (set_from_profile)
+        {
+          set_from_profile = 0;
+          from_profile = argv[i];
+        }
+      else if (set_to_profile)
+        {
+          set_to_profile = 0;
+          to_profile = argv[i];
+        }
+      else if (set_intent)
+        {
+          set_intent = 0;
+
+          if (strcmp (argv[i], "perceptual") == 0)
+            {
+              intent = BABL_ICC_INTENT_PERCEPTUAL;
+            }
+          else if (strcmp (argv[i], "relative") == 0)
+            {
+              intent = BABL_ICC_INTENT_RELATIVE_COLORIMETRIC;
+            }
+          else if (strcmp (argv[i], "saturation") == 0)
+            {
+              intent = BABL_ICC_INTENT_SATURATION;
+            }
+          else if (strcmp (argv[i], "absolute") == 0)
+            {
+              intent = BABL_ICC_INTENT_ABSOLUTE_COLORIMETRIC;
+            }
+          else
+            {
+              fprintf (stderr, "babl: unknown intent: %s\n", argv[i]);
+              fprintf (stderr, "valid intents: perceptual, relative, saturation, absolute.\n");
+              return 2;
+            }
+        }
+      else if (strcmp (argv[i], "--") == 0)
+        {
+          break;
+        }
+      else if (strcmp (argv[i], "--help") == 0 ||
+               strcmp (argv[i], "-h") == 0)
+        {
+          babl_cli_print_usage (stdout);
+
+          return 0;
+        }
+      else if (strcmp (argv[i], "--from") == 0 ||
+               strcmp (argv[i], "-f") == 0)
+        {
+          set_from = 1;
+        }
+      else if (strcmp (argv[i], "--to") == 0 ||
+               strcmp (argv[i], "-t") == 0)
+        {
+          set_to = 1;
+        }
+      else if (strcmp (argv[i], "--input-profile") == 0 ||
+               strcmp (argv[i], "-i") == 0)
+        {
+          set_from_profile = 1;
+        }
+      else if (strcmp (argv[i], "--output-profile") == 0 ||
+               strcmp (argv[i], "-o") == 0)
+        {
+          set_to_profile = 1;
+        }
+      else if (strcmp (argv[i], "--intent") == 0 ||
+               strcmp (argv[i], "-r") == 0)
+        {
+          set_intent = 1;
+        }
+      else if (strcmp (argv[i], "--brief") == 0 ||
+               strcmp (argv[i], "-b") == 0)
+        {
+          brief_output = 1;
+        }
+    }
+
+  if (from_profile != NULL)
+    {
+      from_space = babl_cli_get_space (from_profile, intent);
+
+      if (! from_space)
+        return 6;
+    }
+
+  if (to_profile != NULL)
+    {
+      to_space = babl_cli_get_space (to_profile, intent);
+
+      if (! to_space)
+        return 6;
+    }
+
+  if (from == NULL)
+    {
+      if (babl_space_is_cmyk (from_space))
+        from = "CMYK float";
+      else if (babl_space_is_gray (from_space))
+        from = "Y' float";
+      else
+        from = "R'G'B' float";
+    }
+  if (to == NULL)
+    {
+      if (babl_space_is_cmyk (to_space))
+        to = "CMYK float";
+      else if (babl_space_is_gray (to_space))
+        to = "Y' float";
+      else
+        to = "R'G'B' float";
+    }
+
+  from_format  = babl_format_with_space (from, from_space);
+  n_components = babl_format_get_n_components (from_format);
+  source       = malloc (babl_format_get_bytes_per_pixel (from_format));
+  data_index   = 0;
+
+  to_format    = babl_format_with_space (to, to_space);
+  dest         = malloc (babl_format_get_bytes_per_pixel (to_format));
+
+  /* Re-looping through arguments, to be more flexible with argument orders.
+   * In this second loop, we get the source components' values.
+   */
+  set_from = set_to = set_to_profile = set_from_profile = 0;
+  for (i = 1, c = 0; i < argc; i++)
+    {
+      if (set_from)
+        {
+          set_from = 0;
+          /* Pass. */
+        }
+      else if (set_to)
+        {
+          set_to = 0;
+          /* Pass. */
+        }
+      else if (set_from_profile)
+        {
+          set_from_profile = 0;
+          /* Pass. */
+        }
+      else if (set_to_profile)
+        {
+          set_to_profile = 0;
+          /* Pass. */
+        }
+      else if (set_intent)
+        {
+          set_intent = 0;
+          /* Pass. */
+        }
+      else if (! options_ended && strncmp (argv[i], "-", 1) == 0)
+        {
+          if (strcmp (argv[i], "--") == 0)
+            {
+              options_ended = 1;
+            }
+          else if (strcmp (argv[i], "--help") == 0 ||
+                   strcmp (argv[i], "-h") == 0)
+             {
+               /* Pass. */
+             }
+          else if (strcmp (argv[i], "--from") == 0 ||
+                   strcmp (argv[i], "-f") == 0)
+            {
+              set_from = 1;
+            }
+          else if (strcmp (argv[i], "--to") == 0 ||
+                   strcmp (argv[i], "-t") == 0)
+            {
+              set_to = 1;
+            }
+          else if (strcmp (argv[i], "--input-profile") == 0 ||
+                   strcmp (argv[i], "-i") == 0)
+            {
+              set_from_profile = 1;
+            }
+          else if (strcmp (argv[i], "--output-profile") == 0 ||
+                   strcmp (argv[i], "-o") == 0)
+            {
+              set_to_profile = 1;
+            }
+          else if (strcmp (argv[i], "--intent") == 0 ||
+                   strcmp (argv[i], "-r") == 0)
+            {
+              set_intent = 1;
+            }
+          else if (strcmp (argv[i], "--brief") == 0 ||
+                   strcmp (argv[i], "-b") == 0)
+            {
+              /* Pass. */
+            }
+          else
+            {
+              fprintf (stderr, "babl: unknown option: %s\n", argv[i]);
+              babl_cli_print_usage (stderr);
+              return 2;
+            }
+        }
+      else
+        {
+          const Babl *arg_type;
+          char       *endptr = NULL;
+
+          if (c >= n_components)
+            {
+              fprintf (stderr, "babl: unexpected component: %s\n", argv[i]);
+              babl_cli_print_usage (stderr);
+              return 2;
+            }
+
+          arg_type = babl_format_get_type (from_format, c);
+
+          if (strcmp (babl_get_name (arg_type), "float") == 0)
+            {
+              float  value = strtof (argv[i], &endptr);
+              float *fsrc = (float *) (source + data_index);
+
+              if (value == 0.0f && endptr == argv[i])
+                {
+                  fprintf (stderr, "babl: expected type of component %d is '%s', invalid value: %s\n",
+                           c, babl_get_name (arg_type), argv[i]);
+                  return 3;
+                }
+
+              *fsrc = value;
+              data_index += 4;
+            }
+          else if (strncmp (babl_get_name (arg_type), "u", 1) == 0)
+            {
+              long int value = strtol (argv[i], &endptr, 10);
+
+              if (value == 0 && endptr == argv[i])
+                {
+                  fprintf (stderr, "babl: expected type of component %d is '%s', invalid value: %s\n",
+                           c, babl_get_name (arg_type), argv[i]);
+                  return 3;
+                }
+
+              if (strcmp (babl_get_name (arg_type), "u8") == 0)
+                {
+                  uint8_t *usrc = (uint8_t *) (source + data_index);
+
+                  *usrc = value;
+                  data_index += 1;
+                }
+              else if (strcmp (babl_get_name (arg_type), "u16") == 0)
+                {
+                  uint16_t *usrc = (uint16_t *) (source + data_index);
+
+                  *usrc = value;
+                  data_index += 2;
+                }
+              else if (strcmp (babl_get_name (arg_type), "u32") == 0)
+                {
+                  uint32_t *usrc = (uint32_t *) (source + data_index);
+
+                  *usrc = value;
+                  data_index += 4;
+                }
+              else
+                {
+                  fprintf (stderr, "babl: unsupported unsigned type '%s' of component %d: %s\n",
+                           babl_get_name (arg_type), c, argv[i]);
+                  return 4;
+                }
+            }
+          else
+            {
+              fprintf (stderr, "babl: unsupported type '%s' of component %d: %s\n",
+                       babl_get_name (arg_type), c, argv[i]);
+              return 4;
+            }
+
+          c++;
+        }
+    }
+
+  if (c != n_components)
+    {
+      fprintf (stderr, "babl: %d components expected, %d components were passed\n",
+               n_components, c);
+      babl_cli_print_usage (stderr);
+      return 2;
+    }
+
+  /* Actual processing. */
+  fish = babl_fish (from_format, to_format);
+  babl_process (fish, source, dest, 1);
+
+  /* Now displaying the result. */
+  n_components = babl_format_get_n_components (to_format);
+  data_index   = 0;
+
+  if (! brief_output)
+    printf ("Converting from \"%s\" to \"%s\":\n",
+                    babl_get_name (from_format),
+                    babl_get_name (to_format));
+
+  for (c = 0; c < n_components; c++)
+    {
+      const Babl *arg_type = NULL;
+
+      arg_type = babl_format_get_type (to_format, c);
+
+      if (strcmp (babl_get_name (arg_type), "float") == 0)
+        {
+          float value = *((float *) (dest + data_index));
+
+          data_index += 4;
+
+          if (brief_output)
+            printf ("%s%f", c > 0 ? " ":"", value);
+          else
+            printf ("- %f\n", value);
+        }
+      else if (strcmp (babl_get_name (arg_type), "u8") == 0)
+        {
+          uint8_t value = *((uint8_t *) (dest + data_index));
+
+          data_index += 1;
+
+          if (brief_output)
+            printf ("%s%d", c > 0 ? " ":"", value);
+          else
+            printf ("- %d\n", value);
+        }
+      else if (strcmp (babl_get_name (arg_type), "u16") == 0)
+        {
+          uint16_t value = *((uint16_t *) (dest + data_index));
+
+          data_index += 2;
+
+          if (brief_output)
+            printf ("%s%d", c > 0 ? " ":"", value);
+          else
+            printf ("- %d\n", value);
+        }
+      else if (strcmp (babl_get_name (arg_type), "u32") == 0)
+        {
+          uint32_t value = *((uint32_t *) (dest + data_index));
+
+          data_index += 4;
+
+          if (brief_output)
+            printf ("%s%d", c > 0 ? " ":"", value);
+          else
+            printf ("- %d\n", value);
+        }
+      else
+        {
+          fprintf (stderr, "babl: unsupported type '%s' of returned component %d: %s\n",
+                   babl_get_name (arg_type), c, argv[i]);
+          return 5;
+        }
+    }
+
+  babl_exit ();
+
+  free (source);
+  free (dest);
+
+  return 0;
+}
+
+/* Load the ICC profile stored in the file at @path and ask babl to build
+ * a space from it, using the rendering @intent.
+ *
+ * Returns the space on success.  On failure (the file cannot be opened,
+ * measured or fully read, memory cannot be allocated, or babl rejects the
+ * profile) a message is printed on stderr and NULL is returned.
+ */
+static const Babl *
+babl_cli_get_space (const char    *path,
+                    BablIccIntent  intent)
+{
+  const Babl *space;
+  FILE       *f;
+  char       *icc_data;
+  long        icc_length;
+  const char *error = NULL;
+
+  /* ICC profiles are binary data: open the stream in binary mode so that
+   * platforms distinguishing text from binary streams do not alter bytes.
+   */
+  f = fopen (path, "rb");
+
+  if (f == NULL)
+    {
+      fprintf (stderr, "babl: failed to open '%s': %s\n",
+               path, strerror (errno));
+      return NULL;
+    }
+
+  /* Measure the file by seeking to its end, then rewind for reading. */
+  icc_length = -1;
+  if (fseek (f, 0, SEEK_END) == 0)
+    icc_length = ftell (f);
+
+  if (icc_length < 0 || fseek (f, 0, SEEK_SET) != 0)
+    {
+      fprintf (stderr, "babl: failed to get the size of '%s': %s\n",
+               path, strerror (errno));
+      fclose (f);
+      return NULL;
+    }
+
+  if (icc_length == 0)
+    {
+      fprintf (stderr, "babl: failed to read '%s': file is empty\n", path);
+      fclose (f);
+      return NULL;
+    }
+
+  icc_data = malloc (icc_length);
+
+  if (icc_data == NULL)
+    {
+      fprintf (stderr, "babl: failed to allocate %ld bytes for '%s'\n",
+               icc_length, path);
+      fclose (f);
+      return NULL;
+    }
+
+  /* A short read would hand uninitialized bytes to babl: treat it as an
+   * error instead of going on.
+   */
+  if (fread (icc_data, icc_length, 1, f) != 1)
+    {
+      fprintf (stderr, "babl: failed to read '%s'\n", path);
+      free (icc_data);
+      fclose (f);
+      return NULL;
+    }
+
+  fclose (f);
+
+  space = babl_space_from_icc (icc_data, icc_length, intent, &error);
+
+  /* NOTE(review): icc_data is intentionally not freed from here on, as
+   * before; whether babl_space_from_icc() keeps a reference to the buffer
+   * is not visible from this file -- confirm before adding a free().
+   */
+  if (space == NULL)
+    {
+      fprintf (stderr, "babl: failed to load space from '%s': %s\n",
+               path, error != NULL ? error : "unknown error");
+      return NULL;
+    }
+
+  return space;
+}
+
+/* Write the help text of the babl command-line tool to @stream. */
+static void
+babl_cli_print_usage (FILE *stream)
+{
+  /* The text holds no conversion specification, so it is written out
+   * verbatim.
+   */
+  static const char help_text[] =
+    "Usage: babl [options] [c1 ..]\n"
+    "Convert color data from a specific Babl format and space to another.\n"
+    "\n"
+    "  Options:\n"
+    "     -h, --help            this help information\n"
+    "\n"
+    "     -f, --from            input Babl format\n"
+    "\n"
+    "     -t, --to              output Babl format\n"
+    "\n"
+    "     -i, --input-profile   input profile\n"
+    "\n"
+    "     -o, --output-profile  output profile\n"
+    "\n"
+    "     -r, --intent          rendering intent\n"
+    "                           it only works with an output profile\n"
+    "\n"
+    "     -b, --brief           brief output\n"
+    "                           it can be re-entered as input for chain conversions\n"
+    "\n"
+    "All parameters following -- are considered components values. "
+    "This is useful to input negative components.\n\n"
+    "The tool expects exactly the number of components expected by your input format.\n\n"
+    "The default input and output formats are \"R'G'B' float\" and default space is "
+    "sRGB for RGB formats, or the naive CMYK space for CMYK formats.\n";
+
+  fputs (help_text, stream);
+}
diff --git a/bin/meson.build b/bin/meson.build
new file mode 100644
index 0000000..7aac948
--- /dev/null
+++ b/bin/meson.build
@@ -0,0 +1,10 @@
+# Source files of the babl command-line tool.
+babl_sources = files(
+  'babl.c',
+)
+
+# Build the 'babl' executable from the sources above and mark it for
+# installation. `rootInclude` and `babl` are defined outside this file;
+# presumably the top-level include directory and the babl library target
+# (confirm against the parent meson.build).
+babl_bin = executable('babl',
+  babl_sources,
+  include_directories: [ rootInclude ],
+  link_with: babl,
+  install: true,
+)
diff --git a/debian/changelog b/debian/changelog
index 132972c..84fea7b 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,70 @@
+babl (1:0.1.106-2) unstable; urgency=medium
+
+  * Release to unstable
+
+ -- Jeremy Bícha <jbicha@ubuntu.com>  Mon, 12 Jun 2023 13:06:10 -0400
+
+babl (1:0.1.106-1) experimental; urgency=medium
+
+  * New upstream release
+  * autopkgtest: Update for renamed pkgconfig file: babl-0.1 instead of babl
+  * debian/control.in: Set Rules-Requires-Root: no
+  * Update lintian override info format in d/libbabl-0.1-0.lintian-overrides
+  * Update standards version to 4.6.2, no changes needed
+
+ -- Jeremy Bícha <jbicha@ubuntu.com>  Wed, 24 May 2023 11:29:31 -0400
+
+babl (1:0.1.98-1) unstable; urgency=medium
+
+  * New upstream release
+  * Add debian/upstream/metadata
+  * debian/libbabl-0.1-0.symbols: Add new symbol
+  * Bump Standards-Version to 4.6.1
+
+ -- Jeremy Bicha <jbicha@ubuntu.com>  Fri, 18 Nov 2022 11:49:39 -0500
+
+babl (1:0.1.96-1) unstable; urgency=medium
+
+  * Team upload
+  * New upstream release
+  * Drop patch applied upstream
+  * Don't install newly introduced bin/babl yet
+
+ -- Nathan Pratta Teodosio <nathan.teodosio@canonical.com>  Thu, 25 Aug 2022 10:46:56 -0300
+
+babl (1:0.1.92-1) unstable; urgency=medium
+
+  * New upstream release
+  * Add patch to fix build with latest meson
+
+ -- Jeremy Bicha <jbicha@ubuntu.com>  Thu, 24 Mar 2022 15:21:20 -0400
+
+babl (1:0.1.90-1) unstable; urgency=medium
+
+  * New upstream release
+  * debian/rules: Drop unneeded -Wl,--as-needed
+  * debian/libbabl-0.1-0.symbols: Add new symbols
+
+ -- Jeremy Bicha <jeremy.bicha@canonical.com>  Tue, 22 Feb 2022 16:35:14 -0500
+
+babl (1:0.1.88-1) unstable; urgency=medium
+
+  * New upstream release
+
+ -- Jeremy Bicha <jbicha@debian.org>  Sun, 18 Jul 2021 12:57:04 -0400
+
+babl (1:0.1.86-1) unstable; urgency=medium
+
+  * New upstream release
+
+ -- Jeremy Bicha <jbicha@debian.org>  Fri, 12 Mar 2021 18:21:15 -0500
+
+babl (1:0.1.82-1) unstable; urgency=medium
+
+  * New upstream release
+
+ -- Jeremy Bicha <jbicha@debian.org>  Sun, 04 Oct 2020 21:46:57 -0400
+
 babl (1:0.1.80-1) unstable; urgency=medium
 
   * Team upload
diff --git a/debian/control b/debian/control
index 303076b..5b56afc 100644
--- a/debian/control
+++ b/debian/control
@@ -6,7 +6,7 @@ Source: babl
 Section: libs
 Priority: optional
 Maintainer: Debian GNOME Maintainers <pkg-gnome-maintainers@lists.alioth.debian.org>
-Uploaders: Jeremy Bicha <jbicha@debian.org>
+Uploaders: Jeremy Bicha <jbicha@ubuntu.com>
 Build-Depends: debhelper-compat (= 13),
                dh-sequence-gir,
                dh-sequence-gnome,
@@ -17,10 +17,11 @@ Build-Depends: debhelper-compat (= 13),
                pkg-config,
                valac (>= 0.20.0),
                w3m
-Standards-Version: 4.5.0
+Standards-Version: 4.6.2
+Rules-Requires-Root: no
 Vcs-Browser: https://salsa.debian.org/gnome-team/babl
 Vcs-Git: https://salsa.debian.org/gnome-team/babl.git
-Homepage: http://gegl.org/babl/
+Homepage: https://gegl.org/babl/
 
 Package: libbabl-0.1-0
 Architecture: any
diff --git a/debian/control.in b/debian/control.in
index c544fa7..11e58fb 100644
--- a/debian/control.in
+++ b/debian/control.in
@@ -13,10 +13,11 @@ Build-Depends: debhelper-compat (= 13),
                pkg-config,
                valac (>= 0.20.0),
                w3m
-Standards-Version: 4.5.0
+Standards-Version: 4.6.2
+Rules-Requires-Root: no
 Vcs-Browser: https://salsa.debian.org/gnome-team/babl
 Vcs-Git: https://salsa.debian.org/gnome-team/babl.git
-Homepage: http://gegl.org/babl/
+Homepage: https://gegl.org/babl/
 
 Package: libbabl-0.1-0
 Architecture: any
diff --git a/debian/libbabl-0.1-0.lintian-overrides b/debian/libbabl-0.1-0.lintian-overrides
index 95da48c..4c4f527 100644
--- a/debian/libbabl-0.1-0.lintian-overrides
+++ b/debian/libbabl-0.1-0.lintian-overrides
@@ -1,4 +1,4 @@
 # They're plugins, and many genuinely don't call libc functions
-libbabl-0.1-0: library-not-linked-against-libc usr/lib/x86_64-linux-gnu/babl-0.1/*
+libbabl-0.1-0: library-not-linked-against-libc [usr/lib/x86_64-linux-gnu/babl-0.1/*]
 # They're plugins, and some genuinely only do computation, not library calls
-libbabl-0.1-0: shared-library-lacks-prerequisites usr/lib/x86_64-linux-gnu/babl-0.1/*
+libbabl-0.1-0: shared-library-lacks-prerequisites [usr/lib/x86_64-linux-gnu/babl-0.1/*]
diff --git a/debian/libbabl-0.1-0.symbols b/debian/libbabl-0.1-0.symbols
index 435eb55..55f2d23 100644
--- a/debian/libbabl-0.1-0.symbols
+++ b/debian/libbabl-0.1-0.symbols
@@ -2,6 +2,7 @@ libbabl-0.1.so.0 libbabl-0.1-0 #MINVER#
 * Build-Depends-Package: libbabl-dev
  V0_1_0@V0_1_0 0.1.72
  babl_backtrack@V0_1_0 0.1.72
+ babl_chromatic_adaptation_matrix@V0_1_0 1:0.1.90
  babl_class_name@V0_1_0 0.1.72
  babl_component@V0_1_0 0.1.72
  babl_component_new@V0_1_0 0.1.72
@@ -21,6 +22,7 @@ libbabl-0.1.so.0 libbabl-0.1-0 #MINVER#
  babl_fast_fish@V0_1_0 0.1.72
  babl_fish@V0_1_0 0.1.72
  babl_fish_db@V0_1_0 0.1.72
+ babl_fish_get_process@V0_1_0 1:0.1.90
  babl_fish_path@V0_1_0 0.1.72
  babl_format@V0_1_0 0.1.72
  babl_format_class_for_each@V0_1_0 0.1.72
@@ -39,6 +41,7 @@ libbabl-0.1.so.0 libbabl-0.1-0 #MINVER#
  babl_format_with_space@V0_1_0 0.1.72
  babl_formats_count@V0_1_0 0.1.72
  babl_free@V0_1_0 0.1.72
+ babl_gc@V0_1_0 1:0.1.98
  babl_get_model_flags@V0_1_0 0.1.72
  babl_get_name@V0_1_0 0.1.72
  babl_get_user_data@V0_1_0 0.1.72
diff --git a/debian/patches/tests-increase-timeout.patch b/debian/patches/tests-increase-timeout.patch
index fa29ce1..5ba85d7 100644
--- a/debian/patches/tests-increase-timeout.patch
+++ b/debian/patches/tests-increase-timeout.patch
@@ -12,10 +12,10 @@ https://buildd.debian.org/status/logs.php?pkg=babl&arch=armel
  1 file changed, 1 insertion(+)
 
 diff --git a/tests/meson.build b/tests/meson.build
-index dfe7d45..c03e1a4 100644
+index eee8895..e131c14 100644
 --- a/tests/meson.build
 +++ b/tests/meson.build
-@@ -50,5 +50,6 @@ foreach test_name : test_names
+@@ -51,5 +51,6 @@ foreach test_name : test_names
      test,
      env: test_env,
      workdir: meson.current_build_dir(),
diff --git a/debian/rules b/debian/rules
index df3e876..37f43cc 100755
--- a/debian/rules
+++ b/debian/rules
@@ -1,7 +1,7 @@
 #!/usr/bin/make -f
 
 export DEB_BUILD_MAINT_OPTIONS = hardening=+all
-export DEB_LDFLAGS_MAINT_APPEND = -Wl,-O1 -Wl,-z,defs -Wl,--as-needed
+export DEB_LDFLAGS_MAINT_APPEND = -Wl,-O1 -Wl,-z,defs
 
 # Disable SSE2 except on amd64
 ifneq ($(DEB_HOST_ARCH_CPU),amd64)
@@ -19,3 +19,6 @@ override_dh_auto_configure:
 
 override_dh_makeshlibs:
 	dh_makeshlibs -X/usr/lib/$(DEB_HOST_MULTIARCH)/babl-0.1/ -- -c4
+
+override_dh_missing:
+	dh_missing -X/usr/bin
diff --git a/debian/tests/libbabl-dev b/debian/tests/libbabl-dev
index 13bb908..50a7c3a 100755
--- a/debian/tests/libbabl-dev
+++ b/debian/tests/libbabl-dev
@@ -33,5 +33,5 @@ EOF
 
 # Deliberately word-splitting pkg-config's output:
 # shellcheck disable=SC2046
-"${CROSS_COMPILE}gcc" -otrivial trivial.c $("${CROSS_COMPILE}pkg-config" --cflags --libs babl)
+"${CROSS_COMPILE}gcc" -otrivial trivial.c $("${CROSS_COMPILE}pkg-config" --cflags --libs babl-0.1)
 ./trivial
diff --git a/debian/upstream/metadata b/debian/upstream/metadata
new file mode 100644
index 0000000..34cb9a6
--- /dev/null
+++ b/debian/upstream/metadata
@@ -0,0 +1,6 @@
+Archive: GNOME
+Bug-Database: https://gitlab.gnome.org/GNOME/babl/-/issues
+Bug-Submit: https://gitlab.gnome.org/GNOME/babl/-/issues/new
+Changelog: https://gitlab.gnome.org/GNOME/babl/commits/master
+Repository-Browse: https://gitlab.gnome.org/GNOME/babl
+Repository: https://gitlab.gnome.org/GNOME/babl.git
diff --git a/docs/build_as_meson_subproject.md b/docs/build_as_meson_subproject.md
new file mode 100644
index 0000000..d6e1c65
--- /dev/null
+++ b/docs/build_as_meson_subproject.md
@@ -0,0 +1,23 @@
+
+Including babl as a meson subproject in your project:
+
+Create a babl.wrap file inside a folder named 'subprojects';
+this file tells meson how to download the babl source.
+See https://mesonbuild.com/Wrap-dependency-system-manual.html.
+For example:
+```
+[wrap-git]
+url = https://gitlab.gnome.org/GNOME/babl
+revision = master
+depth = 1
+```
+
+Next, include in your meson.build file something like this:
+```
+babl = dependency('babl', fallback: ['babl', 'libbabl_dep'])
+```
+
+If babl is already installed on your system, meson will use that copy;
+otherwise it will download and build babl.
+
+
diff --git a/docs/meson.build b/docs/meson.build
index eefeb91..0201adc 100644
--- a/docs/meson.build
+++ b/docs/meson.build
@@ -1,12 +1,7 @@
 subdir('graphics')
 
-host    = 'pippin.gimp.org'
-location= 'public_html/babl'
-scptarget = host + ':' + location + '/'
-
-
 xml_insert = find_program(
-  meson.source_root() / 'tools' / 'xml-insert.py',
+  xml_insert_file,
   native: true
 )
 
@@ -46,9 +41,9 @@ endif
 TOC = files('toc')
 html_files = {
   'index': [index_static_html, [
-    ['AUTHORS', files(meson.source_root() / 'AUTHORS')],
-    ['TODO',    files(meson.source_root() / 'TODO')],
-    ['NEWS',    files(meson.source_root() / 'NEWS')],
+    ['AUTHORS', authors_file],
+    ['TODO',    todo_file],
+    ['NEWS',    news_file],
     ['TOC',     TOC],
   ]],
   'Reference': ['auto', [
@@ -100,12 +95,3 @@ foreach _file, _parms : html_files
     index_html = _tgt
   endif
 endforeach
-
-
-run_target('push_web',
-  command: [
-    'scp', index_html, index_static_html, babl_css, scptarget,
-    '&&',
-    'scp', graphic_files_install, scptarget + 'graphics/'
-  ],
-)
diff --git a/export-symbols b/export-symbols
index 82269f5..1e38600 100644
--- a/export-symbols
+++ b/export-symbols
@@ -3,6 +3,7 @@ babl_component_new
 babl_conversion_get_destination_space
 babl_conversion_get_source_space
 babl_conversion_new
+babl_chromatic_adaptation_matrix
 babl_cpu_accel_get_support
 babl_exit
 babl_fast_fish
@@ -71,12 +72,14 @@ babl_db_exist_by_id
 babl_db_each
 babl_formats_count
 babl_format_class_for_each
+babl_gc
 babl_model_class_for_each
 babl_type_class_for_each
 babl_conversion_class_for_each
 babl_set_extender
 babl_extension_quiet_log
 babl_fish_path
+babl_fish_get_process
 babl_extender
 babl_class_name
 babl_sanity
diff --git a/extensions/CIE.c b/extensions/CIE.c
index 1607b27..1d7b0ca 100644
--- a/extensions/CIE.c
+++ b/extensions/CIE.c
@@ -33,9 +33,13 @@
 
 #define DEGREES_PER_RADIAN (180 / 3.14159265358979323846)
 #define RADIANS_PER_DEGREE (1 / DEGREES_PER_RADIAN)
+#define DEGREES_PER_RADIANf (180 / 3.14159265358979323846f)
+#define RADIANS_PER_DEGREEf (1 / DEGREES_PER_RADIANf)
 
-#define LAB_EPSILON       (216.0f / 24389.0f)
-#define LAB_KAPPA         (24389.0f / 27.0f)
+#define LAB_EPSILON       (216.0 / 24389.0)
+#define LAB_EPSILONf      (216.0f / 24389.0f)
+#define LAB_KAPPA         (24389.0 / 27.0)
+#define LAB_KAPPAf        (24389.0f / 27.0f)
 
 /* The constants below hard-code the D50-adapted sRGB ICC profile
  * reference white, aka the ICC profile D50 illuminant.
@@ -52,12 +56,18 @@
  * hard-coded D50 ICC profile illuminant values:
  */
 
-#define D50_WHITE_REF_X   0.964202880f
-#define D50_WHITE_REF_Y   1.000000000f
-#define D50_WHITE_REF_Z   0.824905400f
+#define D50_WHITE_REF_X   0.964202880
+#define D50_WHITE_REF_Y   1.000000000
+#define D50_WHITE_REF_Z   0.824905400
 
-#define NEAR_ZERO         0.0000000001f
+#define D50_WHITE_REF_Xf  0.964202880f
+#define D50_WHITE_REF_Yf  1.000000000f
+#define D50_WHITE_REF_Zf  0.824905400f
+
+#define NEAR_ZERO         0.0000000001 
+#define NEAR_ZEROf        0.0000000001f
 #define near_zero(a)   ((a) < NEAR_ZERO && (a) > -NEAR_ZERO)
+#define near_zerof(a)  ((a) < NEAR_ZEROf && (a) > -NEAR_ZEROf)
 
 #define D50_WHITE_REF_x   0.345702921222f
 #define D50_WHITE_REF_y   0.358537532290f
@@ -70,10 +80,12 @@ static void conversions (void);
 static void formats (void);
 
 int init (void);
+#include "babl-verify-cpu.inc"
 
 int
 init (void)
 {
+  BABL_VERIFY_CPU();
   types ();
   components ();
   models ();
@@ -591,15 +603,15 @@ rgbaf_to_xyYaf (const Babl *conversion,
                 long   samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
-  float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_X;
-  float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_X;
-  float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_X;
-  float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Y;
-  float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Y;
-  float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Y;
-  float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Z;
-  float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Z;
-  float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Z;
+  float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_Xf;
+  float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_Xf;
+  float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_Xf;
+  float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Yf;
+  float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Yf;
+  float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Yf;
+  float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Zf;
+  float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Zf;
+  float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Zf;
   long n = samples;
 
   while (n--)
@@ -610,7 +622,7 @@ rgbaf_to_xyYaf (const Babl *conversion,
       b = src[2];
       a = src[3];
 
-      if (near_zero(r) && near_zero(g) && near_zero(b))
+      if (near_zerof(r) && near_zerof(g) && near_zerof(b))
         {
           Y = 0.0f;
           x = D50_WHITE_REF_x;
@@ -642,15 +654,15 @@ rgbf_to_xyYf (const Babl *conversion,float *src,
               long   samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
-  float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_X;
-  float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_X;
-  float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_X;
-  float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Y;
-  float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Y;
-  float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Y;
-  float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Z;
-  float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Z;
-  float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Z;
+  float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_Xf;
+  float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_Xf;
+  float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_Xf;
+  float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Yf;
+  float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Yf;
+  float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Yf;
+  float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Zf;
+  float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Zf;
+  float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Zf;
   long n = samples;
 
   while (n--)
@@ -660,7 +672,7 @@ rgbf_to_xyYf (const Babl *conversion,float *src,
       g = src[1];
       b = src[2];
 
-      if (near_zero(r) && near_zero(g) && near_zero(b))
+      if (near_zerof(r) && near_zerof(g) && near_zerof(b))
         {
           Y = 0.0f;
           x = D50_WHITE_REF_x;
@@ -693,15 +705,15 @@ rgbaf_to_xyYf (const Babl *conversion,
                long        samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
-  float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_X;
-  float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_X;
-  float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_X;
-  float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Y;
-  float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Y;
-  float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Y;
-  float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Z;
-  float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Z;
-  float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Z;
+  float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_Xf;
+  float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_Xf;
+  float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_Xf;
+  float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Yf;
+  float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Yf;
+  float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Yf;
+  float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Zf;
+  float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Zf;
+  float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Zf;
   long n = samples;
 
   while (n--)
@@ -711,7 +723,7 @@ rgbaf_to_xyYf (const Babl *conversion,
       g = src[1];
       b = src[2];
 
-      if (near_zero(r) && near_zero(g) && near_zero(b))
+      if (near_zerof(r) && near_zerof(g) && near_zerof(b))
         {
           Y = 0.0f;
           x = D50_WHITE_REF_x;
@@ -746,15 +758,15 @@ rgbaf_to_Yuvaf (const Babl *conversion,
                 long   samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
-  float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_X;
-  float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_X;
-  float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_X;
-  float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Y;
-  float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Y;
-  float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Y;
-  float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Z;
-  float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Z;
-  float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Z;
+  float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_Xf;
+  float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_Xf;
+  float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_Xf;
+  float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Yf;
+  float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Yf;
+  float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Yf;
+  float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Zf;
+  float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Zf;
+  float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Zf;
   long n = samples;
 
   while (n--)
@@ -765,11 +777,11 @@ rgbaf_to_Yuvaf (const Babl *conversion,
       b = src[2];
       a = src[3];
 
-      if (near_zero(r) && near_zero(g) && near_zero(b))
+      if (near_zerof(r) && near_zerof(g) && near_zerof(b))
         {
           Y = 0.0f;
-		  u = 4.0/19.0;
-		  v = 9.0/19.0;
+          u = 4.0f/19.0f;
+          v = 9.0f/19.0f;
         }
       else
         {
@@ -777,9 +789,9 @@ rgbaf_to_Yuvaf (const Babl *conversion,
           Y = m_1_0 * r + m_1_1 * g + m_1_2 * b;
           Z = m_2_0 * r + m_2_1 * g + m_2_2 * b;
 
-	      sum = (X + 15.0 * Y + 3.0 * Z);
-	      u = (4.0 * X) / sum;
-	      v = (9.0 * Y) / sum;
+          sum = (X + 15.0f * Y + 3.0f * Z);
+	  u = (4.0f * X) / sum;
+	  v = (9.0f * Y) / sum;
         }
 
       dst[0] = Y;
@@ -798,15 +810,15 @@ rgbf_to_Yuvf (const Babl *conversion,float *src,
               long   samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
-  float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_X;
-  float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_X;
-  float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_X;
-  float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Y;
-  float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Y;
-  float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Y;
-  float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Z;
-  float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Z;
-  float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Z;
+  float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_Xf;
+  float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_Xf;
+  float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_Xf;
+  float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Yf;
+  float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Yf;
+  float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Yf;
+  float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Zf;
+  float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Zf;
+  float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Zf;
   long n = samples;
 
   while (n--)
@@ -816,7 +828,7 @@ rgbf_to_Yuvf (const Babl *conversion,float *src,
       g = src[1];
       b = src[2];
 
-      if (near_zero(r) && near_zero(g) && near_zero(b))
+      if (near_zerof(r) && near_zerof(g) && near_zerof(b))
         {
           Y = 0.0f;
 		  u = 4.0/19.0;
@@ -828,9 +840,9 @@ rgbf_to_Yuvf (const Babl *conversion,float *src,
           Y = m_1_0 * r + m_1_1 * g + m_1_2 * b;
           Z = m_2_0 * r + m_2_1 * g + m_2_2 * b;
 
-	      sum = (X + 15.0 * Y + 3.0 * Z);
-	      u = (4.0 * X) / sum;
-	      v = (9.0 * Y) / sum;
+	      sum = (X + 15.0f * Y + 3.0f * Z);
+	      u = (4.0f * X) / sum;
+	      v = (9.0f * Y) / sum;
         }
 
       dst[0] = Y;
@@ -850,15 +862,15 @@ rgbaf_to_Yuvf (const Babl *conversion,
                long        samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
-  float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_X;
-  float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_X;
-  float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_X;
-  float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Y;
-  float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Y;
-  float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Y;
-  float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Z;
-  float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Z;
-  float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Z;
+  float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_Xf;
+  float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_Xf;
+  float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_Xf;
+  float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Yf;
+  float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Yf;
+  float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Yf;
+  float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Zf;
+  float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Zf;
+  float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Zf;
   long n = samples;
 
   while (n--)
@@ -868,11 +880,11 @@ rgbaf_to_Yuvf (const Babl *conversion,
       g = src[1];
       b = src[2];
 
-      if (near_zero(r) && near_zero(g) && near_zero(b))
+      if (near_zerof(r) && near_zerof(g) && near_zerof(b))
         {
           Y = 0.0f;
-		  u = 4.0/19.0;
-		  v = 9.0/19.0;
+          u = 4.0f/19.0f;
+          v = 9.0f/19.0f;
         }
       else
         {
@@ -880,9 +892,9 @@ rgbaf_to_Yuvf (const Babl *conversion,
           Y = m_1_0 * r + m_1_1 * g + m_1_2 * b;
           Z = m_2_0 * r + m_2_1 * g + m_2_2 * b;
 
-	      sum = (X + 15.0 * Y + 3.0 * Z);
-	      u = (4.0 * X) / sum;
-	      v = (9.0 * Y) / sum;
+	      sum = (X + 15.0f * Y + 3.0f * Z);
+	      u = (4.0f * X) / sum;
+	      v = (9.0f * Y) / sum;
         }
 
       dst[0] = Y;
@@ -1049,15 +1061,15 @@ xyYf_to_rgbf (const Babl *conversion,float *src,
                 long   samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
-  float m_0_0 = space->space.XYZtoRGBf[0] * D50_WHITE_REF_X;
-  float m_0_1 = space->space.XYZtoRGBf[1] * D50_WHITE_REF_Y;
-  float m_0_2 = space->space.XYZtoRGBf[2] * D50_WHITE_REF_Z;
-  float m_1_0 = space->space.XYZtoRGBf[3] * D50_WHITE_REF_X;
-  float m_1_1 = space->space.XYZtoRGBf[4] * D50_WHITE_REF_Y;
-  float m_1_2 = space->space.XYZtoRGBf[5] * D50_WHITE_REF_Z;
-  float m_2_0 = space->space.XYZtoRGBf[6] * D50_WHITE_REF_X;
-  float m_2_1 = space->space.XYZtoRGBf[7] * D50_WHITE_REF_Y;
-  float m_2_2 = space->space.XYZtoRGBf[8] * D50_WHITE_REF_Z;
+  float m_0_0 = space->space.XYZtoRGBf[0] * D50_WHITE_REF_Xf;
+  float m_0_1 = space->space.XYZtoRGBf[1] * D50_WHITE_REF_Yf;
+  float m_0_2 = space->space.XYZtoRGBf[2] * D50_WHITE_REF_Zf;
+  float m_1_0 = space->space.XYZtoRGBf[3] * D50_WHITE_REF_Xf;
+  float m_1_1 = space->space.XYZtoRGBf[4] * D50_WHITE_REF_Yf;
+  float m_1_2 = space->space.XYZtoRGBf[5] * D50_WHITE_REF_Zf;
+  float m_2_0 = space->space.XYZtoRGBf[6] * D50_WHITE_REF_Xf;
+  float m_2_1 = space->space.XYZtoRGBf[7] * D50_WHITE_REF_Yf;
+  float m_2_2 = space->space.XYZtoRGBf[8] * D50_WHITE_REF_Zf;
   long n = samples;
 
   while (n--)
@@ -1067,7 +1079,7 @@ xyYf_to_rgbf (const Babl *conversion,float *src,
       float y = src[1];
       float Y = src[2];
 
-      if (near_zero (y))
+      if (near_zerof (y))
         {
           X = 0.0f;
           Y = 0.0f;
@@ -1102,15 +1114,15 @@ xyYf_to_rgbaf (const Babl *conversion,
                long        samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
-  float m_0_0 = space->space.XYZtoRGBf[0] * D50_WHITE_REF_X;
-  float m_0_1 = space->space.XYZtoRGBf[1] * D50_WHITE_REF_Y;
-  float m_0_2 = space->space.XYZtoRGBf[2] * D50_WHITE_REF_Z;
-  float m_1_0 = space->space.XYZtoRGBf[3] * D50_WHITE_REF_X;
-  float m_1_1 = space->space.XYZtoRGBf[4] * D50_WHITE_REF_Y;
-  float m_1_2 = space->space.XYZtoRGBf[5] * D50_WHITE_REF_Z;
-  float m_2_0 = space->space.XYZtoRGBf[6] * D50_WHITE_REF_X;
-  float m_2_1 = space->space.XYZtoRGBf[7] * D50_WHITE_REF_Y;
-  float m_2_2 = space->space.XYZtoRGBf[8] * D50_WHITE_REF_Z;
+  float m_0_0 = space->space.XYZtoRGBf[0] * D50_WHITE_REF_Xf;
+  float m_0_1 = space->space.XYZtoRGBf[1] * D50_WHITE_REF_Yf;
+  float m_0_2 = space->space.XYZtoRGBf[2] * D50_WHITE_REF_Zf;
+  float m_1_0 = space->space.XYZtoRGBf[3] * D50_WHITE_REF_Xf;
+  float m_1_1 = space->space.XYZtoRGBf[4] * D50_WHITE_REF_Yf;
+  float m_1_2 = space->space.XYZtoRGBf[5] * D50_WHITE_REF_Zf;
+  float m_2_0 = space->space.XYZtoRGBf[6] * D50_WHITE_REF_Xf;
+  float m_2_1 = space->space.XYZtoRGBf[7] * D50_WHITE_REF_Yf;
+  float m_2_2 = space->space.XYZtoRGBf[8] * D50_WHITE_REF_Zf;
   long n = samples;
 
   while (n--)
@@ -1121,7 +1133,7 @@ xyYf_to_rgbaf (const Babl *conversion,
       float Y = src[2];
 
 
-      if (near_zero (Y))
+      if (near_zerof (Y))
         {
           X = 0.0f;
           Y = 0.0f;
@@ -1155,15 +1167,15 @@ xyYaf_to_rgbaf (const Babl *conversion,
                 long        samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
-  float m_0_0 = space->space.XYZtoRGBf[0] * D50_WHITE_REF_X;
-  float m_0_1 = space->space.XYZtoRGBf[1] * D50_WHITE_REF_Y;
-  float m_0_2 = space->space.XYZtoRGBf[2] * D50_WHITE_REF_Z;
-  float m_1_0 = space->space.XYZtoRGBf[3] * D50_WHITE_REF_X;
-  float m_1_1 = space->space.XYZtoRGBf[4] * D50_WHITE_REF_Y;
-  float m_1_2 = space->space.XYZtoRGBf[5] * D50_WHITE_REF_Z;
-  float m_2_0 = space->space.XYZtoRGBf[6] * D50_WHITE_REF_X;
-  float m_2_1 = space->space.XYZtoRGBf[7] * D50_WHITE_REF_Y;
-  float m_2_2 = space->space.XYZtoRGBf[8] * D50_WHITE_REF_Z;
+  float m_0_0 = space->space.XYZtoRGBf[0] * D50_WHITE_REF_Xf;
+  float m_0_1 = space->space.XYZtoRGBf[1] * D50_WHITE_REF_Yf;
+  float m_0_2 = space->space.XYZtoRGBf[2] * D50_WHITE_REF_Zf;
+  float m_1_0 = space->space.XYZtoRGBf[3] * D50_WHITE_REF_Xf;
+  float m_1_1 = space->space.XYZtoRGBf[4] * D50_WHITE_REF_Yf;
+  float m_1_2 = space->space.XYZtoRGBf[5] * D50_WHITE_REF_Zf;
+  float m_2_0 = space->space.XYZtoRGBf[6] * D50_WHITE_REF_Xf;
+  float m_2_1 = space->space.XYZtoRGBf[7] * D50_WHITE_REF_Yf;
+  float m_2_2 = space->space.XYZtoRGBf[8] * D50_WHITE_REF_Zf;
   long n = samples;
 
   while (n--)
@@ -1174,7 +1186,7 @@ xyYaf_to_rgbaf (const Babl *conversion,
       float Y = src[2];
       float a = src[3];
 
-      if (near_zero (Y))
+      if (near_zerof (Y))
         {
           X = 0.0f;
           Y = 0.0f;
@@ -1211,15 +1223,15 @@ Yuvf_to_rgbf (const Babl *conversion,float *src,
               long   samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
-  float m_0_0 = space->space.XYZtoRGBf[0] * D50_WHITE_REF_X;
-  float m_0_1 = space->space.XYZtoRGBf[1] * D50_WHITE_REF_Y;
-  float m_0_2 = space->space.XYZtoRGBf[2] * D50_WHITE_REF_Z;
-  float m_1_0 = space->space.XYZtoRGBf[3] * D50_WHITE_REF_X;
-  float m_1_1 = space->space.XYZtoRGBf[4] * D50_WHITE_REF_Y;
-  float m_1_2 = space->space.XYZtoRGBf[5] * D50_WHITE_REF_Z;
-  float m_2_0 = space->space.XYZtoRGBf[6] * D50_WHITE_REF_X;
-  float m_2_1 = space->space.XYZtoRGBf[7] * D50_WHITE_REF_Y;
-  float m_2_2 = space->space.XYZtoRGBf[8] * D50_WHITE_REF_Z;
+  float m_0_0 = space->space.XYZtoRGBf[0] * D50_WHITE_REF_Xf;
+  float m_0_1 = space->space.XYZtoRGBf[1] * D50_WHITE_REF_Yf;
+  float m_0_2 = space->space.XYZtoRGBf[2] * D50_WHITE_REF_Zf;
+  float m_1_0 = space->space.XYZtoRGBf[3] * D50_WHITE_REF_Xf;
+  float m_1_1 = space->space.XYZtoRGBf[4] * D50_WHITE_REF_Yf;
+  float m_1_2 = space->space.XYZtoRGBf[5] * D50_WHITE_REF_Zf;
+  float m_2_0 = space->space.XYZtoRGBf[6] * D50_WHITE_REF_Xf;
+  float m_2_1 = space->space.XYZtoRGBf[7] * D50_WHITE_REF_Yf;
+  float m_2_2 = space->space.XYZtoRGBf[8] * D50_WHITE_REF_Zf;
   long n = samples;
 
   while (n--)
@@ -1229,7 +1241,7 @@ Yuvf_to_rgbf (const Babl *conversion,float *src,
       float u = src[1];
       float v = src[2];
 
-      if (near_zero (v))
+      if (near_zerof (v))
         {
           X = 0.0f;
           Y = 0.0f;
@@ -1237,9 +1249,9 @@ Yuvf_to_rgbf (const Babl *conversion,float *src,
         }
       else
         {
-          X = ((9.0 * u * Y)/(4.0 * v));
+          X = ((9.0f * u * Y)/(4.0f * v));
           //Y = Y;
-          Z = -(((20.0 * v + 3.0 * u - 12.0) * Y)/(4.0 * v));
+          Z = -(((20.0f * v + 3.0f * u - 12.0f) * Y)/(4.0f * v));
         }
 
       r = m_0_0 * X + m_0_1 * Y + m_0_2 * Z;
@@ -1264,15 +1276,15 @@ Yuvf_to_rgbaf (const Babl *conversion,
                long        samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
-  float m_0_0 = space->space.XYZtoRGBf[0] * D50_WHITE_REF_X;
-  float m_0_1 = space->space.XYZtoRGBf[1] * D50_WHITE_REF_Y;
-  float m_0_2 = space->space.XYZtoRGBf[2] * D50_WHITE_REF_Z;
-  float m_1_0 = space->space.XYZtoRGBf[3] * D50_WHITE_REF_X;
-  float m_1_1 = space->space.XYZtoRGBf[4] * D50_WHITE_REF_Y;
-  float m_1_2 = space->space.XYZtoRGBf[5] * D50_WHITE_REF_Z;
-  float m_2_0 = space->space.XYZtoRGBf[6] * D50_WHITE_REF_X;
-  float m_2_1 = space->space.XYZtoRGBf[7] * D50_WHITE_REF_Y;
-  float m_2_2 = space->space.XYZtoRGBf[8] * D50_WHITE_REF_Z;
+  float m_0_0 = space->space.XYZtoRGBf[0] * D50_WHITE_REF_Xf;
+  float m_0_1 = space->space.XYZtoRGBf[1] * D50_WHITE_REF_Yf;
+  float m_0_2 = space->space.XYZtoRGBf[2] * D50_WHITE_REF_Zf;
+  float m_1_0 = space->space.XYZtoRGBf[3] * D50_WHITE_REF_Xf;
+  float m_1_1 = space->space.XYZtoRGBf[4] * D50_WHITE_REF_Yf;
+  float m_1_2 = space->space.XYZtoRGBf[5] * D50_WHITE_REF_Zf;
+  float m_2_0 = space->space.XYZtoRGBf[6] * D50_WHITE_REF_Xf;
+  float m_2_1 = space->space.XYZtoRGBf[7] * D50_WHITE_REF_Yf;
+  float m_2_2 = space->space.XYZtoRGBf[8] * D50_WHITE_REF_Zf;
   long n = samples;
 
   while (n--)
@@ -1282,7 +1294,7 @@ Yuvf_to_rgbaf (const Babl *conversion,
       float u = src[1];
       float v = src[2];
 
-      if (near_zero (v))
+      if (near_zerof (v))
         {
           X = 0.0f;
           Y = 0.0f;
@@ -1290,9 +1302,9 @@ Yuvf_to_rgbaf (const Babl *conversion,
         }
       else
         {
-          X = ((9.0 * u * Y)/(4.0 * v));
+          X = ((9.0f * u * Y)/(4.0f * v));
           //Y = Y;
-          Z = -(((20.0 * v + 3.0 * u - 12.0) * Y)/(4.0 * v));
+          Z = -(((20.0f * v + 3.0f * u - 12.0f) * Y)/(4.0f * v));
         }
 
       r = m_0_0 * X + m_0_1 * Y + m_0_2 * Z;
@@ -1316,15 +1328,15 @@ Yuvaf_to_rgbaf (const Babl *conversion,
                 long        samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
-  float m_0_0 = space->space.XYZtoRGBf[0] * D50_WHITE_REF_X;
-  float m_0_1 = space->space.XYZtoRGBf[1] * D50_WHITE_REF_Y;
-  float m_0_2 = space->space.XYZtoRGBf[2] * D50_WHITE_REF_Z;
-  float m_1_0 = space->space.XYZtoRGBf[3] * D50_WHITE_REF_X;
-  float m_1_1 = space->space.XYZtoRGBf[4] * D50_WHITE_REF_Y;
-  float m_1_2 = space->space.XYZtoRGBf[5] * D50_WHITE_REF_Z;
-  float m_2_0 = space->space.XYZtoRGBf[6] * D50_WHITE_REF_X;
-  float m_2_1 = space->space.XYZtoRGBf[7] * D50_WHITE_REF_Y;
-  float m_2_2 = space->space.XYZtoRGBf[8] * D50_WHITE_REF_Z;
+  float m_0_0 = space->space.XYZtoRGBf[0] * D50_WHITE_REF_Xf;
+  float m_0_1 = space->space.XYZtoRGBf[1] * D50_WHITE_REF_Yf;
+  float m_0_2 = space->space.XYZtoRGBf[2] * D50_WHITE_REF_Zf;
+  float m_1_0 = space->space.XYZtoRGBf[3] * D50_WHITE_REF_Xf;
+  float m_1_1 = space->space.XYZtoRGBf[4] * D50_WHITE_REF_Yf;
+  float m_1_2 = space->space.XYZtoRGBf[5] * D50_WHITE_REF_Zf;
+  float m_2_0 = space->space.XYZtoRGBf[6] * D50_WHITE_REF_Xf;
+  float m_2_1 = space->space.XYZtoRGBf[7] * D50_WHITE_REF_Yf;
+  float m_2_2 = space->space.XYZtoRGBf[8] * D50_WHITE_REF_Zf;
   long n = samples;
 
   while (n--)
@@ -1335,7 +1347,7 @@ Yuvaf_to_rgbaf (const Babl *conversion,
       float v = src[2];
       float a = src[3];
 
-      if (near_zero (v))
+      if (near_zerof (v))
         {
           X = 0.0f;
           Y = 0.0f;
@@ -1343,9 +1355,9 @@ Yuvaf_to_rgbaf (const Babl *conversion,
         }
       else
         {
-          X = ((9.0 * u * Y)/(4.0 * v));
+          X = ((9.0f * u * Y)/(4.0f * v));
           //Y = Y;
-          Z = -(((20.0 * v + 3.0 * u - 12.0) * Y)/(4.0 * v));
+          Z = -(((20.0f * v + 3.0f * u - 12.0f) * Y)/(4.0f * v));
         }
 
       r = m_0_0 * X + m_0_1 * Y + m_0_2 * Z;
@@ -1687,7 +1699,7 @@ Yf_to_Lf (const Babl *conversion,
   while (n--)
     {
       float yr = src[0];
-      float L  = yr > LAB_EPSILON ? 116.0f * _cbrtf (yr) - 16 : LAB_KAPPA * yr;
+      float L  = yr > LAB_EPSILONf ? 116.0f * _cbrtf (yr) - 16 : LAB_KAPPAf * yr;
 
       dst[0] = L;
 
@@ -1707,7 +1719,7 @@ Yaf_to_Lf (const Babl *conversion,
   while (n--)
     {
       float yr = src[0];
-      float L  = yr > LAB_EPSILON ? 116.0f * _cbrtf (yr) - 16 : LAB_KAPPA * yr;
+      float L  = yr > LAB_EPSILONf ? 116.0f * _cbrtf (yr) - 16 : LAB_KAPPAf * yr;
 
       dst[0] = L;
 
@@ -1728,7 +1740,7 @@ Yaf_to_Laf (const Babl *conversion,
     {
       float yr = src[0];
       float a  = src[1];
-      float L  = yr > LAB_EPSILON ? 116.0f * _cbrtf (yr) - 16 : LAB_KAPPA * yr;
+      float L  = yr > LAB_EPSILONf ? 116.0f * _cbrtf (yr) - 16 : LAB_KAPPAf * yr;
 
       dst[0] = L;
       dst[1] = a;
@@ -1745,15 +1757,15 @@ rgbf_to_Labf (const Babl *conversion,
               long        samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
-  float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_X;
-  float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_X;
-  float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_X;
-  float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Y;
-  float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Y;
-  float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Y;
-  float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Z;
-  float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Z;
-  float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Z;
+  float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_Xf;
+  float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_Xf;
+  float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_Xf;
+  float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Yf;
+  float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Yf;
+  float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Yf;
+  float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Zf;
+  float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Zf;
+  float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Zf;
   long n = samples;
 
   while (n--)
@@ -1766,9 +1778,9 @@ rgbf_to_Labf (const Babl *conversion,
       float yr = m_1_0 * r + m_1_1 * g + m_1_2 * b;
       float zr = m_2_0 * r + m_2_1 * g + m_2_2 * b;
 
-      float fx = xr > LAB_EPSILON ? _cbrtf (xr) : (LAB_KAPPA * xr + 16.0f) / 116.0f;
-      float fy = yr > LAB_EPSILON ? _cbrtf (yr) : (LAB_KAPPA * yr + 16.0f) / 116.0f;
-      float fz = zr > LAB_EPSILON ? _cbrtf (zr) : (LAB_KAPPA * zr + 16.0f) / 116.0f;
+      float fx = xr > LAB_EPSILONf ? _cbrtf (xr) : (LAB_KAPPAf * xr + 16.0f) / 116.0f;
+      float fy = yr > LAB_EPSILONf ? _cbrtf (yr) : (LAB_KAPPAf * yr + 16.0f) / 116.0f;
+      float fz = zr > LAB_EPSILONf ? _cbrtf (zr) : (LAB_KAPPAf * zr + 16.0f) / 116.0f;
 
       float L = 116.0f * fy - 16.0f;
       float A = 500.0f * (fx - fy);
@@ -1790,9 +1802,9 @@ rgbaf_to_Lf (const Babl *conversion,
              long        samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
-  float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Y;
-  float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Y;
-  float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Y;
+  float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Yf;
+  float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Yf;
+  float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Yf;
   long n = samples;
 
   while (n--)
@@ -1802,7 +1814,7 @@ rgbaf_to_Lf (const Babl *conversion,
       float b = src[2];
 
       float yr = m_1_0 * r + m_1_1 * g + m_1_2 * b;
-      float L = yr > LAB_EPSILON ? 116.0f * _cbrtf (yr) - 16 : LAB_KAPPA * yr;
+      float L = yr > LAB_EPSILONf ? 116.0f * _cbrtf (yr) - 16 : LAB_KAPPAf * yr;
 
       dst[0] = L;
 
@@ -1818,15 +1830,15 @@ rgbaf_to_Labf (const Babl *conversion,
                long        samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
-  float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_X;
-  float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_X;
-  float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_X;
-  float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Y;
-  float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Y;
-  float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Y;
-  float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Z;
-  float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Z;
-  float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Z;
+  float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_Xf;
+  float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_Xf;
+  float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_Xf;
+  float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Yf;
+  float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Yf;
+  float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Yf;
+  float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Zf;
+  float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Zf;
+  float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Zf;
   long n = samples;
 
   while (n--)
@@ -1839,9 +1851,9 @@ rgbaf_to_Labf (const Babl *conversion,
       float yr = m_1_0 * r + m_1_1 * g + m_1_2 * b;
       float zr = m_2_0 * r + m_2_1 * g + m_2_2 * b;
 
-      float fx = xr > LAB_EPSILON ? _cbrtf (xr) : (LAB_KAPPA * xr + 16.0f) / 116.0f;
-      float fy = yr > LAB_EPSILON ? _cbrtf (yr) : (LAB_KAPPA * yr + 16.0f) / 116.0f;
-      float fz = zr > LAB_EPSILON ? _cbrtf (zr) : (LAB_KAPPA * zr + 16.0f) / 116.0f;
+      float fx = xr > LAB_EPSILONf ? _cbrtf (xr) : (LAB_KAPPAf * xr + 16.0f) / 116.0f;
+      float fy = yr > LAB_EPSILONf ? _cbrtf (yr) : (LAB_KAPPAf * yr + 16.0f) / 116.0f;
+      float fz = zr > LAB_EPSILONf ? _cbrtf (zr) : (LAB_KAPPAf * zr + 16.0f) / 116.0f;
 
       float L = 116.0f * fy - 16.0f;
       float A = 500.0f * (fx - fy);
@@ -1863,15 +1875,15 @@ rgbaf_to_Labaf (const Babl *conversion,
                 long        samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
-  float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_X;
-  float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_X;
-  float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_X;
-  float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Y;
-  float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Y;
-  float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Y;
-  float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Z;
-  float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Z;
-  float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Z;
+  float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_Xf;
+  float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_Xf;
+  float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_Xf;
+  float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Yf;
+  float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Yf;
+  float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Yf;
+  float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Zf;
+  float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Zf;
+  float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Zf;
   long n = samples;
 
   while (n--)
@@ -1885,9 +1897,9 @@ rgbaf_to_Labaf (const Babl *conversion,
       float yr = m_1_0 * r + m_1_1 * g + m_1_2 * b;
       float zr = m_2_0 * r + m_2_1 * g + m_2_2 * b;
 
-      float fx = xr > LAB_EPSILON ? _cbrtf (xr) : (LAB_KAPPA * xr + 16.0f) / 116.0f;
-      float fy = yr > LAB_EPSILON ? _cbrtf (yr) : (LAB_KAPPA * yr + 16.0f) / 116.0f;
-      float fz = zr > LAB_EPSILON ? _cbrtf (zr) : (LAB_KAPPA * zr + 16.0f) / 116.0f;
+      float fx = xr > LAB_EPSILONf ? _cbrtf (xr) : (LAB_KAPPAf * xr + 16.0f) / 116.0f;
+      float fy = yr > LAB_EPSILONf ? _cbrtf (yr) : (LAB_KAPPAf * yr + 16.0f) / 116.0f;
+      float fz = zr > LAB_EPSILONf ? _cbrtf (zr) : (LAB_KAPPAf * zr + 16.0f) / 116.0f;
 
       float L = 116.0f * fy - 16.0f;
       float A = 500.0f * (fx - fy);
@@ -1944,15 +1956,15 @@ Labf_to_rgbf (const Babl *conversion,
               long        samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
-  float m_0_0 = space->space.XYZtoRGBf[0] * D50_WHITE_REF_X;
-  float m_0_1 = space->space.XYZtoRGBf[1] * D50_WHITE_REF_Y;
-  float m_0_2 = space->space.XYZtoRGBf[2] * D50_WHITE_REF_Z;
-  float m_1_0 = space->space.XYZtoRGBf[3] * D50_WHITE_REF_X;
-  float m_1_1 = space->space.XYZtoRGBf[4] * D50_WHITE_REF_Y;
-  float m_1_2 = space->space.XYZtoRGBf[5] * D50_WHITE_REF_Z;
-  float m_2_0 = space->space.XYZtoRGBf[6] * D50_WHITE_REF_X;
-  float m_2_1 = space->space.XYZtoRGBf[7] * D50_WHITE_REF_Y;
-  float m_2_2 = space->space.XYZtoRGBf[8] * D50_WHITE_REF_Z;
+  float m_0_0 = space->space.XYZtoRGBf[0] * D50_WHITE_REF_Xf;
+  float m_0_1 = space->space.XYZtoRGBf[1] * D50_WHITE_REF_Yf;
+  float m_0_2 = space->space.XYZtoRGBf[2] * D50_WHITE_REF_Zf;
+  float m_1_0 = space->space.XYZtoRGBf[3] * D50_WHITE_REF_Xf;
+  float m_1_1 = space->space.XYZtoRGBf[4] * D50_WHITE_REF_Yf;
+  float m_1_2 = space->space.XYZtoRGBf[5] * D50_WHITE_REF_Zf;
+  float m_2_0 = space->space.XYZtoRGBf[6] * D50_WHITE_REF_Xf;
+  float m_2_1 = space->space.XYZtoRGBf[7] * D50_WHITE_REF_Yf;
+  float m_2_2 = space->space.XYZtoRGBf[8] * D50_WHITE_REF_Zf;
   long n = samples;
 
   while (n--)
@@ -1965,9 +1977,9 @@ Labf_to_rgbf (const Babl *conversion,
       float fx = fy + A / 500.0f;
       float fz = fy - B / 200.0f;
 
-      float yr = L > LAB_KAPPA * LAB_EPSILON ? cubef (fy) : L / LAB_KAPPA;
-      float xr = cubef (fx) > LAB_EPSILON ? cubef (fx) : (fx * 116.0f - 16.0f) / LAB_KAPPA;
-      float zr = cubef (fz) > LAB_EPSILON ? cubef (fz) : (fz * 116.0f - 16.0f) / LAB_KAPPA;
+      float yr = L > LAB_KAPPAf * LAB_EPSILONf ? cubef (fy) : L / LAB_KAPPAf;
+      float xr = cubef (fx) > LAB_EPSILONf ? cubef (fx) : (fx * 116.0f - 16.0f) / LAB_KAPPAf;
+      float zr = cubef (fz) > LAB_EPSILONf ? cubef (fz) : (fz * 116.0f - 16.0f) / LAB_KAPPAf;
 
       float r = m_0_0 * xr + m_0_1 * yr + m_0_2 * zr;
       float g = m_1_0 * xr + m_1_1 * yr + m_1_2 * zr;
@@ -1989,15 +2001,15 @@ Labf_to_rgbaf (const Babl *conversion,float *src,
                long   samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
-  float m_0_0 = space->space.XYZtoRGBf[0] * D50_WHITE_REF_X;
-  float m_0_1 = space->space.XYZtoRGBf[1] * D50_WHITE_REF_Y;
-  float m_0_2 = space->space.XYZtoRGBf[2] * D50_WHITE_REF_Z;
-  float m_1_0 = space->space.XYZtoRGBf[3] * D50_WHITE_REF_X;
-  float m_1_1 = space->space.XYZtoRGBf[4] * D50_WHITE_REF_Y;
-  float m_1_2 = space->space.XYZtoRGBf[5] * D50_WHITE_REF_Z;
-  float m_2_0 = space->space.XYZtoRGBf[6] * D50_WHITE_REF_X;
-  float m_2_1 = space->space.XYZtoRGBf[7] * D50_WHITE_REF_Y;
-  float m_2_2 = space->space.XYZtoRGBf[8] * D50_WHITE_REF_Z;
+  float m_0_0 = space->space.XYZtoRGBf[0] * D50_WHITE_REF_Xf;
+  float m_0_1 = space->space.XYZtoRGBf[1] * D50_WHITE_REF_Yf;
+  float m_0_2 = space->space.XYZtoRGBf[2] * D50_WHITE_REF_Zf;
+  float m_1_0 = space->space.XYZtoRGBf[3] * D50_WHITE_REF_Xf;
+  float m_1_1 = space->space.XYZtoRGBf[4] * D50_WHITE_REF_Yf;
+  float m_1_2 = space->space.XYZtoRGBf[5] * D50_WHITE_REF_Zf;
+  float m_2_0 = space->space.XYZtoRGBf[6] * D50_WHITE_REF_Xf;
+  float m_2_1 = space->space.XYZtoRGBf[7] * D50_WHITE_REF_Yf;
+  float m_2_2 = space->space.XYZtoRGBf[8] * D50_WHITE_REF_Zf;
   long n = samples;
 
   while (n--)
@@ -2010,9 +2022,9 @@ Labf_to_rgbaf (const Babl *conversion,float *src,
       float fx = fy + A / 500.0f;
       float fz = fy - B / 200.0f;
 
-      float yr = L > LAB_KAPPA * LAB_EPSILON ? cubef (fy) : L / LAB_KAPPA;
-      float xr = cubef (fx) > LAB_EPSILON ? cubef (fx) : (fx * 116.0f - 16.0f) / LAB_KAPPA;
-      float zr = cubef (fz) > LAB_EPSILON ? cubef (fz) : (fz * 116.0f - 16.0f) / LAB_KAPPA;
+      float yr = L > LAB_KAPPAf * LAB_EPSILONf ? cubef (fy) : L / LAB_KAPPAf;
+      float xr = cubef (fx) > LAB_EPSILONf ? cubef (fx) : (fx * 116.0f - 16.0f) / LAB_KAPPAf;
+      float zr = cubef (fz) > LAB_EPSILONf ? cubef (fz) : (fz * 116.0f - 16.0f) / LAB_KAPPAf;
 
       float r = m_0_0 * xr + m_0_1 * yr + m_0_2 * zr;
       float g = m_1_0 * xr + m_1_1 * yr + m_1_2 * zr;
@@ -2035,15 +2047,15 @@ Labaf_to_rgbaf (const Babl *conversion,
                 long        samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
-  float m_0_0 = space->space.XYZtoRGBf[0] * D50_WHITE_REF_X;
-  float m_0_1 = space->space.XYZtoRGBf[1] * D50_WHITE_REF_Y;
-  float m_0_2 = space->space.XYZtoRGBf[2] * D50_WHITE_REF_Z;
-  float m_1_0 = space->space.XYZtoRGBf[3] * D50_WHITE_REF_X;
-  float m_1_1 = space->space.XYZtoRGBf[4] * D50_WHITE_REF_Y;
-  float m_1_2 = space->space.XYZtoRGBf[5] * D50_WHITE_REF_Z;
-  float m_2_0 = space->space.XYZtoRGBf[6] * D50_WHITE_REF_X;
-  float m_2_1 = space->space.XYZtoRGBf[7] * D50_WHITE_REF_Y;
-  float m_2_2 = space->space.XYZtoRGBf[8] * D50_WHITE_REF_Z;
+  float m_0_0 = space->space.XYZtoRGBf[0] * D50_WHITE_REF_Xf;
+  float m_0_1 = space->space.XYZtoRGBf[1] * D50_WHITE_REF_Yf;
+  float m_0_2 = space->space.XYZtoRGBf[2] * D50_WHITE_REF_Zf;
+  float m_1_0 = space->space.XYZtoRGBf[3] * D50_WHITE_REF_Xf;
+  float m_1_1 = space->space.XYZtoRGBf[4] * D50_WHITE_REF_Yf;
+  float m_1_2 = space->space.XYZtoRGBf[5] * D50_WHITE_REF_Zf;
+  float m_2_0 = space->space.XYZtoRGBf[6] * D50_WHITE_REF_Xf;
+  float m_2_1 = space->space.XYZtoRGBf[7] * D50_WHITE_REF_Yf;
+  float m_2_2 = space->space.XYZtoRGBf[8] * D50_WHITE_REF_Zf;
   long n = samples;
 
   while (n--)
@@ -2057,9 +2069,9 @@ Labaf_to_rgbaf (const Babl *conversion,
       float fx = fy + A / 500.0f;
       float fz = fy - B / 200.0f;
 
-      float yr = L > LAB_KAPPA * LAB_EPSILON ? cubef (fy) : L / LAB_KAPPA;
-      float xr = cubef (fx) > LAB_EPSILON ? cubef (fx) : (fx * 116.0f - 16.0f) / LAB_KAPPA;
-      float zr = cubef (fz) > LAB_EPSILON ? cubef (fz) : (fz * 116.0f - 16.0f) / LAB_KAPPA;
+      float yr = L > LAB_KAPPAf * LAB_EPSILONf ? cubef (fy) : L / LAB_KAPPAf;
+      float xr = cubef (fx) > LAB_EPSILONf ? cubef (fx) : (fx * 116.0f - 16.0f) / LAB_KAPPAf;
+      float zr = cubef (fz) > LAB_EPSILONf ? cubef (fz) : (fz * 116.0f - 16.0f) / LAB_KAPPAf;
 
       float r = m_0_0 * xr + m_0_1 * yr + m_0_2 * zr;
       float g = m_1_0 * xr + m_1_1 * yr + m_1_2 * zr;
@@ -2090,7 +2102,7 @@ Labf_to_Lchabf (const Babl *conversion,
       float B = src[2];
 
       float C = sqrtf (A * A + B * B);
-      float H = atan2f (B, A) * DEGREES_PER_RADIAN;
+      float H = atan2f (B, A) * DEGREES_PER_RADIANf;
 
       // Keep H within the range 0-360
       if (H < 0.0f)
@@ -2119,8 +2131,8 @@ Lchabf_to_Labf (const Babl *conversion,
       float C = src[1];
       float H = src[2];
 
-      float A = C * cosf (H * RADIANS_PER_DEGREE);
-      float B = C * sinf (H * RADIANS_PER_DEGREE);
+      float A = C * cosf (H * RADIANS_PER_DEGREEf);
+      float B = C * sinf (H * RADIANS_PER_DEGREEf);
 
       dst[0] = L;
       dst[1] = A;
@@ -2147,7 +2159,7 @@ Labaf_to_Lchabaf (const Babl *conversion,
       float a = src[3];
 
       float C = sqrtf (A * A + B * B);
-      float H = atan2f (B, A) * DEGREES_PER_RADIAN;
+      float H = atan2f (B, A) * DEGREES_PER_RADIANf;
 
       // Keep H within the range 0-360
       if (H < 0.0f)
@@ -2178,8 +2190,8 @@ Lchabaf_to_Labaf (const Babl *conversion,
       float H = src[2];
       float a = src[3];
 
-      float A = C * cosf (H * RADIANS_PER_DEGREE);
-      float B = C * sinf (H * RADIANS_PER_DEGREE);
+      float A = C * cosf (H * RADIANS_PER_DEGREEf);
+      float B = C * sinf (H * RADIANS_PER_DEGREEf);
 
       dst[0] = L;
       dst[1] = A;
@@ -2309,7 +2321,7 @@ Yf_to_Lf_sse2 (const Babl  *conversion,
   while (remainder--)
     {
       float yr = src[0];
-      float L  = yr > LAB_EPSILON ? 116.0f * _cbrtf (yr) - 16 : LAB_KAPPA * yr;
+      float L  = yr > LAB_EPSILONf ? 116.0f * _cbrtf (yr) - 16 : LAB_KAPPAf * yr;
 
       dst[0] = L;
 
@@ -2353,7 +2365,7 @@ Yaf_to_Lf_sse2 (const Babl  *conversion,
   while (remainder--)
     {
       float yr = src[0];
-      float L  = yr > LAB_EPSILON ? 116.0f * _cbrtf (yr) - 16 : LAB_KAPPA * yr;
+      float L  = yr > LAB_EPSILONf ? 116.0f * _cbrtf (yr) - 16 : LAB_KAPPAf * yr;
 
       dst[0] = L;
 
@@ -2369,9 +2381,9 @@ rgbaf_to_Lf_sse2 (const Babl  *conversion,
                   long         samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
-  const float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Y;
-  const float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Y;
-  const float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Y;
+  const float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Yf;
+  const float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Yf;
+  const float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Yf;
   long i = 0;
   long remainder;
 
@@ -2419,7 +2431,7 @@ rgbaf_to_Lf_sse2 (const Babl  *conversion,
       float b = src[2];
 
       float yr = m_1_0 * r + m_1_1 * g + m_1_2 * b;
-      float L = yr > LAB_EPSILON ? 116.0f * _cbrtf (yr) - 16 : LAB_KAPPA * yr;
+      float L = yr > LAB_EPSILONf ? 116.0f * _cbrtf (yr) - 16 : LAB_KAPPAf * yr;
 
       dst[0] = L;
 
@@ -2435,15 +2447,15 @@ rgbaf_to_Labaf_sse2 (const Babl  *conversion,
                      long         samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
-  const float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_X;
-  const float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_X;
-  const float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_X;
-  const float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Y;
-  const float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Y;
-  const float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Y;
-  const float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Z;
-  const float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Z;
-  const float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Z;
+  const float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_Xf;
+  const float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_Xf;
+  const float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_Xf;
+  const float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Yf;
+  const float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Yf;
+  const float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Yf;
+  const float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Zf;
+  const float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Zf;
+  const float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Zf;
   long i = 0;
   long remainder;
 
@@ -2523,9 +2535,9 @@ rgbaf_to_Labaf_sse2 (const Babl  *conversion,
       float yr = m_1_0 * r + m_1_1 * g + m_1_2 * b;
       float zr = m_2_0 * r + m_2_1 * g + m_2_2 * b;
 
-      float fx = xr > LAB_EPSILON ? _cbrtf (xr) : (LAB_KAPPA * xr + 16.0f) / 116.0f;
-      float fy = yr > LAB_EPSILON ? _cbrtf (yr) : (LAB_KAPPA * yr + 16.0f) / 116.0f;
-      float fz = zr > LAB_EPSILON ? _cbrtf (zr) : (LAB_KAPPA * zr + 16.0f) / 116.0f;
+      float fx = xr > LAB_EPSILONf ? _cbrtf (xr) : (LAB_KAPPAf * xr + 16.0f) / 116.0f;
+      float fy = yr > LAB_EPSILONf ? _cbrtf (yr) : (LAB_KAPPAf * yr + 16.0f) / 116.0f;
+      float fz = zr > LAB_EPSILONf ? _cbrtf (zr) : (LAB_KAPPAf * zr + 16.0f) / 116.0f;
 
       float L = 116.0f * fy - 16.0f;
       float A = 500.0f * (fx - fy);
@@ -2975,6 +2987,20 @@ formats (void)
     NULL);
 
   babl_format_new (
+    "name", "CIE Lab alpha u8",
+    babl_model ("CIE Lab alpha"),
+
+    babl_type ("CIE u8 L"),
+    babl_component ("CIE L"),
+    babl_type ("CIE u8 ab"),
+    babl_component ("CIE a"),
+    babl_type ("CIE u8 ab"),
+    babl_component ("CIE b"),
+    babl_type ("u8"),
+    babl_component ("A"),
+    NULL);
+
+  babl_format_new (
     "name", "CIE Lab u16",
     babl_model ("CIE Lab"),
 
@@ -2987,6 +3013,20 @@ formats (void)
     NULL);
 
   babl_format_new (
+    "name", "CIE Lab alpha u16",
+    babl_model ("CIE Lab alpha"),
+
+    babl_type ("CIE u16 L"),
+    babl_component ("CIE L"),
+    babl_type ("CIE u16 ab"),
+    babl_component ("CIE a"),
+    babl_type ("CIE u16 ab"),
+    babl_component ("CIE b"),
+    babl_type ("u16"),
+    babl_component ("A"),
+    NULL);
+
+  babl_format_new (
     "name", "CIE xyY float",
     babl_model ("CIE xyY"),
 
diff --git a/extensions/HSL.c b/extensions/HSL.c
index bf48f34..73b2f03 100644
--- a/extensions/HSL.c
+++ b/extensions/HSL.c
@@ -63,7 +63,6 @@ hue2cpn  (double  p,
           
 int init (void);
 
-
 int
 init (void)
 {
diff --git a/extensions/babl-verify-cpu.inc b/extensions/babl-verify-cpu.inc
new file mode 100644
index 0000000..d656445
--- /dev/null
+++ b/extensions/babl-verify-cpu.inc
@@ -0,0 +1,63 @@
+#include "babl-cpuaccel.h"
+
+#ifdef X86_64_V2
+#define BABL_SIMD_x86_64_v2
+#define BABL_SIMD_SUFFIX(symbol) symbol##_x86_64_v2
+#else
+#ifdef X86_64_V3
+#define BABL_SIMD_x86_64_v3
+#define BABL_SIMD_SUFFIX(symbol) symbol##_x86_64_v3
+#else
+#define BABL_SIMD_generic
+#define BABL_SIMD_SUFFIX(symbol) symbol##_generic
+#endif
+#endif
+
+#define BABL_VERIFY_CPU()  do{}while(0)
+
+
+#ifdef BABL_SIMDFREE
+
+#ifdef ARCH_X86_64
+  
+#undef BABL_VERIFY_CPU
+#define BABL_VERIFY_CPU()\
+  if ((babl_cpu_accel_get_support() & BABL_CPU_ACCEL_X86_64_V2)\
+                                       == BABL_CPU_ACCEL_X86_64_V2) return 0;\
+  if ((babl_cpu_accel_get_support() & BABL_CPU_ACCEL_X86_64_V3)\
+                                       == BABL_CPU_ACCEL_X86_64_V3) return 0;
+
+#endif
+
+#ifdef ARCH_ARM
+#undef BABL_VERIFY_CPU
+#define BABL_VERIFY_CPU()\
+  if ((babl_cpu_accel_get_support() & BABL_CPU_ACCEL_ARM_NEON)\
+                                       == BABL_CPU_ACCEL_ARM_NEON) return 0;
+
+#endif
+#endif
+
+#ifdef X86_64_V2
+#undef BABL_VERIFY_CPU
+#define BABL_VERIFY_CPU()\
+  if ((babl_cpu_accel_get_support() & BABL_CPU_ACCEL_X86_64_V2)\
+                                       != BABL_CPU_ACCEL_X86_64_V2) return 0;\
+  if ((babl_cpu_accel_get_support() & BABL_CPU_ACCEL_X86_64_V3)\
+                                       == BABL_CPU_ACCEL_X86_64_V3) return 0;
+#endif
+
+#ifdef X86_64_V3
+#undef BABL_VERIFY_CPU
+#define BABL_VERIFY_CPU()\
+  if ((babl_cpu_accel_get_support() & BABL_CPU_ACCEL_X86_64_V3)\
+                                       != BABL_CPU_ACCEL_X86_64_V3) return 0;
+#endif
+
+
+#ifdef ARM_NEON
+#undef BABL_VERIFY_CPU
+#define BABL_VERIFY_CPU()\
+  if ((babl_cpu_accel_get_support() & BABL_CPU_ACCEL_ARM_NEON)\
+                                       != BABL_CPU_ACCEL_ARM_NEON) return 0;
+#endif
diff --git a/extensions/cairo.c b/extensions/cairo.c
index 08ccf67..a22eecd 100644
--- a/extensions/cairo.c
+++ b/extensions/cairo.c
@@ -28,8 +28,8 @@ int init (void);
 
 static void
 conv_rgba8_cairo24_le (const Babl    *conversion,
-                       unsigned char *src, 
-                       unsigned char *dst, 
+                       unsigned char *__restrict__ src, 
+                       unsigned char *__restrict__ dst, 
                        long           samples)
 {
   long n = samples;
@@ -49,8 +49,8 @@ conv_rgba8_cairo24_le (const Babl    *conversion,
 
 static void
 conv_rgb8_cairo24_le (const Babl    *conversion,
-                      unsigned char *src, 
-                      unsigned char *dst, 
+                      unsigned char *__restrict__ src, 
+                      unsigned char *__restrict__ dst, 
                       long           samples)
 {
   long n = samples;
@@ -71,8 +71,8 @@ conv_rgb8_cairo24_le (const Babl    *conversion,
 #if 0
 static void
 conv_rgbA8_cairo32_le (const Babl    *conversion,
-                       unsigned char *src,
-                       unsigned char *dst,
+                       unsigned char *__restrict__ src,
+                       unsigned char *__restrict__ dst,
                        long           samples)
 {
   long n = samples;
@@ -93,8 +93,8 @@ conv_rgbA8_cairo32_le (const Babl    *conversion,
 
 static void
 conv_rgbA8_cairo32_le (const Babl    *conversion,
-                       unsigned char *src, 
-                       unsigned char *dst, 
+                       unsigned char *__restrict__ src, 
+                       unsigned char *__restrict__ dst, 
                        long           samples)
 {
   long n = samples;
@@ -113,31 +113,10 @@ conv_rgbA8_cairo32_le (const Babl    *conversion,
 }
 #endif
 
-static void
-conv_cairo32_rgbA8_le (const Babl    *conversion,
-                       unsigned char *src,
-                       unsigned char *dst,
-                       long           samples)
-{
-  long n = samples;
-  while (n--)
-    {
-      unsigned char blue   = *src++;
-      unsigned char green  = *src++;
-      unsigned char red    = *src++;
-      unsigned char alpha  = *src++;
-
-      *dst++ = red;
-      *dst++ = green;
-      *dst++ = blue;
-      *dst++ = alpha;
-    }
-}
-
 static void 
 conv_cairo32_rgba8_le (const Babl    *conversion,
-                       unsigned char *src, 
-                       unsigned char *dst, 
+                       unsigned char *__restrict__ src, 
+                       unsigned char *__restrict__ dst, 
                        long           samples)
 {
   long n = samples;
@@ -164,18 +143,10 @@ conv_cairo32_rgba8_le (const Babl    *conversion,
       }
       else
       {
-        float falpha = alpha / 255.0;
-        float recip_alpha = 1.0 / falpha;
- //       unsigned int aa = ((255 << 16) + alpha) / falpha + 0.5;
-
-
-        *dst++ = ((red/255.0) * recip_alpha) * 255 + 0.5f;
-        *dst++ = ((green/255.0) * recip_alpha) * 255 + 0.5f;
-        *dst++ = ((blue/255.0) * recip_alpha) * 255 + 0.5f;
-
-//        *dst++ = (red   * aa + 0x8000) >> 16;
-//        *dst++ = (green * aa + 0x8000) >> 16;
-//        *dst++ = (blue  * aa + 0x8000) >> 16;
+        float falpha = alpha / 255.0f;
+        *dst++ = red / falpha + 0.5f;
+        *dst++ = green / falpha + 0.5f;
+        *dst++ = blue / falpha + 0.5f;
         *dst++ = alpha;
       }
     }
@@ -183,13 +154,12 @@ conv_cairo32_rgba8_le (const Babl    *conversion,
 
 
 static void
-conv_cairo32_rgbAF_le (const Babl    *conversion,
-                       unsigned char *src,
-                       unsigned char *dst_char,
+conv_cairo32_rgbA8_le (const Babl    *conversion,
+                       unsigned char *__restrict__ src,
+                       unsigned char *__restrict__ dst,
                        long           samples)
 {
   long n = samples;
-  float *dst = (void*)dst_char;
   while (n--)
     {
       unsigned char blue   = *src++;
@@ -197,18 +167,18 @@ conv_cairo32_rgbAF_le (const Babl    *conversion,
       unsigned char red    = *src++;
       unsigned char alpha  = *src++;
 
-      *dst++ = red / 255.0;
-      *dst++ = green / 255.0;
-      *dst++ = blue / 255.0;
-      *dst++ = alpha / 255.0;
+      *dst++ = red;
+      *dst++ = green;
+      *dst++ = blue;
+      *dst++ = alpha;
     }
 }
 
 
 static void
-conv_cairo32_rgbaF_le (const Babl    *conversion,
-                       unsigned char *src,
-                       unsigned char *dst_char,
+conv_cairo32_rgbAF_le (const Babl    *conversion,
+                       unsigned char *__restrict__ src,
+                       unsigned char *__restrict__ dst_char,
                        long           samples)
 {
   long n = samples;
@@ -220,24 +190,17 @@ conv_cairo32_rgbaF_le (const Babl    *conversion,
       unsigned char red    = *src++;
       unsigned char alpha  = *src++;
 
-      float reciprocal_alpha = 0.0f;
-
-      if (alpha)
-        reciprocal_alpha = 1.0f/(alpha/255.0f) / 255.0f;
-      
-
-      *dst++ = red * reciprocal_alpha;
-      *dst++ = green * reciprocal_alpha;
-      *dst++ = blue * reciprocal_alpha;
-      *dst++ = alpha / 255.0;
+      *dst++ = red / 255.0f;
+      *dst++ = green / 255.0f;
+      *dst++ = blue / 255.0f;
+      *dst++ = alpha / 255.0f;
     }
 }
 
-
 static void
 conv_cairo24_cairo32_le (const Babl    *conversion,
-                         unsigned char *src,
-                         unsigned char *dst,
+                         unsigned char *__restrict__ src,
+                         unsigned char *__restrict__ dst,
                          long           samples)
 {
   long n = samples;
@@ -252,9 +215,52 @@ conv_cairo24_cairo32_le (const Babl    *conversion,
 
 
 static void
+conv_cairo32_cairo24_le (const Babl    *conversion,
+                         unsigned char *__restrict__ src,
+                         unsigned char *__restrict__ dst,
+                         long           samples)
+{
+#if 0
+  close .. but not quite acceptable; this is perfect resolution for 1/300
+          /// 1/255 perfect resolution is around 0.0039
+          //  maybe we are good enough! and babls fidelity once 
+          //
+          //  bad conversions are identified should be allowed to use
+          //  the faster practically correct but not truly reversible variants?
+          //
+
+        $ rm ~/.cache/babl/babl-fishes;  ninja && BABL_PATH=extensions BABL_TOLERANCE=0.1 ./tools/babl-verify cairo-ARGB32 cairo-RGB24
+[7/7] Linking target extensions/x86-64-v3-cairo.so
+extensions/x86-64-v3-cairo.so 0: cairo-ARGB32 to cairo-RGB24  error:0.002999 cost:219.000000
+
+
+#endif
+
+
+  long n = samples;
+  while (n--)
+    {
+      int alpha = src[3];
+      if (alpha)
+      {
+        float falpha = (alpha/255.0f);
+        for (int c = 0; c < 3; c++)
+          *dst++ = (*src++)/falpha + .5f;
+      }
+      else
+      {
+        *dst++ = (*src++);
+        *dst++ = (*src++);
+        *dst++ = (*src++);
+      }
+      *dst++ = 0;  src++;
+    }
+}
+
+static void
 conv_rgba8_cairo32_le (const Babl    *conversion,
-                       unsigned char *src,
-                       unsigned char *dst,
+                       unsigned char *__restrict__ src,
+                       unsigned char *__restrict__ dst,
                        long           samples)
 {
   long n = samples;
@@ -295,8 +301,8 @@ conv_rgba8_cairo32_le (const Babl    *conversion,
 
 static void
 conv_rgb8_cairo32_le (const Babl    *conversion,
-                      unsigned char *src, 
-                      unsigned char *dst, 
+                      unsigned char *__restrict__ src, 
+                      unsigned char *__restrict__ dst, 
                       long           samples)
 {
   long n = samples;
@@ -318,8 +324,8 @@ conv_rgb8_cairo32_le (const Babl    *conversion,
 
 static void
 conv_yA8_cairo32_le (const Babl    *conversion,
-                     unsigned char *src, 
-                     unsigned char *dst, 
+                     unsigned char *__restrict__ src, 
+                     unsigned char *__restrict__ dst, 
                      long           samples)
 {
   long n = samples;
@@ -342,8 +348,8 @@ conv_yA8_cairo32_le (const Babl    *conversion,
 
 static void
 conv_yA16_cairo32_le (const Babl    *conversion,
-                      unsigned char *src, 
-                      unsigned char *dst, 
+                      unsigned char *__restrict__ src, 
+                      unsigned char *__restrict__ dst, 
                       long           samples)
 {
   long n = samples;
@@ -362,8 +368,8 @@ conv_yA16_cairo32_le (const Babl    *conversion,
 
 static void
 conv_y8_cairo32_le (const Babl    *conversion,
-                    unsigned char *src, 
-                    unsigned char *dst, 
+                    unsigned char *__restrict__ src, 
+                    unsigned char *__restrict__ dst, 
                     long           samples)
 {
   long n = samples;
@@ -379,8 +385,8 @@ conv_y8_cairo32_le (const Babl    *conversion,
 
 static void
 conv_y16_cairo32_le (const Babl    *conversion,
-                     unsigned char *src, 
-                     unsigned char *dst, 
+                     unsigned char *__restrict__ src, 
+                     unsigned char *__restrict__ dst, 
                      long           samples)
 {
   long n = samples;
@@ -400,8 +406,8 @@ conv_y16_cairo32_le (const Babl    *conversion,
 
 static void
 conv_rgbA_gamma_float_cairo32_le (const Babl    *conversion,
-                                  unsigned char *src,
-                                  unsigned char *dst,
+                                  unsigned char *__restrict__ src,
+                                  unsigned char *__restrict__ dst,
                                   long           samples)
 {
   float *fsrc = (float *) src;
@@ -410,7 +416,7 @@ conv_rgbA_gamma_float_cairo32_le (const Babl    *conversion,
 
   while (n--)
     {
-      int val = fsrc[2] * 255.0f  + 0.5f;
+      int val = fsrc[2] * 255.0f + 0.5f;
       *cdst++ = val >= 0xff ? 0xff : val <= 0 ? 0 : val;
       val = fsrc[1] * 255.0f + 0.5f;
       *cdst++ = val >= 0xff ? 0xff : val <= 0 ? 0 : val;
@@ -424,8 +430,8 @@ conv_rgbA_gamma_float_cairo32_le (const Babl    *conversion,
 
 static void
 conv_rgbafloat_cairo32_le (const Babl    *conversion,
-                           unsigned char *src,
-                           unsigned char *dst,
+                           unsigned char *__restrict__ src,
+                           unsigned char *__restrict__ dst,
                            long           samples)
 {
   const Babl  *space = babl_conversion_get_destination_space (conversion);
@@ -441,17 +447,17 @@ conv_rgbafloat_cairo32_le (const Babl    *conversion,
       float green  = *fsrc++;
       float blue   = *fsrc++;
       float alpha  = *fsrc++;
-      if (alpha >= 1.0)
+      if (alpha >= 1.0f)
       {
-        int val = babl_trc_from_linear (trc[2], blue) * 0xff + 0.5f;
+        int val = babl_trc_from_linear (trc[2], blue) * 0xff;
         *cdst++ = val >= 0xff ? 0xff : val <= 0 ? 0 : val;
-        val = babl_trc_from_linear (trc[1], green) * 0xff + 0.5f;
+        val = babl_trc_from_linear (trc[1], green) * 0xff;
         *cdst++ = val >= 0xff ? 0xff : val <= 0 ? 0 : val;
-        val = babl_trc_from_linear (trc[0], red) * 0xff + 0.5f;
+        val = babl_trc_from_linear (trc[0], red) * 0xff;
         *cdst++ = val >= 0xff ? 0xff : val <= 0 ? 0 : val;
         *cdst++ = 0xff;
       }
-      else if (alpha <= 0.0)
+      else if (alpha <= 0.0f)
       {
         (*(uint32_t*)cdst)=0;
         cdst+=4;
@@ -459,13 +465,14 @@ conv_rgbafloat_cairo32_le (const Babl    *conversion,
       else
       {
         float balpha = alpha * 0xff;
-        int val = babl_trc_from_linear (trc[2], blue) * balpha + 0.5f;
+        int val = babl_trc_from_linear (trc[2], blue) * balpha;
         *cdst++ = val >= 0xff ? 0xff : val <= 0 ? 0 : val;
-        val = babl_trc_from_linear (trc[1], green) * balpha + 0.5f;
+        val = babl_trc_from_linear (trc[1], green) * balpha;
         *cdst++ = val >= 0xff ? 0xff : val <= 0 ? 0 : val;
-        val = babl_trc_from_linear (trc[0], red) * balpha + 0.5f;
+        val = babl_trc_from_linear (trc[0], red) * balpha;
         *cdst++ = val >= 0xff ? 0xff : val <= 0 ? 0 : val;
-        *cdst++ = balpha + 0.5f;
+        if (balpha > 255)balpha=255;
+        *cdst++ = balpha;
       }
     }
 }
@@ -473,8 +480,8 @@ conv_rgbafloat_cairo32_le (const Babl    *conversion,
 
 static void
 conv_yafloat_cairo32_le (const Babl    *conversion,
-                         unsigned char *src,
-                         unsigned char *dst,
+                         unsigned char *__restrict__ src,
+                         unsigned char *__restrict__ dst,
                          long           samples)
 {
   const Babl  *space = babl_conversion_get_destination_space (conversion);
@@ -487,16 +494,16 @@ conv_yafloat_cairo32_le (const Babl    *conversion,
     {
       float gray   = *fsrc++;
       float alpha  = *fsrc++;
-      if (alpha >= 1.0)
+      if (alpha >= 1.0f)
       {
-        int val = babl_trc_from_linear (trc[0], gray) * 0xff + 0.5f;
+        int val = babl_trc_from_linear (trc[0], gray) * 0xff;
         val = val >= 0xff ? 0xff : val <= 0 ? 0 : val;
         *cdst++ = val;
         *cdst++ = val;
         *cdst++ = val;
         *cdst++ = 0xff;
       }
-      else if (alpha <= 0.0)
+      else if (alpha <= 0.0f)
       {
         (*(uint32_t*)cdst)=0;
         cdst+=4;
@@ -517,8 +524,8 @@ conv_yafloat_cairo32_le (const Babl    *conversion,
 
 static void
 conv_yafloat_nl_cairo32_le (const Babl    *conversion,
-                            unsigned char *src,
-                            unsigned char *dst,
+                            unsigned char *__restrict__ src,
+                            unsigned char *__restrict__ dst,
                             long           samples)
 {
   float *fsrc = (float *) src;
@@ -529,16 +536,16 @@ conv_yafloat_nl_cairo32_le (const Babl    *conversion,
     {
       float gray   = *fsrc++;
       float alpha  = *fsrc++;
-      if (alpha >= 1.0)
+      if (alpha >= 1.0f)
       {
-        int val = gray * 0xff + 0.5f;
+        int val = gray * 0xff;
         val = val >= 0xff ? 0xff : val <= 0 ? 0 : val;
         *cdst++ = val;
         *cdst++ = val;
         *cdst++ = val;
         *cdst++ = 0xff;
       }
-      else if (alpha <= 0.0)
+      else if (alpha <= 0.0f)
       {
         (*(uint32_t*)cdst)=0;
         cdst+=4;
@@ -556,6 +563,7 @@ conv_yafloat_nl_cairo32_le (const Babl    *conversion,
     }
 }
 
+#include "babl-verify-cpu.inc"
 
 int
 init (void)
@@ -563,6 +571,7 @@ init (void)
   int   testint  = 23;
   char *testchar = (char*) &testint;
   int   littleendian = (testchar[0] == 23);
+  BABL_VERIFY_CPU();
 
   if (littleendian)
     {
@@ -601,12 +610,12 @@ init (void)
                            conv_cairo32_rgba8_le, NULL);
 
 
-      babl_conversion_new (f32, babl_format ("R'G'B'A float"), "linear",
-                           conv_cairo32_rgbaF_le, NULL);
-
       babl_conversion_new (f24, f32, "linear",
                            conv_cairo24_cairo32_le, NULL);
 
+      babl_conversion_new (f32, f24, "linear",
+                           conv_cairo32_cairo24_le, NULL);
+
       babl_conversion_new (babl_format ("R'aG'aB'aA u8"), f32, "linear",
                            conv_rgbA8_cairo32_le, NULL);
 
diff --git a/extensions/double.c b/extensions/double.c
index fe29cd9..21fc581 100644
--- a/extensions/double.c
+++ b/extensions/double.c
@@ -222,10 +222,13 @@ conv_rgbD_linear_rgbaD_linear (const Babl    *conversion,
   babl_conversion_new (src, dst, "linear", conv_ ## src ## _ ## dst, NULL)
 
 int init (void);
+#include "babl-verify-cpu.inc"
 
 int
 init (void)
 {
+  BABL_VERIFY_CPU();
+  {
   const Babl *rgbaD_linear = babl_format_new (
     babl_model ("RGBA"),
     babl_type ("double"),
@@ -288,6 +291,7 @@ init (void)
   o (rgbaD_linear, rgbD_linear);
   o (rgbaD_gamma, rgbD_gamma);
 
+  }
   return 0;
 }
 
diff --git a/extensions/fast-float.c b/extensions/fast-float.c
index 8730046..659d60f 100644
--- a/extensions/fast-float.c
+++ b/extensions/fast-float.c
@@ -146,15 +146,15 @@ babl_lookup_new (BablLookupFunction function,
       end = u.f;
     }
 
-       if (precision <= 0.000005) shift =  0; /* checked for later */
-  else if (precision <= 0.000010) shift =  8;
-  else if (precision <= 0.000020) shift =  9;
-  else if (precision <= 0.000040) shift = 10;
-  else if (precision <= 0.000081) shift = 11;
-  else if (precision <= 0.000161) shift = 12;
-  else if (precision <= 0.000200) shift = 13;
-  else if (precision <= 0.000324) shift = 14;
-  else if (precision <= 0.000649) shift = 15;
+       if (precision <= 0.000005f) shift =  0; /* checked for later */
+  else if (precision <= 0.000010f) shift =  8;
+  else if (precision <= 0.000020f) shift =  9;
+  else if (precision <= 0.000040f) shift = 10;
+  else if (precision <= 0.000081f) shift = 11;
+  else if (precision <= 0.000161f) shift = 12;
+  else if (precision <= 0.000200f) shift = 13;
+  else if (precision <= 0.000324f) shift = 14;
+  else if (precision <= 0.000649f) shift = 15;
   else shift = 16; /* a bit better than 8bit sRGB quality */
 
 
@@ -162,16 +162,16 @@ babl_lookup_new (BablLookupFunction function,
    * causes lookups very close to zero to be passed directly to the
    * function instead.
    */
-  if (start == 0.0)
+  if (start == 0.0f)
     start = precision;
-  if (end == 0.0)
+  if (end == 0.0f)
     end = -precision;
 
   /* Compute start and */
 
-  if (start < 0.0 || end < 0.0)
+  if (start < 0.0f || end < 0.0f)
     {
-      if (end < 0.0)
+      if (end < 0.0f)
         {
           u.f = start;
           positive_max = (u.i << LSHIFT) >> shift;
@@ -301,7 +301,7 @@ conv_rgbaF_linear_rgbAF_gamma (const Babl    *conversion,
        float green = *fsrc++;
        float blue  = *fsrc++;
        float alpha = *fsrc++;
-       if (alpha == 1.0)
+       if (alpha == 1.0f)
        {
          *fdst++ = linear_to_gamma_2_2_lut (red);
          *fdst++ = linear_to_gamma_2_2_lut (green);
@@ -352,7 +352,7 @@ conv_rgbaF_linear_rgba8_gamma (const Babl    *conversion,
        *cdst++ = val >= 0xff ? 0xff : val <= 0 ? 0 : val;
        val = linear_to_gamma_2_2_lut (blue) * 0xff + 0.5f;
        *cdst++ = val >= 0xff ? 0xff : val <= 0 ? 0 : val;
-       val = alpha * 0xff + 0.5;
+       val = alpha * 0xff + 0.5f;
        *cdst++ = val >= 0xff ? 0xff : val <= 0 ? 0 : val;
        }
      }
@@ -374,7 +374,7 @@ conv_rgbaF_linear_rgbA8_gamma (const Babl    *conversion,
        float green = *fsrc++;
        float blue  = *fsrc++;
        float alpha = *fsrc++;
-       if (alpha >= 1.0)
+       if (alpha >= 1.0f)
        {
          int val = linear_to_gamma_2_2_lut (red) * 0xff + 0.5f;
          *cdst++ = val >= 0xff ? 0xff : val <= 0 ? 0 : val;
@@ -411,7 +411,7 @@ conv_yaF_linear_rgbA8_gamma (const Babl *conversion,unsigned char *src,
      {
        float gray = *fsrc++;
        float alpha = *fsrc++;
-       if (alpha >= 1.0)
+       if (alpha >= 1.0f)
        {
          int val = linear_to_gamma_2_2_lut (gray) * 0xff + 0.5f;
          *cdst++ = val >= 0xff ? 0xff : val <= 0 ? 0 : val;
@@ -419,7 +419,7 @@ conv_yaF_linear_rgbA8_gamma (const Babl *conversion,unsigned char *src,
          *cdst++ = val >= 0xff ? 0xff : val <= 0 ? 0 : val;
          *cdst++ = 0xff;
        }
-       else if (alpha <= 0.0)
+       else if (alpha <= 0.0f)
        {
          *((uint32_t*)(cdst))=0;
 	     cdst+=4;
@@ -453,7 +453,7 @@ conv_rgbaF_linear_rgbA8_gamma_cairo (const Babl *conversion,unsigned char *src,
       float green = *fsrc++;
       float blue  = *fsrc++;
       float alpha = *fsrc++;
-      if (alpha >= 1.0)
+      if (alpha >= 1.0f)
       {
         int val = linear_to_gamma_2_2_lut (blue) * 0xff + 0.5f;
         *cdst++ = val >= 0xff ? 0xff : val <= 0 ? 0 : val;
@@ -494,7 +494,7 @@ conv_rgbAF_linear_rgbAF_gamma (const Babl    *conversion,
       float blue  = *fsrc++;
       float alpha = *fsrc++;
 
-      if (alpha == 1.0)
+      if (alpha == 1.0f)
         {
           *fdst++ = linear_to_gamma_2_2_lut (red);
           *fdst++ = linear_to_gamma_2_2_lut (green);
@@ -503,7 +503,7 @@ conv_rgbAF_linear_rgbAF_gamma (const Babl    *conversion,
         }
       else
         {
-          float alpha_recip = 1.0 / alpha;
+          float alpha_recip = 1.0f / alpha;
           *fdst++ = linear_to_gamma_2_2_lut (red   * alpha_recip) * alpha;
           *fdst++ = linear_to_gamma_2_2_lut (green * alpha_recip) * alpha;
           *fdst++ = linear_to_gamma_2_2_lut (blue  * alpha_recip) * alpha;
@@ -589,11 +589,14 @@ conv_rgbF_gamma_rgbF_linear (const Babl    *conversion,
 #define o(src, dst) \
   babl_conversion_new (src, dst, "linear", conv_ ## src ## _ ## dst, NULL)
 
-int init (void);
+#include "babl-verify-cpu.inc"
 
+int init (void);
 int
 init (void)
 {
+  BABL_VERIFY_CPU();
+  {
   const Babl *yaF_linear = babl_format_new (
     babl_model ("YA"),
     babl_type ("float"),
@@ -674,10 +677,10 @@ init (void)
     float a;
 
     /* tweaking the precision - does impact speed.. */
-    fast_pow = babl_lookup_new (core_lookup, NULL, 0.0, 1.0,   0.000199);
-    fast_rpow = babl_lookup_new (core_rlookup, NULL, 0.0, 1.0, 0.000250);
+    fast_pow = babl_lookup_new (core_lookup, NULL, 0.0f, 1.0f,   0.000199f);
+    fast_rpow = babl_lookup_new (core_rlookup, NULL, 0.0f, 1.0f, 0.000250f);
 
-    for (f = 0.0; f < 1.0; f+= 0.0000001)
+    for (f = 0.0; f < 1.0f; f+= 0.0000001f)
       {
         a = linear_to_gamma_2_2_lut (f);
         a = gamma_2_2_to_linear_lut (f);
@@ -713,6 +716,7 @@ init (void)
   o (rgbF_linear,  rgbF_gamma);
   o (rgbF_gamma,   rgbF_linear);
   o (yaF_linear,   rgbA8_gamma);
+  }
   return 0;
 }
 
diff --git a/extensions/float.c b/extensions/float.c
index cd34421..cbe042c 100644
--- a/extensions/float.c
+++ b/extensions/float.c
@@ -31,8 +31,8 @@ static const Babl *trc_srgb = NULL;
 
 static void
 conv_yaF_linear_yAF_linear (const Babl    *conversion,
-                            unsigned char *src,
-                            unsigned char *dst,
+                            unsigned char *__restrict__ src,
+                            unsigned char *__restrict__ dst,
                             long           samples)
 {
    float *fsrc = (float *) src;
@@ -52,8 +52,8 @@ conv_yaF_linear_yAF_linear (const Babl    *conversion,
 
 static void
 conv_yAF_linear_yaF_linear (const Babl    *conversion,
-                            unsigned char *src,
-                            unsigned char *dst,
+                            unsigned char *__restrict__ src,
+                            unsigned char *__restrict__ dst,
                             long           samples)
 {
    float *fsrc = (float *) src;
@@ -73,8 +73,8 @@ conv_yAF_linear_yaF_linear (const Babl    *conversion,
 
 static void
 conv_yaF_linear_yAF_nonlinear (const Babl    *conversion,
-                               unsigned char *src,
-                               unsigned char *dst,
+                               unsigned char *__restrict__ src,
+                               unsigned char *__restrict__ dst,
                                long           samples)
 {
    const Babl  *space = babl_conversion_get_destination_space (conversion);
@@ -96,8 +96,8 @@ conv_yaF_linear_yAF_nonlinear (const Babl    *conversion,
 
 static void
 conv_rgbaF_linear_rgbAF_nonlinear (const Babl    *conversion,
-                                   unsigned char *src,
-                                   unsigned char *dst,
+                                   unsigned char *__restrict__ src,
+                                   unsigned char *__restrict__ dst,
                                    long           samples)
 {
    const Babl  *space = babl_conversion_get_destination_space (conversion);
@@ -121,8 +121,8 @@ conv_rgbaF_linear_rgbAF_nonlinear (const Babl    *conversion,
 
 static void
 conv_rgbaF_linear_rgbAF_perceptual (const Babl    *conversion,
-                                    unsigned char *src,
-                                    unsigned char *dst,
+                                    unsigned char *__restrict__ src,
+                                    unsigned char *__restrict__ dst,
                                     long           samples)
 {
    float *fsrc = (float *) src;
@@ -144,8 +144,8 @@ conv_rgbaF_linear_rgbAF_perceptual (const Babl    *conversion,
 
 static void
 conv_rgbAF_linear_rgbAF_nonlinear (const Babl    *conversion,
-                                   unsigned char *src,
-                                   unsigned char *dst,
+                                   unsigned char *__restrict__ src,
+                                   unsigned char *__restrict__ dst,
                                    long           samples)
 {
    const Babl  *space = babl_conversion_get_destination_space (conversion);
@@ -168,7 +168,7 @@ conv_rgbAF_linear_rgbAF_nonlinear (const Babl    *conversion,
          }
        else
          {
-           float alpha_recip = 1.0 / alpha;
+           float alpha_recip = 1.0f / alpha;
            *fdst++ = babl_trc_from_linear (trc[0], *fsrc++ * alpha_recip) * alpha;
            *fdst++ = babl_trc_from_linear (trc[1], *fsrc++ * alpha_recip) * alpha;
            *fdst++ = babl_trc_from_linear (trc[2], *fsrc++ * alpha_recip) * alpha;
@@ -180,8 +180,8 @@ conv_rgbAF_linear_rgbAF_nonlinear (const Babl    *conversion,
 
 static void
 conv_yAF_linear_yAF_nonlinear (const Babl    *conversion,
-                               unsigned char *src,
-                               unsigned char *dst,
+                               unsigned char *__restrict__ src,
+                               unsigned char *__restrict__ dst,
                                long           samples)
 {
    const Babl  *space = babl_conversion_get_destination_space (conversion);
@@ -204,7 +204,7 @@ conv_yAF_linear_yAF_nonlinear (const Babl    *conversion,
          }
        else
          {
-           float alpha_recip = 1.0 / alpha;
+           float alpha_recip = 1.0f / alpha;
            *fdst++ = babl_trc_from_linear (trc[0], *fsrc++ * alpha_recip) * alpha;
            *fdst++ = *fsrc++;
          }
@@ -215,8 +215,8 @@ conv_yAF_linear_yAF_nonlinear (const Babl    *conversion,
 
 static void
 conv_rgbAF_linear_rgbAF_perceptual (const Babl    *conversion,
-                                    unsigned char *src,
-                                    unsigned char *dst,
+                                    unsigned char *__restrict__ src,
+                                    unsigned char *__restrict__ dst,
                                     long           samples)
 {
 
@@ -249,8 +249,8 @@ conv_rgbAF_linear_rgbAF_perceptual (const Babl    *conversion,
 
 static void
 conv_yaF_linear_yaF_nonlinear (const Babl    *conversion,
-                                   unsigned char *src, 
-                                   unsigned char *dst, 
+                                   unsigned char *__restrict__ src, 
+                                   unsigned char *__restrict__ dst, 
                                    long           samples)
 {
    const Babl  *space = babl_conversion_get_destination_space (conversion);
@@ -258,19 +258,13 @@ conv_yaF_linear_yaF_nonlinear (const Babl    *conversion,
 
    float *fsrc = (float *) src;
    float *fdst = (float *) dst;
-   int n = samples;
-
-   while (n--)
-     {
-       *fdst++ = babl_trc_from_linear (trc[0], *fsrc++);
-       *fdst++ = *fsrc++;
-     }
+   babl_trc_from_linear_buf (trc[0], fsrc, fdst, 2, 2, 1, samples);
 }
 
 static void
 conv_rgbaF_linear_rgbaF_nonlinear (const Babl    *conversion,
-                                   unsigned char *src, 
-                                   unsigned char *dst, 
+                                   unsigned char *__restrict__ src, 
+                                   unsigned char *__restrict__ dst, 
                                    long           samples)
 {
    const Babl  *space = babl_conversion_get_destination_space (conversion);
@@ -291,46 +285,33 @@ conv_rgbaF_linear_rgbaF_nonlinear (const Babl    *conversion,
 
 static void
 conv_rgbaF_linear_rgbaF_perceptual (const Babl    *conversion,
-                                    unsigned char *src, 
-                                    unsigned char *dst, 
+                                    unsigned char *__restrict__ src, 
+                                    unsigned char *__restrict__ dst, 
                                     long           samples)
 {
    float *fsrc = (float *) src;
    float *fdst = (float *) dst;
-   int n = samples;
-
-   while (n--)
-     {
-       *fdst++ = babl_trc_from_linear (trc_srgb, *fsrc++);
-       *fdst++ = babl_trc_from_linear (trc_srgb, *fsrc++);
-       *fdst++ = babl_trc_from_linear (trc_srgb, *fsrc++);
-       *fdst++ = *fsrc++;
-     }
+   babl_trc_from_linear_buf (trc_srgb, fsrc, fdst, 4, 4, 3, samples);
 }
 
 static void
 conv_yF_linear_yF_nonlinear (const Babl    *conversion,
-                             unsigned char *src,
-                             unsigned char *dst,
+                             unsigned char *__restrict__ src,
+                             unsigned char *__restrict__ dst,
                              long           samples)
 {
    const Babl  *space = babl_conversion_get_destination_space (conversion);
    const Babl **trc   = (void*)space->space.trc;
    float *fsrc = (float *) src;
    float *fdst = (float *) dst;
-   int n = samples;
-
-   while (n--)
-     {
-       *fdst++ = babl_trc_from_linear (trc[0], *fsrc++);
-     }
+   babl_trc_from_linear_buf (trc[0], fsrc, fdst, 1, 1, 1, samples);
 }
 
 
 static void
 conv_rgbF_linear_rgbF_nonlinear (const Babl    *conversion,
-                                 unsigned char *src,
-                                 unsigned char *dst,
+                                 unsigned char *__restrict__ src,
+                                 unsigned char *__restrict__ dst,
                                  long           samples)
 {
    const Babl  *space = babl_conversion_get_destination_space (conversion);
@@ -349,26 +330,19 @@ conv_rgbF_linear_rgbF_nonlinear (const Babl    *conversion,
 
 static void
 conv_rgbF_linear_rgbF_perceptual (const Babl    *conversion,
-                                  unsigned char *src,
-                                  unsigned char *dst,
+                                  unsigned char *__restrict__ src,
+                                  unsigned char *__restrict__ dst,
                                   long           samples)
 {
    float *fsrc = (float *) src;
    float *fdst = (float *) dst;
-   int n = samples;
-
-   while (n--)
-     {
-       *fdst++ = babl_trc_from_linear (trc_srgb, *fsrc++);
-       *fdst++ = babl_trc_from_linear (trc_srgb, *fsrc++);
-       *fdst++ = babl_trc_from_linear (trc_srgb, *fsrc++);
-     }
+   babl_trc_from_linear_buf (trc_srgb, fsrc, fdst, 3, 3, 3, samples);
 }
 
 static void
 conv_rgbaF_nonlinear_rgbaF_linear (const Babl    *conversion,
-                                   unsigned char *src,
-                                   unsigned char *dst,
+                                   unsigned char *__restrict__ src,
+                                   unsigned char *__restrict__ dst,
                                    long           samples)
 {
    const Babl  *space = babl_conversion_get_destination_space (conversion);
@@ -389,48 +363,34 @@ conv_rgbaF_nonlinear_rgbaF_linear (const Babl    *conversion,
 
 static void
 conv_yaF_nonlinear_yaF_linear (const Babl    *conversion,
-                               unsigned char *src,
-                               unsigned char *dst,
+                               unsigned char *__restrict__ src,
+                               unsigned char *__restrict__ dst,
                                long           samples)
 {
    const Babl  *space = babl_conversion_get_destination_space (conversion);
    const Babl **trc   = (void*)space->space.trc;
    float *fsrc = (float *) src;
    float *fdst = (float *) dst;
-   int n = samples;
-
-   while (n--)
-     {
-       *fdst++ = babl_trc_to_linear (trc[0], *fsrc++);
-       *fdst++ = *fsrc++;
-     }
+   babl_trc_to_linear_buf (trc[0], fsrc, fdst, 2, 2, 1, samples);
 }
 
 
 static void
 conv_rgbaF_perceptual_rgbaF_linear (const Babl    *conversion,
-                                    unsigned char *src,
-                                    unsigned char *dst,
+                                    unsigned char *__restrict__ src,
+                                    unsigned char *__restrict__ dst,
                                     long           samples)
 {
    float *fsrc = (float *) src;
    float *fdst = (float *) dst;
-   int n = samples;
-
-   while (n--)
-     {
-       *fdst++ = babl_trc_to_linear (trc_srgb, *fsrc++);
-       *fdst++ = babl_trc_to_linear (trc_srgb, *fsrc++);
-       *fdst++ = babl_trc_to_linear (trc_srgb, *fsrc++);
-       *fdst++ = *fsrc++;
-     }
+   babl_trc_to_linear_buf (trc_srgb, fsrc, fdst, 4, 4, 3, samples);
 }
 
 
 static void
 conv_rgbF_nonlinear_rgbF_linear (const Babl    *conversion,
-                                 unsigned char *src,
-                                 unsigned char *dst,
+                                 unsigned char *__restrict__ src,
+                                 unsigned char *__restrict__ dst,
                                  long           samples)
 {
    const Babl  *space = babl_conversion_get_destination_space (conversion);
@@ -450,38 +410,27 @@ conv_rgbF_nonlinear_rgbF_linear (const Babl    *conversion,
 
 static void
 conv_yF_nonlinear_yF_linear (const Babl    *conversion,
-                                 unsigned char *src,
-                                 unsigned char *dst,
+                                 unsigned char *__restrict__ src,
+                                 unsigned char *__restrict__ dst,
                                  long           samples)
 {
    const Babl  *space = babl_conversion_get_destination_space (conversion);
    const Babl **trc   = (void*)space->space.trc;
    float *fsrc = (float *) src;
    float *fdst = (float *) dst;
-   int n = samples;
 
-   while (n--)
-     {
-       *fdst++ = babl_trc_to_linear (trc[0], *fsrc++);
-     }
+   babl_trc_to_linear_buf (trc[0], fsrc, fdst, 1, 1, 1, samples);
 }
 
 static void
 conv_rgbF_perceptual_rgbF_linear (const Babl    *conversion,
-                                  unsigned char *src,
-                                  unsigned char *dst,
+                                  unsigned char *__restrict__ src,
+                                  unsigned char *__restrict__ dst,
                                   long           samples)
 {
    float *fsrc = (float *) src;
    float *fdst = (float *) dst;
-   int n = samples;
-
-   while (n--)
-     {
-       *fdst++ = babl_trc_to_linear (trc_srgb, *fsrc++);
-       *fdst++ = babl_trc_to_linear (trc_srgb, *fsrc++);
-       *fdst++ = babl_trc_to_linear (trc_srgb, *fsrc++);
-     }
+   babl_trc_to_linear_buf (trc_srgb, fsrc, fdst, 3, 3, 3, samples);
 }
 
 
@@ -489,10 +438,13 @@ conv_rgbF_perceptual_rgbF_linear (const Babl    *conversion,
   babl_conversion_new (src, dst, "linear", conv_ ## src ## _ ## dst, NULL)
 
 int init (void);
+#include "babl-verify-cpu.inc"
 
 int
 init (void)
 {
+  BABL_VERIFY_CPU();
+  {
   const Babl *yaF_linear = babl_format_new (
     babl_model ("YA"),
     babl_type ("float"),
@@ -621,7 +573,7 @@ init (void)
   o (rgbaF_perceptual,  rgbaF_linear);
   o (rgbF_linear, rgbF_perceptual);
   o (rgbF_perceptual,  rgbF_linear);
-
+  }
   return 0;
 }
 
diff --git a/extensions/gegl-fixups.c b/extensions/gegl-fixups.c
index 45888ce..613a405 100644
--- a/extensions/gegl-fixups.c
+++ b/extensions/gegl-fixups.c
@@ -82,7 +82,7 @@ table_init (void)
       float    f;
       uint32_t s;
     } u;
-    u.f = 0.0;
+    u.f = 0.0f;
 
     //u.s[0] = 0;
 
@@ -91,15 +91,15 @@ table_init (void)
         int c;
         int cg;
 
-        if (u.f <= 0.0)
+        if (u.f <= 0.0f)
           {
             c  = 0;
             cg = 0;
           }
         else
           {
-            c  = (u.f * 255.1619) + 0.5;
-            cg = (linear_to_gamma_2_2 (u.f) * 255.1619) + 0.5;
+            c  = (u.f * 255.1619f) + 0.5f;
+            cg = (linear_to_gamma_2_2 (u.f) * 255.1619f) + 0.5f;
             if (cg > 255) cg = 255;
             if (c > 255) c = 255;
           }
@@ -524,10 +524,13 @@ conv_rgba8_rgb8 (const Babl    *conversion,
 #define conv_gamma_rgbAF_gamma_rgbaF   conv_rgbAF_rgbaF
 
 int init (void);
+#include "babl-verify-cpu.inc"
 
 int
 init (void)
 {
+  BABL_VERIFY_CPU();
+  {
   const Babl *rgbaF = babl_format_new (
     babl_model ("RGBA"),
     babl_type ("float"),
@@ -621,5 +624,6 @@ init (void)
   o (rgba8, rgb8);
   o (ga8, rgbaF);
 
+  }
   return 0;
 }
diff --git a/extensions/gggl-lies.c b/extensions/gggl-lies.c
index 09c4a90..e903c86 100644
--- a/extensions/gggl-lies.c
+++ b/extensions/gggl-lies.c
@@ -50,8 +50,8 @@
 
 static void
 conv_F_8 (const Babl    *conversion,
-          unsigned char *src, 
-          unsigned char *dst, 
+          unsigned char *__restrict__ src, 
+          unsigned char *__restrict__ dst, 
           long           samples)
 {
   long n = samples;
@@ -59,17 +59,17 @@ conv_F_8 (const Babl    *conversion,
   while (n--)
     {
       float f = ((*(float *) src));
-      if (f < 0.0)
+      if (f < 0.0f)
         {
           *(unsigned char *) dst = 0;
         }
-      else if (f > 1.0)
+      else if (f > 1.0f)
         {
           *(unsigned char *) dst = 255;
         }
       else
         {
-          *(unsigned char *) dst = lrint (f * 255.0);
+          *(unsigned char *) dst = lrint (f * 255.0f);
         }
       dst += 1;
       src += 4;
@@ -78,8 +78,8 @@ conv_F_8 (const Babl    *conversion,
 
 static void
 conv_F_16 (const Babl    *conversion,
-           unsigned char *src, 
-           unsigned char *dst, 
+           unsigned char *__restrict__ src, 
+           unsigned char *__restrict__ dst, 
            long           samples)
 {
   long n = samples;
@@ -87,17 +87,17 @@ conv_F_16 (const Babl    *conversion,
   while (n--)
     {
       float f = ((*(float *) src));
-      if (f < 0.0)
+      if (f < 0.0f)
         {
           *(unsigned short *) dst = 0;
         }
-      else if (f > 1.0)
+      else if (f > 1.0f)
         {
           *(unsigned short *) dst = 65535;
         }
       else
         {
-          *(unsigned short *) dst = lrint (f * 65535.0);
+          *(unsigned short *) dst = lrint (f * 65535.0f);
         }
       dst += 2;
       src += 4;
@@ -108,15 +108,15 @@ conv_F_16 (const Babl    *conversion,
 
 static void
 conv_8_F (const Babl    *conversion,
-          unsigned char *src, 
-          unsigned char *dst, 
+          unsigned char *__restrict__ src, 
+          unsigned char *__restrict__ dst, 
           long           samples)
 {
   long n = samples;
 
   while (n--)
     {
-      (*(float *) dst) = (*(unsigned char *) src / 255.0);
+      (*(float *) dst) = (*(unsigned char *) src / 255.0f);
       dst             += 4;
       src             += 1;
     }
@@ -124,8 +124,8 @@ conv_8_F (const Babl    *conversion,
 
 static void
 conv_16_F (const Babl    *conversion,
-           unsigned char *src, 
-           unsigned char *dst, 
+           unsigned char *__restrict__ src, 
+           unsigned char *__restrict__ dst, 
            long           samples)
 {
   long n = samples;
@@ -140,8 +140,8 @@ conv_16_F (const Babl    *conversion,
 
 static void
 conv_F_D (const Babl    *conversion,
-          unsigned char *src, 
-          unsigned char *dst, 
+          unsigned char *__restrict__ src, 
+          unsigned char *__restrict__ dst, 
           long           samples)
 {
   long n = samples;
@@ -156,8 +156,8 @@ conv_F_D (const Babl    *conversion,
 
 static void
 conv_D_F (const Babl    *conversion,
-          unsigned char *src, 
-          unsigned char *dst, 
+          unsigned char *__restrict__ src, 
+          unsigned char *__restrict__ dst, 
           long           samples)
 {
   long n = samples;
@@ -172,8 +172,8 @@ conv_D_F (const Babl    *conversion,
 
 static void
 conv_16_8 (const Babl    *conversion,
-           unsigned char *src, 
-           unsigned char *dst, 
+           unsigned char *__restrict__ src, 
+           unsigned char *__restrict__ dst, 
            long           samples)
 {
   long n = samples;
@@ -200,8 +200,8 @@ conv_16_8 (const Babl    *conversion,
 
 static void
 conv_8_16 (const Babl    *conversion,
-           unsigned char *src, 
-           unsigned char *dst, 
+           unsigned char *__restrict__ src, 
+           unsigned char *__restrict__ dst, 
            long           samples)
 {
   long n = samples;
@@ -217,8 +217,8 @@ conv_8_16 (const Babl    *conversion,
 /*********/
 static void
 conv_rgbaF_rgba8 (const Babl    *conversion,
-                  unsigned char *src, 
-                  unsigned char *dst, 
+                  unsigned char *__restrict__ src, 
+                  unsigned char *__restrict__ dst, 
                   long           samples)
 {
   conv_F_8 (conversion, src, dst, samples * 4);
@@ -228,8 +228,8 @@ conv_rgbaF_rgba8 (const Babl    *conversion,
 
 static void
 conv_rgbF_rgb8 (const Babl    *conversion,
-                unsigned char *src, 
-                unsigned char *dst, 
+                unsigned char *__restrict__ src, 
+                unsigned char *__restrict__ dst, 
                 long           samples)
 {
   conv_F_8 (conversion, src, dst, samples * 3);
@@ -237,8 +237,8 @@ conv_rgbF_rgb8 (const Babl    *conversion,
 
 static void
 conv_gaF_ga8 (const Babl    *conversion,
-              unsigned char *src, 
-              unsigned char *dst, 
+              unsigned char *__restrict__ src, 
+              unsigned char *__restrict__ dst, 
               long           samples)
 {
   conv_F_8 (conversion, src, dst, samples * 2);
@@ -250,8 +250,8 @@ conv_gaF_ga8 (const Babl    *conversion,
 
 static void
 conv_rgbaF_rgba16 (const Babl    *conversion,
-                   unsigned char *src, 
-                   unsigned char *dst, 
+                   unsigned char *__restrict__ src, 
+                   unsigned char *__restrict__ dst, 
                    long           samples)
 {
   conv_F_16 (conversion, src, dst, samples * 4);
@@ -259,8 +259,8 @@ conv_rgbaF_rgba16 (const Babl    *conversion,
 
 static void
 conv_rgbaF_rgbaD (const Babl    *conversion,
-                  unsigned char *src, 
-                  unsigned char *dst, 
+                  unsigned char *__restrict__ src, 
+                  unsigned char *__restrict__ dst, 
                   long           samples)
 {
   conv_F_D (conversion, src, dst, samples * 4);
@@ -268,8 +268,8 @@ conv_rgbaF_rgbaD (const Babl    *conversion,
 
 static void
 conv_rgbaD_rgbaF (const Babl    *conversion,
-                  unsigned char *src, 
-                  unsigned char *dst, 
+                  unsigned char *__restrict__ src, 
+                  unsigned char *__restrict__ dst, 
                   long           samples)
 {
   conv_D_F (conversion, src, dst, samples * 4);
@@ -277,8 +277,8 @@ conv_rgbaD_rgbaF (const Babl    *conversion,
 
 static void
 conv_rgbF_rgb16 (const Babl    *conversion,
-                 unsigned char *src, 
-                 unsigned char *dst, 
+                 unsigned char *__restrict__ src, 
+                 unsigned char *__restrict__ dst, 
                  long           samples)
 {
   conv_F_16 (conversion, src, dst, samples * 3);
@@ -286,8 +286,8 @@ conv_rgbF_rgb16 (const Babl    *conversion,
 
 static void
 conv_gaF_ga16 (const Babl    *conversion,
-               unsigned char *src, 
-               unsigned char *dst, 
+               unsigned char *__restrict__ src, 
+               unsigned char *__restrict__ dst, 
                long           samples)
 {
   conv_F_16 (conversion, src, dst, samples * 2);
@@ -301,8 +301,8 @@ conv_gaF_ga16 (const Babl    *conversion,
 
 static void
 conv_rgba8_rgbaF (const Babl    *conversion,
-                  unsigned char *src, 
-                  unsigned char *dst, 
+                  unsigned char *__restrict__ src, 
+                  unsigned char *__restrict__ dst, 
                   long           samples)
 {
   conv_8_F (conversion, src, dst, samples * 4);
@@ -310,8 +310,8 @@ conv_rgba8_rgbaF (const Babl    *conversion,
 
 static void
 conv_rgb8_rgbF (const Babl    *conversion,
-                unsigned char *src, 
-                unsigned char *dst, 
+                unsigned char *__restrict__ src, 
+                unsigned char *__restrict__ dst, 
                 long           samples)
 {
   conv_8_F (conversion, src, dst, samples * 3);
@@ -319,8 +319,8 @@ conv_rgb8_rgbF (const Babl    *conversion,
 
 static void
 conv_ga8_gaF (const Babl    *conversion,
-              unsigned char *src, 
-              unsigned char *dst, 
+              unsigned char *__restrict__ src, 
+              unsigned char *__restrict__ dst, 
               long           samples)
 {
   conv_8_F (conversion, src, dst, samples * 2);
@@ -332,8 +332,8 @@ conv_ga8_gaF (const Babl    *conversion,
 
 static void
 conv_rgba16_rgbaF (const Babl    *conversion,
-                   unsigned char *src, 
-                   unsigned char *dst, 
+                   unsigned char *__restrict__ src, 
+                   unsigned char *__restrict__ dst, 
                    long           samples)
 {
   conv_16_F (conversion, src, dst, samples * 4);
@@ -341,8 +341,8 @@ conv_rgba16_rgbaF (const Babl    *conversion,
 
 static void
 conv_rgb16_rgbF (const Babl    *conversion,
-                 unsigned char *src, 
-                 unsigned char *dst, 
+                 unsigned char *__restrict__ src, 
+                 unsigned char *__restrict__ dst, 
                  long           samples)
 {
   conv_16_F (conversion, src, dst, samples * 3);
@@ -350,8 +350,8 @@ conv_rgb16_rgbF (const Babl    *conversion,
 
 static void
 conv_ga16_gaF (const Babl    *conversion,
-               unsigned char *src, 
-               unsigned char *dst, 
+               unsigned char *__restrict__ src, 
+               unsigned char *__restrict__ dst, 
                long           samples)
 {
   conv_16_F (conversion, src, dst, samples * 2);
@@ -363,8 +363,8 @@ conv_ga16_gaF (const Babl    *conversion,
 
 static void
 conv_rgba16_rgba8 (const Babl    *conversion,
-                   unsigned char *src, 
-                   unsigned char *dst, 
+                   unsigned char *__restrict__ src, 
+                   unsigned char *__restrict__ dst, 
                    long           samples)
 {
   conv_16_8 (conversion, src, dst, samples * 4);
@@ -372,8 +372,8 @@ conv_rgba16_rgba8 (const Babl    *conversion,
 
 static void
 conv_rgb16_rgb8 (const Babl    *conversion,
-                 unsigned char *src, 
-                 unsigned char *dst, 
+                 unsigned char *__restrict__ src, 
+                 unsigned char *__restrict__ dst, 
                  long           samples)
 {
   conv_16_8 (conversion, src, dst, samples * 3);
@@ -381,8 +381,8 @@ conv_rgb16_rgb8 (const Babl    *conversion,
 
 static void
 conv_ga16_ga8 (const Babl    *conversion,
-               unsigned char *src, 
-               unsigned char *dst, 
+               unsigned char *__restrict__ src, 
+               unsigned char *__restrict__ dst, 
                long           samples)
 {
   conv_16_8 (conversion, src, dst, samples * 2);
@@ -394,8 +394,8 @@ conv_ga16_ga8 (const Babl    *conversion,
 
 static void
 conv_rgba8_rgba16 (const Babl    *conversion,
-                   unsigned char *src, 
-                   unsigned char *dst, 
+                   unsigned char *__restrict__ src, 
+                   unsigned char *__restrict__ dst, 
                    long           samples)
 {
   conv_8_16 (conversion, src, dst, samples * 4);
@@ -403,8 +403,8 @@ conv_rgba8_rgba16 (const Babl    *conversion,
 
 static void
 conv_rgb8_rgb16 (const Babl    *conversion,
-                 unsigned char *src, 
-                 unsigned char *dst, 
+                 unsigned char *__restrict__ src, 
+                 unsigned char *__restrict__ dst, 
                  long           samples)
 {
   conv_8_16 (conversion, src, dst, samples * 3);
@@ -412,8 +412,8 @@ conv_rgb8_rgb16 (const Babl    *conversion,
 
 static void
 conv_ga8_ga16 (const Babl    *conversion,
-               unsigned char *src, 
-               unsigned char *dst, 
+               unsigned char *__restrict__ src, 
+               unsigned char *__restrict__ dst, 
                long           samples)
 {
   conv_8_16 (conversion, src, dst, samples * 2);
@@ -427,8 +427,8 @@ conv_ga8_ga16 (const Babl    *conversion,
 
 static void
 conv_gaF_gAF (const Babl    *conversion,
-              unsigned char *src, 
-              unsigned char *dst, 
+              unsigned char *__restrict__ src, 
+              unsigned char *__restrict__ dst, 
               long           samples)
 {
   long n = samples;
@@ -448,8 +448,8 @@ conv_gaF_gAF (const Babl    *conversion,
 
 static void
 conv_gAF_gaF (const Babl    *conversion,
-              unsigned char *src, 
-              unsigned char *dst, 
+              unsigned char *__restrict__ src, 
+              unsigned char *__restrict__ dst, 
               long           samples)
 {
   long n = samples;
@@ -474,8 +474,8 @@ conv_gAF_gaF (const Babl    *conversion,
 
 static void
 conv_rgbaF_rgbF (const Babl    *conversion,
-                 unsigned char *src, 
-                 unsigned char *dst, 
+                 unsigned char *__restrict__ src, 
+                 unsigned char *__restrict__ dst, 
                  long           samples)
 {
   long n = samples;
@@ -498,8 +498,8 @@ conv_rgbaF_rgbF (const Babl    *conversion,
 
 static void
 conv_gF_rgbaF (const Babl    *conversion,
-               unsigned char *src, 
-               unsigned char *dst, 
+               unsigned char *__restrict__ src, 
+               unsigned char *__restrict__ dst, 
                long           samples)
 {
   long n = samples;
@@ -522,8 +522,8 @@ conv_gF_rgbaF (const Babl    *conversion,
 
 static void
 conv_rgbF_rgbaF (const Babl    *conversion,
-                 unsigned char *src, 
-                 unsigned char *dst, 
+                 unsigned char *__restrict__ src, 
+                 unsigned char *__restrict__ dst, 
                  long           samples)
 {
   long n = samples;
@@ -546,8 +546,8 @@ conv_rgbF_rgbaF (const Babl    *conversion,
 
 static void
 conv_gaF_gF (const Babl    *conversion,
-             unsigned char *src, 
-             unsigned char *dst, 
+             unsigned char *__restrict__ src, 
+             unsigned char *__restrict__ dst, 
              long           samples)
 {
   long n = samples;
@@ -563,8 +563,8 @@ conv_gaF_gF (const Babl    *conversion,
 
 static void
 conv_gF_gaF (const Babl    *conversion,
-             unsigned char *src, 
-             unsigned char *dst, 
+             unsigned char *__restrict__ src, 
+             unsigned char *__restrict__ dst, 
              long           samples)
 {
   long n = samples;
@@ -587,8 +587,8 @@ conv_gF_gaF (const Babl    *conversion,
 
 static void
 conv_gF_rgbF (const Babl    *conversion,
-              unsigned char *src, 
-              unsigned char *dst, 
+              unsigned char *__restrict__ src, 
+              unsigned char *__restrict__ dst, 
               long           samples)
 {
   long n = samples;
@@ -608,8 +608,8 @@ conv_gF_rgbF (const Babl    *conversion,
 
 static void
 conv_gaF_rgbaF (const Babl    *conversion,
-                unsigned char *src, 
-                unsigned char *dst, 
+                unsigned char *__restrict__ src, 
+                unsigned char *__restrict__ dst, 
                 long           samples)
 {
   long n = samples;
@@ -636,8 +636,8 @@ conv_gaF_rgbaF (const Babl    *conversion,
 
 static void
 conv_rgbaF_rgbA8 (const Babl    *conversion,
-                  unsigned char *src, 
-                  unsigned char *dst, 
+                  unsigned char *__restrict__ src, 
+                  unsigned char *__restrict__ dst, 
                   long           samples)
 {
   long n = samples;
@@ -649,11 +649,11 @@ conv_rgbaF_rgbA8 (const Babl    *conversion,
 
       for (c = 0; c < 3; c++)
         {
-          *(unsigned char *) dst = lrint (((*(float *) src) * alpha) * 255.0);
+          *(unsigned char *) dst = lrint (((*(float *) src) * alpha) * 255.0f);
           dst                   += 1;
           src                   += 4;
         }
-      *(unsigned char *) dst = lrint (alpha * 255.0);
+      *(unsigned char *) dst = lrint (alpha * 255.0f);
       dst++;
       src += 4;
     }
@@ -661,8 +661,8 @@ conv_rgbaF_rgbA8 (const Babl    *conversion,
 
 static void
 conv_rgbaF_rgb8 (const Babl    *conversion,
-                 unsigned char *src, 
-                 unsigned char *dst, 
+                 unsigned char *__restrict__ src, 
+                 unsigned char *__restrict__ dst, 
                  long           samples)
 {
   long n = samples;
@@ -673,7 +673,7 @@ conv_rgbaF_rgb8 (const Babl    *conversion,
 
       for (c = 0; c < 3; c++)
         {
-          *(unsigned char *) dst = lrint ((*(float *) src) * 255.0);
+          *(unsigned char *) dst = lrint ((*(float *) src) * 255.0f);
           dst                   += 1;
           src                   += 4;
         }
@@ -683,8 +683,8 @@ conv_rgbaF_rgb8 (const Babl    *conversion,
 
 static void
 conv_rgbaF_rgb16 (const Babl    *conversion,
-                  unsigned char *src, 
-                  unsigned char *dst, 
+                  unsigned char *__restrict__ src, 
+                  unsigned char *__restrict__ dst, 
                   long           samples)
 {
   long n = samples;
@@ -695,7 +695,7 @@ conv_rgbaF_rgb16 (const Babl    *conversion,
 
       for (c = 0; c < 3; c++)
         {
-          *(unsigned short *) dst = lrint ((*(float *) src) * 65535.0);
+          *(unsigned short *) dst = lrint ((*(float *) src) * 65535.0f);
           dst                    += 2;
           src                    += 4;
         }
@@ -705,8 +705,8 @@ conv_rgbaF_rgb16 (const Babl    *conversion,
 
 static void
 conv_rgba8_rgbA8 (const Babl    *conversion,
-                  unsigned char *src, 
-                  unsigned char *dst, 
+                  unsigned char *__restrict__ src, 
+                  unsigned char *__restrict__ dst, 
                   long           samples)
 {
   long n = samples;
@@ -736,8 +736,8 @@ conv_rgba8_rgbA8 (const Babl    *conversion,
 
 static void
 conv_rgbA8_rgba8 (const Babl    *conversion,
-                  unsigned char *src, 
-                  unsigned char *dst, 
+                  unsigned char *__restrict__ src, 
+                  unsigned char *__restrict__ dst, 
                   long           samples)
 {
   long n = samples;
@@ -768,8 +768,8 @@ conv_rgbA8_rgba8 (const Babl    *conversion,
 
 static void
 conv_rgb8_rgba8 (const Babl    *conversion,
-                 unsigned char *src, 
-                 unsigned char *dst, 
+                 unsigned char *__restrict__ src, 
+                 unsigned char *__restrict__ dst, 
                  long           samples)
 {
   long n = samples;
@@ -790,8 +790,8 @@ conv_rgb8_rgba8 (const Babl    *conversion,
 
 static void
 conv_rgba8_rgb8 (const Babl    *conversion,
-                 unsigned char *src, 
-                 unsigned char *dst, 
+                 unsigned char *__restrict__ src, 
+                 unsigned char *__restrict__ dst, 
                  long           samples)
 {
   long n = samples;
@@ -807,10 +807,13 @@ conv_rgba8_rgb8 (const Babl    *conversion,
 }
 
 int init (void);
+#include "babl-verify-cpu.inc"
 
 int
 init (void)
 {
+  BABL_VERIFY_CPU();
+  {
   const Babl *rgbaF = babl_format_new (
     babl_model ("RGBA"),
     babl_type ("float"),
@@ -1012,6 +1015,6 @@ init (void)
   o (rgb8, rgbA8);
   o (rgba8, rgb8);
   o (rgbaF, rgbA8);
-
+  }
   return 0;
 }
diff --git a/extensions/gggl.c b/extensions/gggl.c
index 34068f1..95ff2e8 100644
--- a/extensions/gggl.c
+++ b/extensions/gggl.c
@@ -30,6 +30,7 @@
 
 #include "babl.h"
 #include "extensions/util.h"
+#include "babl-verify-cpu.inc"
 
 /*
  * Implemented according to information read from:
@@ -51,8 +52,8 @@
 
 static void
 conv_F_8 (const Babl    *conversion,
-          unsigned char *src, 
-          unsigned char *dst, 
+          unsigned char *__restrict__ src, 
+          unsigned char *__restrict__ dst, 
           long           samples)
 {
   long n = samples;
@@ -60,7 +61,7 @@ conv_F_8 (const Babl    *conversion,
   while (n--)
     {
       float f    = ((*(float *) src));
-      int   uval = lrint (f * 255.0);
+      int   uval = lrint (f * 255.0f);
 
       if (uval < 0) uval = 0;
       if (uval > 255) uval = 255;
@@ -73,8 +74,8 @@ conv_F_8 (const Babl    *conversion,
 
 static void
 conv_F_16 (const Babl    *conversion,
-           unsigned char *src, 
-           unsigned char *dst, 
+           unsigned char *__restrict__ src, 
+           unsigned char *__restrict__ dst, 
            long           samples)
 {
   long n = samples;
@@ -82,17 +83,17 @@ conv_F_16 (const Babl    *conversion,
   while (n--)
     {
       float f = ((*(float *) src));
-      if (f < 0.0)
+      if (f < 0.0f)
         {
           *(unsigned short *) dst = 0;
         }
-      else if (f > 1.0)
+      else if (f > 1.0f)
         {
           *(unsigned short *) dst = 65535;
         }
       else
         {
-          *(unsigned short *) dst = lrint (f * 65535.0);
+          *(unsigned short *) dst = lrint (f * 65535.0f);
         }
       dst += 2;
       src += 4;
@@ -101,15 +102,15 @@ conv_F_16 (const Babl    *conversion,
 
 static void
 conv_8_F (const Babl    *conversion,
-          unsigned char *src, 
-          unsigned char *dst, 
+          unsigned char *__restrict__ src, 
+          unsigned char *__restrict__ dst, 
           long           samples)
 {
   long n = samples;
 
   while (n--)
     {
-      (*(float *) dst) = ((*(unsigned char *) src) / 255.0);
+      (*(float *) dst) = ((*(unsigned char *) src) / 255.0f);
       dst             += 4;
       src             += 1;
     }
@@ -117,8 +118,8 @@ conv_8_F (const Babl    *conversion,
 
 static void
 conv_16_F (const Babl    *conversion,
-           unsigned char *src, 
-           unsigned char *dst, 
+           unsigned char *__restrict__ src, 
+           unsigned char *__restrict__ dst, 
            long           samples)
 {
   long n = samples;
@@ -133,8 +134,8 @@ conv_16_F (const Babl    *conversion,
 
 static void
 conv_rgbaF_rgb8 (const Babl    *conversion,
-                 unsigned char *src, 
-                 unsigned char *dst, 
+                 unsigned char *__restrict__ src, 
+                 unsigned char *__restrict__ dst, 
                  long           samples)
 {
   long n = samples;
@@ -145,7 +146,7 @@ conv_rgbaF_rgb8 (const Babl    *conversion,
 
       for (c = 0; c < 3; c++)
         {
-          int val = rint ((*(float *) src) * 255.0);
+          int val = rint ((*(float *) src) * 255.0f);
           if (val < 0)
             *(unsigned char *) dst = 0;
           else if (val > 255)
@@ -161,8 +162,8 @@ conv_rgbaF_rgb8 (const Babl    *conversion,
 
 static void
 conv_F_D (const Babl    *conversion,
-          unsigned char *src, 
-          unsigned char *dst, 
+          unsigned char *__restrict__ src, 
+          unsigned char *__restrict__ dst, 
           long           samples)
 {
   long n = samples;
@@ -177,8 +178,8 @@ conv_F_D (const Babl    *conversion,
 
 static void
 conv_D_F (const Babl    *conversion,
-          unsigned char *src, 
-          unsigned char *dst, 
+          unsigned char *__restrict__ src, 
+          unsigned char *__restrict__ dst, 
           long           samples)
 {
   long n = samples;
@@ -193,8 +194,8 @@ conv_D_F (const Babl    *conversion,
 
 static void
 conv_16_8 (const Babl    *conversion,
-           unsigned char *src, 
-           unsigned char *dst, 
+           unsigned char *__restrict__ src, 
+           unsigned char *__restrict__ dst, 
            long           samples)
 {
   long n = samples;
@@ -221,8 +222,8 @@ conv_16_8 (const Babl    *conversion,
 
 static inline void
 conv_8_16 (const Babl    *conversion,
-           unsigned char *src, 
-           unsigned char *dst, 
+           unsigned char *__restrict__ src, 
+           unsigned char *__restrict__ dst, 
            long           samples)
 {
   long n = samples;
@@ -238,8 +239,8 @@ conv_8_16 (const Babl    *conversion,
 /*********/
 static void
 conv_rgbaF_rgba8 (const Babl    *conversion,
-                  unsigned char *src, 
-                  unsigned char *dst, 
+                  unsigned char *__restrict__ src, 
+                  unsigned char *__restrict__ dst, 
                   long           samples)
 {
   conv_F_8 (conversion, src, dst, samples * 4);
@@ -247,8 +248,8 @@ conv_rgbaF_rgba8 (const Babl    *conversion,
 
 static void
 conv_rgbF_rgb8 (const Babl    *conversion,
-                unsigned char *src, 
-                unsigned char *dst, 
+                unsigned char *__restrict__ src, 
+                unsigned char *__restrict__ dst, 
                 long           samples)
 {
   conv_F_8 (conversion, src, dst, samples * 3);
@@ -256,8 +257,8 @@ conv_rgbF_rgb8 (const Babl    *conversion,
 
 static void
 conv_gaF_ga8 (const Babl    *conversion,
-              unsigned char *src, 
-              unsigned char *dst, 
+              unsigned char *__restrict__ src, 
+              unsigned char *__restrict__ dst, 
               long           samples)
 {
   conv_F_8 (conversion, src, dst, samples * 2);
@@ -269,8 +270,8 @@ conv_gaF_ga8 (const Babl    *conversion,
 
 static void
 conv_rgbaF_rgba16 (const Babl    *conversion,
-                   unsigned char *src, 
-                   unsigned char *dst, 
+                   unsigned char *__restrict__ src, 
+                   unsigned char *__restrict__ dst, 
                    long           samples)
 {
   conv_F_16 (conversion, src, dst, samples * 4);
@@ -278,8 +279,8 @@ conv_rgbaF_rgba16 (const Babl    *conversion,
 
 static void
 conv_rgbF_rgb16 (const Babl    *conversion,
-                 unsigned char *src, 
-                 unsigned char *dst, 
+                 unsigned char *__restrict__ src, 
+                 unsigned char *__restrict__ dst, 
                  long           samples)
 {
   conv_F_16 (conversion, src, dst, samples * 3);
@@ -287,8 +288,8 @@ conv_rgbF_rgb16 (const Babl    *conversion,
 
 static void
 conv_gaF_ga16 (const Babl    *conversion,
-               unsigned char *src, 
-               unsigned char *dst, 
+               unsigned char *__restrict__ src, 
+               unsigned char *__restrict__ dst, 
                long           samples)
 {
   conv_F_16 (conversion, src, dst, samples * 2);
@@ -300,8 +301,8 @@ conv_gaF_ga16 (const Babl    *conversion,
 
 static void
 conv_rgba8_rgbaF (const Babl    *conversion,
-                  unsigned char *src, 
-                  unsigned char *dst, 
+                  unsigned char *__restrict__ src, 
+                  unsigned char *__restrict__ dst, 
                   long           samples)
 {
   conv_8_F (conversion, src, dst, samples * 4);
@@ -310,8 +311,8 @@ conv_rgba8_rgbaF (const Babl    *conversion,
 
 static void
 conv_rgb8_rgbF (const Babl    *conversion,
-                unsigned char *src, 
-                unsigned char *dst, 
+                unsigned char *__restrict__ src, 
+                unsigned char *__restrict__ dst, 
                 long           samples)
 {
   conv_8_F (conversion, src, dst, samples * 3);
@@ -319,8 +320,8 @@ conv_rgb8_rgbF (const Babl    *conversion,
 
 static void
 conv_ga8_gaF (const Babl    *conversion,
-              unsigned char *src, 
-              unsigned char *dst, 
+              unsigned char *__restrict__ src, 
+              unsigned char *__restrict__ dst, 
               long           samples)
 {
   conv_8_F (conversion, src, dst, samples * 2);
@@ -332,8 +333,8 @@ conv_ga8_gaF (const Babl    *conversion,
 
 static void
 conv_rgbaF_rgbaD (const Babl    *conversion,
-                  unsigned char *src, 
-                  unsigned char *dst, 
+                  unsigned char *__restrict__ src, 
+                  unsigned char *__restrict__ dst, 
                   long           samples)
 {
   conv_F_D (conversion, src, dst, samples * 4);
@@ -341,8 +342,8 @@ conv_rgbaF_rgbaD (const Babl    *conversion,
 
 static void
 conv_rgbaD_rgbaF (const Babl    *conversion,
-                  unsigned char *src, 
-                  unsigned char *dst, 
+                  unsigned char *__restrict__ src, 
+                  unsigned char *__restrict__ dst, 
                   long           samples)
 {
   conv_D_F (conversion, src, dst, samples * 4);
@@ -350,8 +351,8 @@ conv_rgbaD_rgbaF (const Babl    *conversion,
 
 static void
 conv_rgba16_rgbaF (const Babl    *conversion,
-                   unsigned char *src, 
-                   unsigned char *dst, 
+                   unsigned char *__restrict__ src, 
+                   unsigned char *__restrict__ dst, 
                    long           samples)
 {
   conv_16_F (conversion, src, dst, samples * 4);
@@ -359,8 +360,8 @@ conv_rgba16_rgbaF (const Babl    *conversion,
 
 static void
 conv_rgb16_rgbF (const Babl    *conversion,
-                 unsigned char *src, 
-                 unsigned char *dst, 
+                 unsigned char *__restrict__ src, 
+                 unsigned char *__restrict__ dst, 
                  long           samples)
 {
   conv_16_F (conversion, src, dst, samples * 3);
@@ -368,8 +369,8 @@ conv_rgb16_rgbF (const Babl    *conversion,
 
 static void
 conv_ga16_gaF (const Babl    *conversion,
-               unsigned char *src, 
-               unsigned char *dst, 
+               unsigned char *__restrict__ src, 
+               unsigned char *__restrict__ dst, 
                long           samples)
 {
   conv_16_F (conversion, src, dst, samples * 2);
@@ -381,8 +382,8 @@ conv_ga16_gaF (const Babl    *conversion,
 
 static void
 conv_rgba16_rgba8 (const Babl    *conversion,
-                   unsigned char *src, 
-                   unsigned char *dst, 
+                   unsigned char *__restrict__ src, 
+                   unsigned char *__restrict__ dst, 
                    long           samples)
 {
   conv_16_8 (conversion, src, dst, samples * 4);
@@ -390,8 +391,8 @@ conv_rgba16_rgba8 (const Babl    *conversion,
 
 static void
 conv_rgb16_rgb8 (const Babl    *conversion,
-                 unsigned char *src, 
-                 unsigned char *dst, 
+                 unsigned char *__restrict__ src, 
+                 unsigned char *__restrict__ dst, 
                  long           samples)
 {
   conv_16_8 (conversion, src, dst, samples * 3);
@@ -399,8 +400,8 @@ conv_rgb16_rgb8 (const Babl    *conversion,
 
 static void
 conv_ga16_ga8 (const Babl    *conversion,
-               unsigned char *src, 
-               unsigned char *dst, 
+               unsigned char *__restrict__ src, 
+               unsigned char *__restrict__ dst, 
                long           samples)
 {
   conv_16_8 (conversion, src, dst, samples * 2);
@@ -412,8 +413,8 @@ conv_ga16_ga8 (const Babl    *conversion,
 
 static void
 conv_rgba8_rgba16 (const Babl    *conversion,
-                   unsigned char *src, 
-                   unsigned char *dst, 
+                   unsigned char *__restrict__ src, 
+                   unsigned char *__restrict__ dst, 
                    long           samples)
 {
   conv_8_16 (conversion, src, dst, samples * 4);
@@ -421,8 +422,8 @@ conv_rgba8_rgba16 (const Babl    *conversion,
 
 static void
 conv_rgb8_rgb16 (const Babl    *conversion,
-                 unsigned char *src, 
-                 unsigned char *dst, 
+                 unsigned char *__restrict__ src, 
+                 unsigned char *__restrict__ dst, 
                  long           samples)
 {
   conv_8_16 (conversion, src, dst, samples * 3);
@@ -430,8 +431,8 @@ conv_rgb8_rgb16 (const Babl    *conversion,
 
 static void
 conv_ga8_ga16 (const Babl    *conversion,
-               unsigned char *src, 
-               unsigned char *dst, 
+               unsigned char *__restrict__ src, 
+               unsigned char *__restrict__ dst, 
                long           samples)
 {
   conv_8_16 (conversion, src, dst, samples * 2);
@@ -445,8 +446,8 @@ conv_ga8_ga16 (const Babl    *conversion,
 
 static void
 conv_gaF_gAF (const Babl    *conversion,
-              unsigned char *src, 
-              unsigned char *dst, 
+              unsigned char *__restrict__ src, 
+              unsigned char *__restrict__ dst, 
               long           samples)
 {
   long n = samples;
@@ -466,8 +467,8 @@ conv_gaF_gAF (const Babl    *conversion,
 
 static void
 conv_gAF_gaF (const Babl    *conversion,
-              unsigned char *src, 
-              unsigned char *dst, 
+              unsigned char *__restrict__ src, 
+              unsigned char *__restrict__ dst, 
               long           samples)
 {
   long n = samples;
@@ -492,8 +493,8 @@ conv_gAF_gaF (const Babl    *conversion,
 
 static void
 conv_rgbaF_rgbF (const Babl    *conversion,
-                 unsigned char *src, 
-                 unsigned char *dst, 
+                 unsigned char *__restrict__ src, 
+                 unsigned char *__restrict__ dst, 
                  long           samples)
 {
   long n = samples;
@@ -515,8 +516,8 @@ conv_rgbaF_rgbF (const Babl    *conversion,
 
 static void
 conv_rgbF_rgbaF (const Babl    *conversion,
-                 unsigned char *src, 
-                 unsigned char *dst, 
+                 unsigned char *__restrict__ src, 
+                 unsigned char *__restrict__ dst, 
                  long           samples)
 {
   long n = samples;
@@ -537,8 +538,8 @@ conv_rgbF_rgbaF (const Babl    *conversion,
 
 static void
 conv_gaF_gF (const Babl    *conversion,
-             unsigned char *src, 
-             unsigned char *dst, 
+             unsigned char *__restrict__ src, 
+             unsigned char *__restrict__ dst, 
              long           samples)
 {
   long n = samples;
@@ -554,8 +555,8 @@ conv_gaF_gF (const Babl    *conversion,
 
 static void
 conv_gF_gaF (const Babl    *conversion,
-             unsigned char *src, 
-             unsigned char *dst, 
+             unsigned char *__restrict__ src, 
+             unsigned char *__restrict__ dst, 
              long           samples)
 {
   long n = samples;
@@ -578,8 +579,8 @@ conv_gF_gaF (const Babl    *conversion,
 
 static void
 conv_gF_rgbF (const Babl    *conversion,
-              unsigned char *src, 
-              unsigned char *dst, 
+              unsigned char *__restrict__ src, 
+              unsigned char *__restrict__ dst, 
               long           samples)
 {
   long n = samples;
@@ -599,8 +600,8 @@ conv_gF_rgbF (const Babl    *conversion,
 
 static void
 conv_g8_rgb8 (const Babl    *conversion,
-              unsigned char *src, 
-              unsigned char *dst, 
+              unsigned char *__restrict__ src, 
+              unsigned char *__restrict__ dst, 
               long           samples)
 {
   long n = samples;
@@ -617,8 +618,8 @@ conv_g8_rgb8 (const Babl    *conversion,
 #define conv_g8_rgbA8  conv_g8_rgba8
 static void
 conv_g8_rgba8 (const Babl    *conversion,
-               unsigned char *src, 
-               unsigned char *dst, 
+               unsigned char *__restrict__ src, 
+               unsigned char *__restrict__ dst, 
                long           samples)
 {
   long n = samples;
@@ -636,8 +637,8 @@ conv_g8_rgba8 (const Babl    *conversion,
 
 static void
 conv_gaF_rgbaF (const Babl    *conversion,
-                unsigned char *src, 
-                unsigned char *dst, 
+                unsigned char *__restrict__ src, 
+                unsigned char *__restrict__ dst, 
                 long           samples)
 {
   long n = samples;
@@ -665,8 +666,8 @@ conv_gaF_rgbaF (const Babl    *conversion,
 
 static void
 conv_rgbaF_rgbA8 (const Babl    *conversion,
-                  unsigned char *src, 
-                  unsigned char *dst, 
+                  unsigned char *__restrict__ src, 
+                  unsigned char *__restrict__ dst, 
                   long           samples)
 {
   long n = samples;
@@ -678,11 +679,11 @@ conv_rgbaF_rgbA8 (const Babl    *conversion,
 
       for (c = 0; c < 3; c++)
         {
-          *(unsigned char *) dst = lrint (((*(float *) src) * alpha) * 255.0);
+          *(unsigned char *) dst = lrint (((*(float *) src) * alpha) * 255.0f);
           dst                   += 1;
           src                   += 4;
         }
-      *(unsigned char *) dst = lrint (alpha * 255.0);
+      *(unsigned char *) dst = lrint (alpha * 255.0f);
       dst++;
       src += 4;
     }
@@ -690,8 +691,8 @@ conv_rgbaF_rgbA8 (const Babl    *conversion,
 
 static void
 conv_rgbaF_rgb16 (const Babl    *conversion,
-                  unsigned char *src, 
-                  unsigned char *dst, 
+                  unsigned char *__restrict__ src, 
+                  unsigned char *__restrict__ dst, 
                   long           samples)
 {
   long n = samples;
@@ -702,12 +703,12 @@ conv_rgbaF_rgb16 (const Babl    *conversion,
 
       for (c = 0; c < 3; c++)
         {
-          if ((*(float *) src) >= 1.0)
+          if ((*(float *) src) >= 1.0f)
             *(unsigned short *) dst = 65535;
           else if ((*(float *) src) <=0)
             *(unsigned short *) dst = 0;
           else
-            *(unsigned short *) dst = lrint ((*(float *) src) * 65535.0);
+            *(unsigned short *) dst = lrint ((*(float *) src) * 65535.0f);
           dst                    += 2;
           src                    += 4;
         }
@@ -717,8 +718,8 @@ conv_rgbaF_rgb16 (const Babl    *conversion,
 
 static void
 conv_rgbA16_rgbaF (const Babl    *conversion,
-                   unsigned char *src, 
-                   unsigned char *dst, 
+                   unsigned char *__restrict__ src, 
+                   unsigned char *__restrict__ dst, 
                    long           samples)
 {
   long n = samples;
@@ -732,11 +733,11 @@ conv_rgbA16_rgbaF (const Babl    *conversion,
       if (alpha == 0.0f)
         recip_alpha = 10000.0;
       else
-        recip_alpha = 1.0/alpha;
+        recip_alpha = 1.0f/alpha;
 
       for (c = 0; c < 3; c++)
         {
-          (*(float *) dst) = (*(unsigned short *) src / 65535.0) * recip_alpha;
+          (*(float *) dst) = (*(unsigned short *) src / 65535.0f) * recip_alpha;
           dst             += 4;
           src             += 2;
         }
@@ -748,8 +749,8 @@ conv_rgbA16_rgbaF (const Babl    *conversion,
 
 static void
 conv_gF_rgbaF (const Babl    *conversion,
-               unsigned char *src, 
-               unsigned char *dst, 
+               unsigned char *__restrict__ src, 
+               unsigned char *__restrict__ dst, 
                long           samples)
 {
   long n = samples;
@@ -772,8 +773,8 @@ conv_gF_rgbaF (const Babl    *conversion,
 
 /*
    static void
-   conv_rgb8_rgbaF (unsigned char *src,
-                 unsigned char *dst,
+   conv_rgb8_rgbaF (unsigned char *__restrict__ src,
+                 unsigned char *__restrict__ dst,
                  int samples)
    {
     long n=samples;
@@ -781,7 +782,7 @@ conv_gF_rgbaF (const Babl    *conversion,
         int c;
 
         for (c = 0; c < 3; c++) {
-            (*(float *) dst) = *(unsigned char *) src / 255.0;
+            (*(float *) dst) = *(unsigned char *) src / 255.0f;
             dst += 4;
             src += 1;
         }
@@ -791,8 +792,8 @@ conv_gF_rgbaF (const Babl    *conversion,
    }
 
    static void
-   conv_g8_rgbaF (unsigned char *src,
-               unsigned char *dst,
+   conv_g8_rgbaF (unsigned char *__restrict__ src,
+               unsigned char *__restrict__ dst,
                int samples)
    {
     long n=samples;
@@ -800,7 +801,7 @@ conv_gF_rgbaF (const Babl    *conversion,
         int c;
 
         for (c = 0; c < 3; c++) {
-            (*(float *) dst) = *(unsigned char *) src / 255.0;
+            (*(float *) dst) = *(unsigned char *) src / 255.0f;
             dst += 4;
         }
         src += 1;
@@ -810,8 +811,8 @@ conv_gF_rgbaF (const Babl    *conversion,
    }
 
    static void
-   conv_rgb16_rgbaF (unsigned char *src,
-                  unsigned char *dst,
+   conv_rgb16_rgbaF (unsigned char *__restrict__ src,
+                  unsigned char *__restrict__ dst,
                   int samples)
    {
     long n=samples;
@@ -830,8 +831,8 @@ conv_gF_rgbaF (const Babl    *conversion,
    }
 
    static void
-   conv_gF_rgbaF (unsigned char *src,
-               unsigned char *dst,
+   conv_gF_rgbaF (unsigned char *__restrict__ src,
+               unsigned char *__restrict__ dst,
                int samples)
    {
     long n=samples;
@@ -851,8 +852,8 @@ conv_gF_rgbaF (const Babl    *conversion,
  */
 static void
 conv_rgba8_rgbA8 (const Babl    *conversion,
-                  unsigned char *src, 
-                  unsigned char *dst, 
+                  unsigned char *__restrict__ src, 
+                  unsigned char *__restrict__ dst, 
                   long           samples)
 {
   long n = samples;
@@ -882,8 +883,8 @@ conv_rgba8_rgbA8 (const Babl    *conversion,
 
 static void
 conv_rgbA8_rgba8 (const Babl    *conversion,
-                  unsigned char *src, 
-                  unsigned char *dst, 
+                  unsigned char *__restrict__ src, 
+                  unsigned char *__restrict__ dst, 
                   long           samples)
 {
   long n = samples;
@@ -902,13 +903,13 @@ conv_rgbA8_rgba8 (const Babl    *conversion,
         }
       else
         {
-          float alpha = src[3]/255.0;
-          float ralpha = 1.0/alpha;
+          float alpha = src[3]/255.0f;
+          float ralpha = 1.0f/alpha;
           //unsigned aa = ((255 << 16)) / src[3];
           unsigned aa = ((1 << 10)) * ralpha;
-          *dst++ = (src[0] * aa + .5) / 1024.0 + 0.5;
-          *dst++ = (src[1] * aa +.5) / 1024.0 + 0.5;
-          *dst++ = (src[2] * aa +.5) / 1024.0 + 0.5;
+          *dst++ = (src[0] * aa + .5f) / 1024.0f + 0.5f;
+          *dst++ = (src[1] * aa +.5f) / 1024.0f + 0.5f;
+          *dst++ = (src[2] * aa +.5f) / 1024.0f + 0.5f;
           *dst++ = src[3];
         }
       src += 4;
@@ -917,8 +918,8 @@ conv_rgbA8_rgba8 (const Babl    *conversion,
 
 static void
 conv_rgb8_rgba8 (const Babl    *conversion,
-                 unsigned char *src, 
-                 unsigned char *dst, 
+                 unsigned char *__restrict__ src, 
+                 unsigned char *__restrict__ dst, 
                  long           samples)
 {
   long n = samples-1;
@@ -938,8 +939,8 @@ conv_rgb8_rgba8 (const Babl    *conversion,
 
 static void
 conv_rgba8_rgb8 (const Babl    *conversion,
-                 unsigned char *src, 
-                 unsigned char *dst, 
+                 unsigned char *__restrict__ src, 
+                 unsigned char *__restrict__ dst, 
                  long           samples)
 {
   long n = samples;
@@ -956,8 +957,8 @@ conv_rgba8_rgb8 (const Babl    *conversion,
 
 static void
 conv_rgbA8_rgb8 (const Babl    *conversion,
-                 unsigned char *src, 
-                 unsigned char *dst, 
+                 unsigned char *__restrict__ src, 
+                 unsigned char *__restrict__ dst, 
                  long           samples)
 {
   long n = samples;
@@ -1014,8 +1015,8 @@ conv_rgbA8_rgb8 (const Babl    *conversion,
 
   static void
 conv_yuvaF_rgbaF (const Babl    *conversion,
-                  unsigned char *src, 
-                  unsigned char *dst, 
+                  unsigned char *__restrict__ src, 
+                  unsigned char *__restrict__ dst, 
                   long           samples)
 {
   float *src_f = (float *) src;
@@ -1031,9 +1032,9 @@ conv_yuvaF_rgbaF (const Babl    *conversion,
       U = src_f[1];
       V = src_f[2];
 
-      R = Y + 1.40200 * (V /*-0.5*/);
-      G = Y - 0.34414 * (U /*-0.5*/) -0.71414 * (V /*-0.5*/);
-      B = Y + 1.77200 * (U /*-0.5*/);
+      R = Y + 1.40200f * (V /*-0.5*/);
+      G = Y - 0.34414f * (U /*-0.5*/) -0.71414f * (V /*-0.5*/);
+      B = Y + 1.77200f * (U /*-0.5*/);
 
       dst_f[0] = R;
       dst_f[1] = G;
@@ -1048,8 +1049,8 @@ conv_yuvaF_rgbaF (const Babl    *conversion,
 
 static void
 conv_yuvF_rgbF (const Babl    *conversion,
-                unsigned char *src, 
-                unsigned char *dst, 
+                unsigned char *__restrict__ src, 
+                unsigned char *__restrict__ dst, 
                 long           samples)
 {
   float *src_f = (float *) src;
@@ -1065,9 +1066,9 @@ conv_yuvF_rgbF (const Babl    *conversion,
       U = src_f[1];
       V = src_f[2];
 
-      R = Y + 1.40200 * (V /*-0.5*/);
-      G = Y - 0.34414 * (U /*-0.5*/) -0.71414 * (V /*-0.5*/);
-      B = Y + 1.77200 * (U /*-0.5*/);
+      R = Y + 1.40200f * (V /*-0.5*/);
+      G = Y - 0.34414f * (U /*-0.5*/) -0.71414f * (V /*-0.5*/);
+      B = Y + 1.77200f * (U /*-0.5*/);
 
       dst_f[0] = R;
       dst_f[1] = G;
@@ -1083,6 +1084,8 @@ int init (void);
 int
 init (void)
 {
+  BABL_VERIFY_CPU();
+  {
   const Babl *rgbaD = babl_format_new (
     babl_model ("R'G'B'A"),
     babl_type ("double"),
@@ -1298,5 +1301,6 @@ init (void)
   o (rgbaF, rgbaD);
   o (rgbaD, rgbaF);
 
+  }
   return 0;
 }
diff --git a/extensions/gimp-8bit.c b/extensions/gimp-8bit.c
index 326058d..3d81661 100644
--- a/extensions/gimp-8bit.c
+++ b/extensions/gimp-8bit.c
@@ -366,13 +366,13 @@ conv_rgbaF_linear_rgb8_linear (const Babl    *conversion,
 
   while (n--)
     {
-      v = rint (*fsrc++ * 255.0);
+      v = rint (*fsrc++ * 255.0f);
       *dst++ = (v < 0) ? 0 : ((v > 255) ? 255 : v);
 
-      v = rint (*fsrc++ * 255.0);
+      v = rint (*fsrc++ * 255.0f);
       *dst++ = (v < 0) ? 0 : ((v > 255) ? 255 : v);
 
-      v = rint (*fsrc++ * 255.0);
+      v = rint (*fsrc++ * 255.0f);
       *dst++ = (v < 0) ? 0 : ((v > 255) ? 255 : v);
 
       fsrc++;
@@ -391,16 +391,16 @@ conv_rgbaF_linear_rgba8_linear (const Babl    *conversion,
 
   while (n--)
     {
-      v = rint (*fsrc++ * 255.0);
+      v = rint (*fsrc++ * 255.0f);
       *dst++ = (v < 0) ? 0 : ((v > 255) ? 255 : v);
 
-      v = rint (*fsrc++ * 255.0);
+      v = rint (*fsrc++ * 255.0f);
       *dst++ = (v < 0) ? 0 : ((v > 255) ? 255 : v);
 
-      v = rint (*fsrc++ * 255.0);
+      v = rint (*fsrc++ * 255.0f);
       *dst++ = (v < 0) ? 0 : ((v > 255) ? 255 : v);
 
-      v = rint (*fsrc++ * 255.0);
+      v = rint (*fsrc++ * 255.0f);
       *dst++ = (v < 0) ? 0 : ((v > 255) ? 255 : v);
     }
 }
diff --git a/extensions/grey.c b/extensions/grey.c
index 3f20842..c4e1a67 100644
--- a/extensions/grey.c
+++ b/extensions/grey.c
@@ -26,8 +26,8 @@
 
 static void
 conv_rgbaF_linear_y8_linear (const Babl    *conversion,
-                             unsigned char *src,
-                             unsigned char *dst,
+                             unsigned char *__restrict__ src,
+                             unsigned char *__restrict__ dst,
                              long           samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
@@ -48,15 +48,15 @@ conv_rgbaF_linear_y8_linear (const Babl    *conversion,
       value += *s++ * RGB_LUMINANCE_BLUE_FLOAT;
       s++;
 
-      v = rint (value * 255.0);
+      v = rint (value * 255.0f);
       *dst++ = (v < 0) ? 0 : ((v > 255) ? 255 : v);
     }
 }
 
 static void
 conv_rgbaF_linear_yF_linear (const Babl    *conversion,
-                             unsigned char *src,
-                             unsigned char *dst,
+                             unsigned char *__restrict__ src,
+                             unsigned char *__restrict__ dst,
                              long           samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
@@ -82,8 +82,8 @@ conv_rgbaF_linear_yF_linear (const Babl    *conversion,
 
 static void
 conv_rgbaF_linear_yaF_linear (const Babl    *conversion,
-                              unsigned char *src,
-                              unsigned char *dst,
+                              unsigned char *__restrict__ src,
+                              unsigned char *__restrict__ dst,
                               long           samples)
 {
   const Babl *space = babl_conversion_get_source_space (conversion);
@@ -109,8 +109,8 @@ conv_rgbaF_linear_yaF_linear (const Babl    *conversion,
 
 static void
 conv_yaF_linear_rgbaF_linear (const Babl    *conversion,
-                              unsigned char *src,
-                              unsigned char *dst,
+                              unsigned char *__restrict__ src,
+                              unsigned char *__restrict__ dst,
                               long           samples)
 {
   float *s = (float *) src;
@@ -130,10 +130,13 @@ conv_yaF_linear_rgbaF_linear (const Babl    *conversion,
 
 
 int init (void);
+#include "babl-verify-cpu.inc"
 
 int
 init (void)
 {
+  BABL_VERIFY_CPU();
+  {
   babl_conversion_new (babl_format ("RGBA float"),
                        babl_format ("Y u8"),
                        "linear",
@@ -171,5 +174,6 @@ init (void)
                        conv_yaF_linear_rgbaF_linear,
                        NULL);
 
+  }
   return 0;
 }
diff --git a/extensions/half.c b/extensions/half.c
index f308e03..789caf6 100644
--- a/extensions/half.c
+++ b/extensions/half.c
@@ -412,10 +412,13 @@ conv2_rgbaF_rgbaHalf (const Babl  *conversion,
 #define conv_yAHalf_yAF conv_yaHalf_yaF
 
 int init (void);
+#include "babl-verify-cpu.inc"
 
 int
 init (void)
 {
+  BABL_VERIFY_CPU();
+  {
   int i;
   const Babl *rgbaF_linear = babl_format_new (
     babl_model ("RGBA"),
@@ -617,5 +620,6 @@ init (void)
   CONV2(yaF,      yaHalf);
   CONV2(yF,       yHalf);
 
+  }
   return 0;
 }
diff --git a/extensions/meson.build b/extensions/meson.build
index 850793f..9935f29 100644
--- a/extensions/meson.build
+++ b/extensions/meson.build
@@ -6,6 +6,7 @@ no_cflags = []
 babl_ext_dep = [
   math,
   thread,
+  lcms,
 ]
 
 # Include directories
@@ -21,11 +22,26 @@ if platform_win32
   babl_ext_link_args += no_undefined
 endif
 
+autosimd_extensions = [
+  ['u16', no_cflags],
+  ['u32', no_cflags],
+  ['cairo', no_cflags],
+  ['grey', no_cflags],
+  ['gggl', no_cflags],
+  ['gggl-lies', no_cflags],
+  ['gegl-fixups', no_cflags],
+  ['CIE', sse2_cflags],
+  ['float', no_cflags],
+  ['double', no_cflags],
+  ['simple', no_cflags],
+  ['ycbcr', no_cflags],
+]
 
 extensions = [
   ['u16', no_cflags],
   ['u32', no_cflags],
   ['cairo', no_cflags],
+  ['oklab', no_cflags],
   ['CIE', sse2_cflags],
   ['double', no_cflags],
   ['fast-float', no_cflags],
@@ -54,10 +70,10 @@ extensions = [
 ]
 
 foreach ext : extensions
-  library(
+  shared_library(
     ext[0],
     ext[0] + '.c',
-    c_args: ext[1],
+    c_args: [ext[1], '-DBABL_SIMDFREE' ],
     include_directories: babl_ext_inc,
     link_with: babl,
     link_args: babl_ext_link_args,
@@ -67,3 +83,55 @@ foreach ext : extensions
     install_dir: babl_libdir / lib_name,
   )
 endforeach
+
+if host_cpu_family == 'x86_64'
+ 
+  foreach ext : autosimd_extensions
+    shared_library(
+      'x86-64-v2-' + ext[0],
+      ext[0] + '.c',
+      c_args: [ext[1]] + x86_64_v2_flags,
+      include_directories: babl_ext_inc,
+      link_with: babl,
+      link_args: babl_ext_link_args,
+      dependencies: babl_ext_dep,
+      name_prefix: '',
+      install: true,
+      install_dir: babl_libdir / lib_name,
+   )
+  endforeach
+
+  foreach ext : autosimd_extensions
+    shared_library(
+      'x86-64-v3-' + ext[0],
+      ext[0] + '.c',
+      c_args: [ext[1]] + x86_64_v3_flags,
+      include_directories: babl_ext_inc,
+      link_with: babl,
+      link_args: babl_ext_link_args,
+      dependencies: babl_ext_dep,
+      name_prefix: '',
+      install: true,
+      install_dir: babl_libdir / lib_name,
+   )
+  endforeach
+
+elif host_cpu_family == 'arm'
+  
+
+  foreach ext : autosimd_extensions
+    shared_library(
+      'arm-neon-' + ext[0],
+      ext[0] + '.c',
+      c_args: [ext[1]] + arm_neon_flags,
+      include_directories: babl_ext_inc,
+      link_with: babl,
+      link_args: babl_ext_link_args,
+      dependencies: babl_ext_dep,
+      name_prefix: '',
+      install: true,
+      install_dir: babl_libdir / lib_name,
+   )
+  endforeach
+
+endif
diff --git a/extensions/oklab.c b/extensions/oklab.c
new file mode 100644
index 0000000..4add977
--- /dev/null
+++ b/extensions/oklab.c
@@ -0,0 +1,852 @@
+/* babl - dynamically extendable universal pixel conversion library.
+ * Copyright (C) 2005, 2014, 2019 Øyvind Kolås.
+ * Copyright (C) 2014, 2019 Elle Stone
+ * Copyright (C) 2009, Martin Nordholts
+ * Copyright (C) 2021, Mingye Wang <arthur2e5@aosc.io>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, see
+ * <https://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Björn Ottosson (2020). Oklab, a perceptual color space for image
+ * processing. https://bottosson.github.io/posts/oklab/
+ */
+
+#include "config.h"
+
+#include <math.h>
+#include <string.h>
+
+#include "babl-internal.h"
+#include "babl-matrix.h"
+#include "babl.h"
+#include "base/util.h"
+
+#define DEGREES_PER_RADIAN (180 / 3.14159265358979323846)
+#define RADIANS_PER_DEGREE (1 / DEGREES_PER_RADIAN)
+
+static void components (void);
+static void models (void);
+static void conversions (void);
+static void formats (void);
+
+int init (void);
+
+static int enable_lch = 0;
+ // the Oklch conversions are not fully symmetric,
+ // thus not allowing the tests to pass if we register
+ // the code
+
+int
+init (void)
+{
+  components ();
+  models ();
+  formats ();
+  conversions ();
+  return 0;
+}
+
+/* Register the five Oklab/Oklch components ("Ok L", "Ok a", "Ok b", "Ok C", "Ok H"). */
+static void
+components (void)
+{
+  /* Oklab L is nominally 0.0-1.0 (the first row of M2 sums to ~1.0), not 0-100. */
+  babl_component_new ("Ok L", "doc", "Luminance, range 0.0-1.0 in float", NULL);
+  babl_component_new ("Ok a", "chroma", "doc",
+                      "chroma component 0.0 is no saturation", NULL);
+  babl_component_new ("Ok b", "chroma", "doc",
+                      "chroma component 0.0 is no saturation", NULL);
+  babl_component_new ("Ok C", "chroma", "doc", "chrominance/saturation", NULL);
+  babl_component_new ("Ok H", "chroma", "doc", "hue value range 0.0-360.0", NULL);
+}
+
+static void
+models (void)
+{
+  babl_model_new ("name", "Oklab", "doc",
+                  "Oklab color model, a perceptually uniform space.",
+                  babl_component ("Ok L"), babl_component ("Ok a"),
+                  babl_component ("Ok b"), NULL);
+
+  babl_model_new (
+      "name", "OklabA", "doc", "Oklab color model with separate alpha.",
+      babl_component ("Ok L"), babl_component ("Ok a"),
+      babl_component ("Ok b"), babl_component ("A"), "alpha", NULL);
+
+  if (enable_lch)
+  {
+    babl_model_new ("name", "Oklch", "doc",
+                    "Cylindrical representation of Oklab.",
+                    babl_component ("Ok L"), babl_component ("Ok C"),
+                    babl_component ("Ok H"), NULL);
+
+    babl_model_new (
+        "name", "OklchA", "doc", "Oklch color model with separate alpha.",
+        babl_component ("Ok L"), babl_component ("Ok C"),
+        babl_component ("Ok H"), babl_component ("A"), "alpha", NULL);
+  }
+}
+
+static void
+formats (void)
+{
+  babl_format_new (
+    "name", "Oklab float",
+    babl_model ("Oklab"),
+    babl_type ("float"),
+    babl_component ("Ok L"),
+    babl_component ("Ok a"),
+    babl_component ("Ok b"),
+    NULL
+  );
+
+
+  babl_format_new (
+    "name", "Oklab alpha float",
+    babl_model ("OklabA"),
+    babl_type ("float"),
+    babl_component ("Ok L"),
+    babl_component ("Ok a"),
+    babl_component ("Ok b"),
+    babl_component ("A"),
+    NULL
+  );
+
+  if (enable_lch)
+  {
+  babl_format_new (
+    "name", "Oklch float",
+    babl_model ("Oklch"),
+    babl_type ("float"),
+    babl_component ("Ok L"),
+    babl_component ("Ok C"),
+    babl_component ("Ok H"),
+    NULL
+  );
+
+  babl_format_new (
+    "name", "Oklch alpha float",
+    babl_model ("OklchA"),
+    babl_type ("float"),
+    babl_component ("Ok L"),
+    babl_component ("Ok C"),
+    babl_component ("Ok H"),
+    babl_component ("A"),
+    NULL
+  );
+  }
+}
+
+/* Conversion routine (space definition).  It's all float. The original definition is in float.
+ * NOTE(review): M1 reads as the transpose of Ottosson's published M1, while M2 matches it row for row -- confirm. */
+static double M1[9] = {
+  +0.8189330101, +0.0329845436, +0.0482003018,
+  +0.3618667424, +0.9293118715, +0.2643662691,
+  -0.1288597137, +0.0361456387, +0.6338517070,
+};
+
+static double M2[9] = {
+  +0.2104542553, +0.7936177850, - 0.0040720468,
+  +1.9779984951, -2.4285922050, + 0.4505937099,
+  +0.0259040371, +0.7827717662, - 0.8086757660,
+};
+
+static float M1f[9];
+static float M2f[9];
+static float inv_M1f[9];
+static float inv_M2f[9];
+
+static double inv_M1[9];
+static double inv_M2[9];
+static int mat_ready;
+
+/* fast approximate cube root
+ * origin: http://www.hackersdelight.org/hdcodetxt/acbrt.c.txt
+ * permissions: http://www.hackersdelight.org/permissions.htm
+ * The bit trick only holds for x >= 0, so it runs on |x| and the sign is restored. */
+static inline float
+_cbrtf (float x)
+{
+  const float mag = fabsf (x);
+  union
+  {
+    float f;
+    uint32_t i;
+  } u = { mag };
+
+  u.i = u.i / 4 + u.i / 16;   /* these three steps approximate u.i / 3 ... */
+  u.i = u.i + u.i / 16;
+  u.i = u.i + u.i / 256;      /* ... on the raw float bit pattern */
+  u.i = 0x2a5137a0 + u.i;     /* offset that turns it into the initial guess */
+  u.f = 0.33333333f * (2.0f * u.f + mag / (u.f * u.f)); /* Newton step 1 */
+  u.f = 0.33333333f * (2.0f * u.f + mag / (u.f * u.f)); /* Newton step 2 */
+  return copysignf (u.f, x);  /* cube root is odd: cbrt(-x) == -cbrt(x) */
+}
+
+static inline void
+XYZ_to_Oklab_step (double *xyz, double *lab_out)
+{
+  double lms[3];
+  babl_matrix_mul_vector (M1, xyz, lms);
+  for (int i = 0; i < 3; i++)
+    {
+      lms[i] = cbrt (lms[i]);
+    }
+  babl_matrix_mul_vector (M2, lms, lab_out);
+}
+
+static inline void
+XYZ_to_Oklab_stepf (float *xyz, float *lab_out)
+{
+  float lms[3];
+  babl_matrix_mul_vectorff (M1f, xyz, lms);
+  for (int i = 0; i < 3; i++)
+    {
+      lms[i] = _cbrtf (lms[i]);
+    }
+  babl_matrix_mul_vectorff (M2f, lms, lab_out);
+}
+
+static inline void
+Oklab_to_XYZ_stepf (float *lab, float *xyz_out)
+{
+  float lms[3];
+  babl_matrix_mul_vectorff (inv_M2f, lab, lms);
+  for (int i = 0; i < 3; i++)
+    {
+      lms[i] = lms[i] * lms[i] * lms[i];
+    }
+  babl_matrix_mul_vectorff (inv_M1f, lms, xyz_out);
+}
+
+static inline void
+Oklab_to_XYZ_step (double *lab, double *xyz_out)
+{
+  double lms[3];
+  babl_matrix_mul_vector (inv_M2, lab, lms);
+  for (int i = 0; i < 3; i++)
+    {
+      lms[i] = lms[i] * lms[i] * lms[i];
+    }
+  babl_matrix_mul_vector (inv_M1, lms, xyz_out);
+}
+
+/* Cartesian (a, b) in ab[0..1] -> polar (C, H) in ch_out[0..1], H in degrees.
+ * Inputs are read before any store, so ab and ch_out may be the same array. */
+static inline void
+ab_to_ch_step (double *ab, double *ch_out)
+{
+  double a = ab[0], b = ab[1];
+  double hue = atan2 (b, a) * DEGREES_PER_RADIAN;
+
+  /* Store at [0]/[1] to mirror ch_to_ab_step; keep H within the range 0-360. */
+  ch_out[0] = sqrt (a * a + b * b);
+  ch_out[1] = hue < 0.0 ? hue + 360.0 : hue;
+}
+
+/* Float variant: (a, b) in ab[0..1] -> (C, H) in ch_out[0..1], H in degrees.
+ * Inputs are read before any store, so ab and ch_out may be the same array. */
+static inline void
+ab_to_ch_stepf (float *ab, float *ch_out)
+{
+  float a = ab[0], b = ab[1];
+  float hue = atan2f (b, a) * DEGREES_PER_RADIAN;
+
+  /* Store at [0]/[1] to mirror ch_to_ab_stepf; keep H within the range 0-360. */
+  ch_out[0] = sqrtf (a * a + b * b);
+  ch_out[1] = hue < 0.0f ? hue + 360.0f : hue;
+}
+
+static inline void
+ch_to_ab_step (double *ch, double *ab_out)
+{
+  double c = ch[0], h = ch[1];
+
+  ab_out[0] = cos (h * RADIANS_PER_DEGREE) * c;
+  ab_out[1] = sin (h * RADIANS_PER_DEGREE) * c;
+}
+
+static inline void
+ch_to_ab_stepf (float *ch, float *ab_out)
+{
+  float c = ch[0], h = ch[1];
+
+  ab_out[0] = cosf (h * RADIANS_PER_DEGREE) * c;
+  ab_out[1] = sinf (h * RADIANS_PER_DEGREE) * c;
+}
+
+static inline void
+XYZ_to_Oklch_step (double *xyz, double *lch_out)
+{
+  XYZ_to_Oklab_step (xyz, lch_out);
+  ab_to_ch_step (lch_out + 1, lch_out + 1);
+}
+
+static inline void
+XYZ_to_Oklch_stepf (float *xyz, float *lch_out)
+{
+  XYZ_to_Oklab_stepf (xyz, lch_out);
+  ab_to_ch_stepf (lch_out + 1, lch_out + 1);
+}
+
+static inline void
+Oklch_to_XYZ_step (double *lch, double *xyz_out)
+{
+  double lab[3] = { lch[0], lch[1], lch[2] };
+  ch_to_ab_step (lab + 1, lab + 1);
+  Oklab_to_XYZ_step (lab, xyz_out);
+}
+
+static inline void
+Oklch_to_XYZ_stepf (float *lch, float *xyz_out)
+{
+  float lab[3] = { lch[0], lch[1], lch[2] };
+  ch_to_ab_stepf (lab + 1, lab + 1);
+  Oklab_to_XYZ_stepf (lab, xyz_out);
+}
+
+static inline void
+constants (void)
+{
+  double tmp[9];
+  double D65[3] = { 0.95047, 1.0, 1.08883 };
+  double D50[3] = { 0.96420288, 1.0, 0.82490540 };
+
+  if (mat_ready)
+    return;
+
+  babl_chromatic_adaptation_matrix (D50, D65, tmp);
+  babl_matrix_mul_matrix (tmp, M1, M1);
+
+  babl_matrix_invert (M1, inv_M1);
+  babl_matrix_invert (M2, inv_M2);
+
+  babl_matrix_to_float (M1, M1f);
+  babl_matrix_to_float (M2, M2f);
+  babl_matrix_to_float (inv_M1, inv_M1f);
+  babl_matrix_to_float (inv_M2, inv_M2f);
+
+  mat_ready = 1;
+}
+
+/* Conversion routine (glue and boilerplate). */
+static void
+rgba_to_laba_float (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  float *src = (float *)src_, *dst = (float *)dst_;
+  const Babl *space = babl_conversion_get_source_space (conversion);
+
+  while (n--)
+    {
+      float xyz[3];
+      babl_space_to_xyzf (space, src, xyz);
+      XYZ_to_Oklab_stepf (xyz, dst);
+      dst[3] = src[3];
+
+      src += 4;
+      dst += 4;
+    }
+}
+
+static void
+rgba_to_laba (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  double *src = (double*)src_, *dst = (double*)dst_;
+  const Babl *space = babl_conversion_get_source_space (conversion);
+
+  while (n--)
+    {
+      double xyz[3];
+      babl_space_to_xyz (space, src, xyz);
+      XYZ_to_Oklab_step (xyz, dst);
+      dst[3] = src[3];
+
+      src += 4;
+      dst += 4;
+    }
+}
+
+static void
+rgba_to_lab_float (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  float *src = (float *)src_, *dst = (float *)dst_;
+  const Babl *space = babl_conversion_get_source_space (conversion);
+
+  while (n--)
+    {
+      float xyz[3];
+      babl_space_to_xyzf (space, src, xyz);
+      XYZ_to_Oklab_stepf (xyz, dst);
+
+      src += 4;
+      dst += 3;
+    }
+}
+
+static void
+rgba_to_lab (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  double *src = (double *)src_, *dst = (double *)dst_;
+  const Babl *space = babl_conversion_get_source_space (conversion);
+
+  while (n--)
+    {
+      double xyz[3];
+      babl_space_to_xyz (space, src, xyz);
+      XYZ_to_Oklab_step (xyz, dst);
+
+      src += 4;
+      dst += 3;
+    }
+}
+
+static void
+rgba_to_lcha_float (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  float *src = (float *)src_, *dst = (float *)dst_;
+  const Babl *space = babl_conversion_get_source_space (conversion);
+
+  while (n--)
+    {
+      float xyz[3];
+      babl_space_to_xyzf (space, src, xyz);
+      XYZ_to_Oklch_stepf (xyz, dst);
+      dst[3] = src[3];
+
+      src += 4;
+      dst += 4;
+    }
+}
+
+static void
+rgba_to_lcha (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  double *src = (double *)src_, *dst = (double *)dst_;
+  const Babl *space = babl_conversion_get_source_space (conversion);
+
+  while (n--)
+    {
+      double xyz[3];
+      babl_space_to_xyz (space, src, xyz);
+      XYZ_to_Oklch_step (xyz, dst);
+      dst[3] = src[3];
+
+      src += 4;
+      dst += 4;
+    }
+}
+
+static void
+rgba_to_lch_float (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  float *src = (float *)src_, *dst = (float *)dst_;
+  const Babl *space = babl_conversion_get_source_space (conversion);
+
+  while (n--)
+    {
+      float xyz[3];
+      babl_space_to_xyzf (space, src, xyz);
+      XYZ_to_Oklch_stepf (xyz, dst);
+
+      src += 4;
+      dst += 3;
+    }
+}
+
+static void
+rgba_to_lch (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  double *src = (double *)src_, *dst = (double *)dst_;
+  const Babl *space = babl_conversion_get_source_space (conversion);
+
+  while (n--)
+    {
+      double xyz[3];
+      babl_space_to_xyz (space, src, xyz);
+      XYZ_to_Oklch_step (xyz, dst);
+
+      src += 4;
+      dst += 3;
+    }
+}
+
+static void
+rgb_to_lab_float (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  float *src = (float *)src_, *dst = (float *)dst_;
+  const Babl *space = babl_conversion_get_source_space (conversion);
+
+  while (n--)
+    {
+      float xyz[3];
+      babl_space_to_xyzf (space, src, xyz);
+      XYZ_to_Oklab_stepf (xyz, dst);
+
+      src += 3;
+      dst += 3;
+    }
+}
+
+static void
+rgb_to_lch_float (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  float *src = (float *)src_, *dst = (float *)dst_;
+  const Babl *space = babl_conversion_get_source_space (conversion);
+
+  while (n--)
+    {
+      float xyz[3];
+      babl_space_to_xyzf (space, src, xyz);
+      XYZ_to_Oklch_stepf (xyz, dst);
+
+      src += 3;
+      dst += 3;
+    }
+}
+
+static void
+lab_to_rgb_float (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  float *src = (float *)src_, *dst = (float *)dst_;
+  const Babl *space = babl_conversion_get_destination_space (conversion);
+
+  while (n--)
+    {
+      float xyz[3];
+      Oklab_to_XYZ_stepf (src, xyz);
+      babl_space_from_xyzf (space, xyz, dst);
+
+      src += 3;
+      dst += 3;
+    }
+}
+
+static void
+lab_to_rgba_float (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  float *src = (float *)src_, *dst = (float *)dst_;
+  const Babl *space = babl_conversion_get_destination_space (conversion);
+
+  while (n--)
+    {
+      float xyz[3];
+      Oklab_to_XYZ_stepf (src, xyz);
+      babl_space_from_xyzf (space, xyz, dst);
+      dst[3] = 1.0;
+
+      src += 3;
+      dst += 4;
+    }
+}
+
+static void
+lab_to_rgba (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  double *src = (double *)src_, *dst = (double *)dst_;
+  const Babl *space = babl_conversion_get_destination_space (conversion);
+
+  while (n--)
+    {
+      double xyz[3];
+      Oklab_to_XYZ_step (src, xyz);
+      babl_space_from_xyz (space, xyz, dst);
+      dst[3] = 1.0;
+
+      src += 3;
+      dst += 4;
+    }
+}
+
+static void
+lch_to_rgb_float (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  float *src = (float *)src_, *dst = (float *)dst_;
+  const Babl *space = babl_conversion_get_destination_space (conversion);
+
+  while (n--)
+    {
+      float xyz[3];
+      Oklch_to_XYZ_stepf (src, xyz);
+      babl_space_from_xyzf (space, xyz, dst);
+
+      src += 3;
+      dst += 3;
+    }
+}
+
+static void
+laba_to_rgba_float (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  float *src = (float *)src_, *dst = (float *)dst_;
+  const Babl *space = babl_conversion_get_destination_space (conversion);
+
+  while (n--)
+    {
+      float xyz[3];
+      Oklab_to_XYZ_stepf (src, xyz);
+      babl_space_from_xyzf (space, xyz, dst);
+      dst[3] = src[3];
+
+      src += 4;
+      dst += 4;
+    }
+}
+
+static void
+laba_to_rgba (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  double *src = (double *)src_, *dst = (double *)dst_;
+  const Babl *space = babl_conversion_get_destination_space (conversion);
+
+  while (n--)
+    {
+      double xyz[3];
+      Oklab_to_XYZ_step (src, xyz);
+      babl_space_from_xyz (space, xyz, dst);
+      dst[3] = src[3];
+
+      src += 4;
+      dst += 4;
+    }
+}
+
+static void
+lcha_to_rgba_float (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  float *src = (float *)src_, *dst = (float *)dst_;
+  const Babl *space = babl_conversion_get_destination_space (conversion);
+
+  while (n--)
+    {
+      float xyz[3];
+      Oklch_to_XYZ_stepf (src, xyz);
+      babl_space_from_xyzf (space, xyz, dst);
+      dst[3] = src[3];
+
+      src += 4;
+      dst += 4;
+    }
+}
+
+static void
+lcha_to_rgba (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  double *src = (double *)src_, *dst = (double *)dst_;
+  const Babl *space = babl_conversion_get_destination_space (conversion);
+
+  while (n--)
+    {
+      double xyz[3];
+      Oklch_to_XYZ_step (src, xyz);
+      babl_space_from_xyz (space, xyz, dst);
+      dst[3] = src[3];
+
+      src += 4;
+      dst += 4;
+    }
+}
+
+
+static void
+lch_to_rgba_float (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  float *src = (float *)src_, *dst = (float *)dst_;
+  const Babl *space = babl_conversion_get_destination_space (conversion);
+
+  while (n--)
+    {
+      float xyz[3];
+      Oklch_to_XYZ_stepf (src, xyz);
+      babl_space_from_xyzf (space, xyz, dst);
+      dst[3] = 1.0f;
+
+      src += 3;
+      dst += 4;
+    }
+}
+
+static void
+lch_to_rgba (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  double *src = (double *)src_, *dst = (double *)dst_;
+  const Babl *space = babl_conversion_get_destination_space (conversion);
+
+  while (n--)
+    {
+      double xyz[3];
+      Oklch_to_XYZ_step (src, xyz);
+      babl_space_from_xyz (space, xyz, dst);
+      dst[3] = 1.0f;
+
+      src += 3;
+      dst += 4;
+    }
+}
+
+
+static void
+lch_to_lab_float (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  float *src = (float *)src_, *dst = (float *)dst_;
+
+  while (n--)
+    {
+      dst[0] = src[0];
+      ch_to_ab_stepf (src + 1, dst + 1);
+
+      src += 3;
+      dst += 3;
+    }
+}
+
+static void
+lab_to_lch_float (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  float *src = (float *)src_, *dst = (float *)dst_;
+
+  while (n--)
+    {
+      dst[0] = src[0];
+      ab_to_ch_stepf (src + 1, dst + 1);
+
+      src += 3;
+      dst += 3;
+    }
+}
+
+static void
+lcha_to_laba_float (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  float *src = (float *)src_, *dst = (float *)dst_;
+
+  while (n--)
+    {
+      dst[0] = src[0];
+      ch_to_ab_stepf (src + 1, dst + 1);
+      dst[3] = src[3];
+
+      src += 4;
+      dst += 4;
+    }
+}
+
+static void
+laba_to_lcha_float (const Babl *conversion, char *src_, char *dst_, long samples)
+{
+  long n = samples;
+  float *src = (float *)src_, *dst = (float *)dst_;
+
+  while (n--)
+    {
+      dst[0] = src[0];
+      ab_to_ch_stepf (src + 1, dst + 1);
+      dst[3] = src[3];
+
+      src += 4;
+      dst += 4;
+    }
+}
+
+/* End conversion routines. */
+
+static void
+conversions (void)
+{
+  constants ();
+
+#define _pair(f1, f2, fwd, rev)                                               \
+  do                                                                          \
+    {                                                                         \
+      babl_conversion_new (babl_format (f1), babl_format (f2), "linear", fwd, \
+                           NULL);                                             \
+      babl_conversion_new (babl_format (f2), babl_format (f1), "linear", rev, \
+                           NULL);                                             \
+    }                                                                         \
+  while (0)
+
+  babl_conversion_new (babl_model("RGBA"),
+                       babl_model("OklabA"),
+                       "linear", rgba_to_laba,
+                       NULL);
+  babl_conversion_new (babl_model("OklabA"),
+                       babl_model("RGBA"),
+                       "linear", laba_to_rgba,
+                       NULL);
+
+  babl_conversion_new (babl_model("RGBA"),
+                       babl_model("Oklab"),
+                       "linear", rgba_to_lab,
+                       NULL);
+  babl_conversion_new (babl_model("Oklab"),
+                       babl_model("RGBA"),
+                       "linear", lab_to_rgba,
+                       NULL);
+
+  _pair ("RGB float", "Oklab float", rgb_to_lab_float, lab_to_rgb_float);
+  _pair ("RGBA float", "Oklab alpha float", rgba_to_laba_float, laba_to_rgba_float);
+  _pair ("RGBA float", "Oklab float", rgba_to_lab_float, lab_to_rgba_float);
+
+  if (enable_lch)
+  {
+  babl_conversion_new (babl_model("RGBA"),
+                       babl_model("OklchA"),
+                       "linear", rgba_to_lcha,
+                       NULL);
+  babl_conversion_new (babl_model("OklchA"),
+                       babl_model("RGBA"),
+                       "linear", lcha_to_rgba,
+                       NULL);
+
+  babl_conversion_new (babl_model("RGBA"),
+                       babl_model("Oklch"),
+                       "linear", rgba_to_lch,
+                       NULL);
+  babl_conversion_new (babl_model("Oklch"),
+                       babl_model("RGBA"),
+                       "linear", lch_to_rgba,
+                       NULL);
+  _pair ("RGBA float", "Oklch float", rgba_to_lch_float, lch_to_rgba_float);
+  _pair ("RGB float", "Oklch float", rgb_to_lch_float, lch_to_rgb_float);
+  _pair ("RGBA float", "Oklch alpha float", rgba_to_lcha_float, lcha_to_rgba_float);
+  
+  _pair ("Oklab float", "Oklch float", lab_to_lch_float, lch_to_lab_float);
+  _pair ("Oklab alpha float", "Oklch alpha float", laba_to_lcha_float, lcha_to_laba_float);
+  }
+  #undef _pair
+}
diff --git a/extensions/simple.c b/extensions/simple.c
index 627247a..549cec0 100644
--- a/extensions/simple.c
+++ b/extensions/simple.c
@@ -7,8 +7,8 @@ int init (void);
 
 static inline void
 float_to_u8_x1 (const Babl    *conversion,
-                unsigned char *src_char, 
-                unsigned char *dst, 
+                unsigned char *__restrict__ src_char, 
+                unsigned char *__restrict__ dst, 
                 long           samples)
 {
   float *src = (float *)src_char;
@@ -24,8 +24,8 @@ float_to_u8_x1 (const Babl    *conversion,
 
 static inline void
 float_to_u8_x4 (const Babl    *conversion,
-                unsigned char *src_char, 
-                unsigned char *dst, 
+                unsigned char *__restrict__ src_char, 
+                unsigned char *__restrict__ dst, 
                 long           samples)
 {
   float_to_u8_x1 (conversion, src_char, dst, samples * 4);
@@ -33,8 +33,8 @@ float_to_u8_x4 (const Babl    *conversion,
 
 static inline void
 float_to_u8_x3 (const Babl    *conversion,
-                unsigned char *src_char, 
-                unsigned char *dst, 
+                unsigned char *__restrict__ src_char, 
+                unsigned char *__restrict__ dst, 
                 long           samples)
 {
   float_to_u8_x1 (conversion, src_char, dst, samples * 3);
@@ -42,8 +42,8 @@ float_to_u8_x3 (const Babl    *conversion,
 
 static inline void
 float_to_u8_x2 (const Babl    *conversion,
-                unsigned char *src_char, 
-                unsigned char *dst, 
+                unsigned char *__restrict__ src_char, 
+                unsigned char *__restrict__ dst, 
                 long           samples)
 {
   float_to_u8_x1 (conversion, src_char, dst, samples * 2);
@@ -53,8 +53,8 @@ float_to_u8_x2 (const Babl    *conversion,
 
 static inline void
 float_pre_to_u8_pre (const Babl    *conversion,
-                     unsigned char *src_char, 
-                     unsigned char *dst, 
+                     unsigned char *__restrict__ src_char, 
+                     unsigned char *__restrict__ dst, 
                      long           samples)
 {
   float *src = (float *)src_char;
@@ -86,8 +86,8 @@ float_pre_to_u8_pre (const Babl    *conversion,
 
 static inline void
 float_to_u16_x1 (const Babl    *conversion,
-                 unsigned char *src_char, 
-                 unsigned char *dst_char, 
+                 unsigned char *__restrict__ src_char, 
+                 unsigned char *__restrict__ dst_char, 
                  long           samples)
 {
   float *src    = (float *)src_char;
@@ -103,24 +103,24 @@ float_to_u16_x1 (const Babl    *conversion,
 }
 static inline void
 float_to_u16_x2 (const Babl    *conversion,
-                 unsigned char *src_char, 
-                 unsigned char *dst_char, 
+                 unsigned char *__restrict__ src_char, 
+                 unsigned char *__restrict__ dst_char, 
                  long           samples)
 {
   float_to_u16_x1 (conversion, src_char, dst_char, samples * 2);
 }
 static inline void
 float_to_u16_x3 (const Babl    *conversion,
-                 unsigned char *src_char, 
-                 unsigned char *dst_char, 
+                 unsigned char *__restrict__ src_char, 
+                 unsigned char *__restrict__ dst_char, 
                  long           samples)
 {
   float_to_u16_x1 (conversion, src_char, dst_char, samples * 3);
 }
 static inline void
 float_to_u16_x4 (const Babl    *conversion,
-                 unsigned char *src_char, 
-                 unsigned char *dst_char, 
+                 unsigned char *__restrict__ src_char, 
+                 unsigned char *__restrict__ dst_char, 
                  long           samples)
 {
   float_to_u16_x1 (conversion, src_char, dst_char, samples * 4);
@@ -128,8 +128,8 @@ float_to_u16_x4 (const Babl    *conversion,
 
 static inline void
 float_pre_to_u16_pre (const Babl    *conversion,
-                      unsigned char *src_char, 
-                      unsigned char *dst_char, 
+                      unsigned char *__restrict__ src_char, 
+                      unsigned char *__restrict__ dst_char, 
                       long           samples)
 {
   float *src = (float *)src_char;
@@ -154,8 +154,8 @@ float_pre_to_u16_pre (const Babl    *conversion,
 
 static inline void
 float_pre_to_u32_pre (const Babl    *conversion,
-                      unsigned char *src_char, 
-                      unsigned char *dst_char, 
+                      unsigned char *__restrict__ src_char, 
+                      unsigned char *__restrict__ dst_char, 
                       long           samples)
 {
   float *src = (float *)src_char;
@@ -181,8 +181,8 @@ float_pre_to_u32_pre (const Babl    *conversion,
 
 static inline void
 float_to_u32_x1 (const Babl    *conversion,
-                 unsigned char *src_char, 
-                 unsigned char *dst_char, 
+                 unsigned char *__restrict__ src_char, 
+                 unsigned char *__restrict__ dst_char, 
                  long           samples)
 {
   float *src    = (float *)src_char;
@@ -190,7 +190,7 @@ float_to_u32_x1 (const Babl    *conversion,
   long n = samples;
   while (n--)
     {
-      double r = src[0];
+      float r = src[0];
             
       dst[0] = (r >= 1.0f) ? 0xFFFFFFFF : ((r <= 0.0f) ? 0x0 : 0xFFFFFFFF * r + 0.5f);
       
@@ -200,24 +200,24 @@ float_to_u32_x1 (const Babl    *conversion,
 }
 static void
 float_to_u32_x2 (const Babl    *conversion,
-                 unsigned char *src_char, 
-                 unsigned char *dst_char, 
+                 unsigned char *__restrict__ src_char, 
+                 unsigned char *__restrict__ dst_char, 
                  long           samples)
 {
   float_to_u32_x1 (conversion, src_char, dst_char, samples * 2);
 }
 static void
 float_to_u32_x3 (const Babl    *conversion,
-                 unsigned char *src_char, 
-                 unsigned char *dst_char, 
+                 unsigned char *__restrict__ src_char, 
+                 unsigned char *__restrict__ dst_char, 
                  long           samples)
 {
   float_to_u32_x1 (conversion, src_char, dst_char, samples * 3);
 }
 static void
 float_to_u32_x4 (const Babl    *conversion,
-                 unsigned char *src_char, 
-                 unsigned char *dst_char, 
+                 unsigned char *__restrict__ src_char, 
+                 unsigned char *__restrict__ dst_char, 
                  long           samples)
 {
   float_to_u32_x1 (conversion, src_char, dst_char, samples * 4);
@@ -226,8 +226,8 @@ float_to_u32_x4 (const Babl    *conversion,
 
 static inline void
 u32_to_float (const Babl    *conversion,
-              unsigned char *src_char, 
-              unsigned char *dst_char, 
+              unsigned char *__restrict__ src_char, 
+              unsigned char *__restrict__ dst_char, 
               long           samples)
 {
   uint32_t *src = (uint32_t *)src_char;
@@ -243,8 +243,8 @@ u32_to_float (const Babl    *conversion,
 
 static void
 u32_to_float_x4 (const Babl    *conversion,
-                 unsigned char *src_char, 
-                 unsigned char *dst_char, 
+                 unsigned char *__restrict__ src_char, 
+                 unsigned char *__restrict__ dst_char, 
                  long           samples)
 {
   u32_to_float (conversion, src_char, dst_char, samples * 4);
@@ -252,8 +252,8 @@ u32_to_float_x4 (const Babl    *conversion,
 
 static void
 u32_to_float_x3 (const Babl    *conversion,
-                 unsigned char *src_char, 
-                 unsigned char *dst_char, 
+                 unsigned char *__restrict__ src_char, 
+                 unsigned char *__restrict__ dst_char, 
                  long           samples)
 {
   u32_to_float (conversion, src_char, dst_char, samples * 3);
@@ -262,8 +262,8 @@ u32_to_float_x3 (const Babl    *conversion,
 
 static void
 u32_to_float_x2 (const Babl    *conversion,
-                 unsigned char *src_char, 
-                 unsigned char *dst_char, 
+                 unsigned char *__restrict__ src_char, 
+                 unsigned char *__restrict__ dst_char, 
                  long           samples)
 {
   u32_to_float (conversion, src_char, dst_char, samples * 2);
@@ -272,8 +272,8 @@ u32_to_float_x2 (const Babl    *conversion,
 
 static inline void
 u16_to_float (const Babl    *conversion,
-              unsigned char *src_char, 
-              unsigned char *dst_char, 
+              unsigned char *__restrict__ src_char, 
+              unsigned char *__restrict__ dst_char, 
               long           samples)
 {
   uint16_t *src = (uint16_t *)src_char;
@@ -289,8 +289,8 @@ u16_to_float (const Babl    *conversion,
 
 static void
 u16_to_float_x4 (const Babl    *conversion,
-                 unsigned char *src_char, 
-                 unsigned char *dst_char, 
+                 unsigned char *__restrict__ src_char, 
+                 unsigned char *__restrict__ dst_char, 
                  long           samples)
 {
   u16_to_float (conversion, src_char, dst_char, samples * 4);
@@ -298,8 +298,8 @@ u16_to_float_x4 (const Babl    *conversion,
 
 static void
 u16_to_float_x3 (const Babl    *conversion,
-                 unsigned char *src_char, 
-                 unsigned char *dst_char, 
+                 unsigned char *__restrict__ src_char, 
+                 unsigned char *__restrict__ dst_char, 
                  long           samples)
 {
   u16_to_float (conversion, src_char, dst_char, samples * 3);
@@ -308,8 +308,8 @@ u16_to_float_x3 (const Babl    *conversion,
 
 static void
 u16_to_float_x2 (const Babl    *conversion,
-                 unsigned char *src_char, 
-                 unsigned char *dst_char, 
+                 unsigned char *__restrict__ src_char, 
+                 unsigned char *__restrict__ dst_char, 
                  long           samples)
 {
   u16_to_float (conversion, src_char, dst_char, samples * 2);
@@ -317,8 +317,8 @@ u16_to_float_x2 (const Babl    *conversion,
 
 static inline void
 yau16_rgbaf (const Babl    *conversion,
-             unsigned char *src_char, 
-             unsigned char *dst_char, 
+             unsigned char *__restrict__ src_char, 
+             unsigned char *__restrict__ dst_char, 
              long           samples)
 {
   uint16_t *src = (uint16_t *)src_char;
@@ -338,8 +338,8 @@ yau16_rgbaf (const Babl    *conversion,
 
 static inline void
 u8_to_float (const Babl    *conversion,
-              unsigned char *src_char, 
-              unsigned char *dst_char, 
+              unsigned char *__restrict__ src_char, 
+              unsigned char *__restrict__ dst_char, 
               long           samples)
 {
   uint8_t *src = (uint8_t *)src_char;
@@ -355,8 +355,8 @@ u8_to_float (const Babl    *conversion,
 
 static void
 u8_to_float_x4 (const Babl    *conversion,
-                 unsigned char *src_char, 
-                 unsigned char *dst_char, 
+                 unsigned char *__restrict__ src_char, 
+                 unsigned char *__restrict__ dst_char, 
                  long           samples)
 {
   u8_to_float (conversion, src_char, dst_char, samples * 4);
@@ -364,8 +364,8 @@ u8_to_float_x4 (const Babl    *conversion,
 
 static void
 u8_to_float_x3 (const Babl    *conversion,
-                 unsigned char *src_char, 
-                 unsigned char *dst_char, 
+                 unsigned char *__restrict__ src_char, 
+                 unsigned char *__restrict__ dst_char, 
                  long           samples)
 {
   u8_to_float (conversion, src_char, dst_char, samples * 3);
@@ -374,8 +374,8 @@ u8_to_float_x3 (const Babl    *conversion,
 
 static void
 u8_to_float_x2 (const Babl    *conversion,
-                 unsigned char *src_char, 
-                 unsigned char *dst_char, 
+                 unsigned char *__restrict__ src_char, 
+                 unsigned char *__restrict__ dst_char, 
                  long           samples)
 {
   u8_to_float (conversion, src_char, dst_char, samples * 2);
@@ -383,8 +383,8 @@ u8_to_float_x2 (const Babl    *conversion,
 
 static inline void
 yau8_rgbaf (const Babl    *conversion,
-             unsigned char *src_char, 
-             unsigned char *dst_char, 
+             unsigned char *__restrict__ src_char, 
+             unsigned char *__restrict__ dst_char, 
              long           samples)
 {
   uint8_t *src = (uint8_t *)src_char;
@@ -404,8 +404,8 @@ yau8_rgbaf (const Babl    *conversion,
 
 static inline void
 yu8_yau8  (const Babl    *conversion,
-           unsigned char *src_char,
-           unsigned char *dst_char,
+           unsigned char *__restrict__ src_char,
+           unsigned char *__restrict__ dst_char,
            long           samples)
 {
   uint8_t *src = (uint8_t *)src_char;
@@ -423,8 +423,8 @@ yu8_yau8  (const Babl    *conversion,
 
 static inline void
 yau8_yu8  (const Babl    *conversion,
-           unsigned char *src_char,
-           unsigned char *dst_char,
+           unsigned char *__restrict__ src_char,
+           unsigned char *__restrict__ dst_char,
            long           samples)
 {
   uint8_t *src = (uint8_t *)src_char;
@@ -442,8 +442,8 @@ yau8_yu8  (const Babl    *conversion,
 
 static inline void
 yu16_yau16  (const Babl    *conversion,
-             unsigned char *src_char,
-             unsigned char *dst_char,
+             unsigned char *__restrict__ src_char,
+             unsigned char *__restrict__ dst_char,
              long           samples)
 {
   uint16_t *src = (uint16_t *)src_char;
@@ -460,8 +460,8 @@ yu16_yau16  (const Babl    *conversion,
 
 static inline void
 yau16_yu16  (const Babl    *conversion,
-             unsigned char *src_char,
-             unsigned char *dst_char,
+             unsigned char *__restrict__ src_char,
+             unsigned char *__restrict__ dst_char,
              long           samples)
 {
   uint16_t *src = (uint16_t *)src_char;
@@ -476,9 +476,11 @@ yau16_yu16  (const Babl    *conversion,
 }
 
 
+#include "babl-verify-cpu.inc"
 int
 init (void)
 {
+  BABL_VERIFY_CPU();
   /* float and u8 */
   babl_conversion_new (babl_format ("R'G'B'A float"),
                        babl_format ("R'G'B'A u8"),
diff --git a/extensions/sse-half.c b/extensions/sse-half.c
index cee3975..653d68f 100644
--- a/extensions/sse-half.c
+++ b/extensions/sse-half.c
@@ -313,7 +313,6 @@ init (void)
     }
 
 #endif /* defined(USE_SSE4_1) && defined(USE_F16C) && defined(ARCH_X86_64) */
-
   return 0;
 }
 
diff --git a/extensions/sse2-float.c b/extensions/sse2-float.c
index 3757ffe..e4cd7cf 100644
--- a/extensions/sse2-float.c
+++ b/extensions/sse2-float.c
@@ -157,11 +157,8 @@ conv_rgbAF_linear_rgbaF_linear_shuffle (const Babl  *conversion,
   while (remainder--)
     {
       float alpha = src[3];
-      float recip;
-      if (alpha <= 0.0f)
-        recip = 0.0f;
-      else
-        recip = 1.0f/alpha;
+      float used_alpha = babl_epsilon_for_zero_float (alpha);
+      float recip = 1.0f/used_alpha;
       dst[0] = src[0] * recip;
       dst[1] = src[1] * recip;
       dst[2] = src[2] * recip;
diff --git a/extensions/sse2-int8.c b/extensions/sse2-int8.c
index 6da1b5b..e337fd1 100644
--- a/extensions/sse2-int8.c
+++ b/extensions/sse2-int8.c
@@ -328,7 +328,6 @@ init (void)
     }
 
 #endif
-
   return 0;
 }
 
diff --git a/extensions/sse4-int8.c b/extensions/sse4-int8.c
index d505fe5..6c61bde 100644
--- a/extensions/sse4-int8.c
+++ b/extensions/sse4-int8.c
@@ -222,7 +222,6 @@ init (void)
     }
 
 #endif
-
   return 0;
 }
 
diff --git a/extensions/two-table.c b/extensions/two-table.c
index 8becfee..05c4f64 100644
--- a/extensions/two-table.c
+++ b/extensions/two-table.c
@@ -249,6 +249,5 @@ init (void)
                        "linear",
                        conv_yafloat_linear_yau8_gamma,
                        NULL);
-
   return 0;
 }
diff --git a/extensions/u16.c b/extensions/u16.c
index 87d2907..0fe479f 100644
--- a/extensions/u16.c
+++ b/extensions/u16.c
@@ -27,8 +27,8 @@
 
 static void
 conv_rgbu16_rgbau16 (const Babl    *conversion,
-                     unsigned char *src, 
-                     unsigned char *dst, 
+                     unsigned char *__restrict__ src, 
+                     unsigned char *__restrict__ dst, 
                      long           samples)
 
 
@@ -48,8 +48,8 @@ conv_rgbu16_rgbau16 (const Babl    *conversion,
 
 static void
 conv_yu16_yau16 (const Babl    *conversion,
-                 unsigned char *src, 
-                 unsigned char *dst, 
+                 unsigned char *__restrict__ src, 
+                 unsigned char *__restrict__ dst, 
                  long           samples)
 
 
@@ -67,9 +67,12 @@ conv_yu16_yau16 (const Babl    *conversion,
 
 int init (void);
 
+#include "babl-verify-cpu.inc"
 int
 init (void)
 {
+  BABL_VERIFY_CPU();
+{
   babl_conversion_new (
     babl_format ("R'G'B' u16"),
     babl_format ("R'G'B'A u16"),
@@ -111,5 +114,6 @@ init (void)
     "linear",
     conv_yu16_yau16,
     NULL);
+}
   return 0;
 }
diff --git a/extensions/u32.c b/extensions/u32.c
index f9d563d..02964ba 100644
--- a/extensions/u32.c
+++ b/extensions/u32.c
@@ -190,9 +190,11 @@ conv_yu32_yau32 (const Babl    *conversion,
 
 int init (void);
 
+#include "babl-verify-cpu.inc"
 int
 init (void)
 {
+  BABL_VERIFY_CPU();
   babl_conversion_new (
     babl_format ("R'G'B'A u32"),
     babl_format ("R'G'B'A u16"),
diff --git a/extensions/ycbcr.c b/extensions/ycbcr.c
index fabc44d..1e779d7 100644
--- a/extensions/ycbcr.c
+++ b/extensions/ycbcr.c
@@ -32,9 +32,11 @@ static void formats     (void);
 int init (void);
 
 
+#include "babl-verify-cpu.inc"
 int
 init (void)
 {
+  BABL_VERIFY_CPU();
   components ();
   models ();
   conversions ();
diff --git a/git-version.h b/git-version.h
new file mode 100644
index 0000000..34250bc
--- /dev/null
+++ b/git-version.h
@@ -0,0 +1,6 @@
+#ifndef __BABL_GIT_VERSION_H__
+#define __BABL_GIT_VERSION_H__
+
+#define BABL_GIT_VERSION "BABL_0_1_104-6-g150294d"
+
+#endif /* __BABL_GIT_VERSION_H__ */
diff --git a/git-version.h.in b/git-version.h.in
new file mode 100644
index 0000000..41bdefb
--- /dev/null
+++ b/git-version.h.in
@@ -0,0 +1,6 @@
+#ifndef __BABL_GIT_VERSION_H__
+#define __BABL_GIT_VERSION_H__
+
+#define BABL_GIT_VERSION "@BABL_GIT_VERSION@"
+
+#endif /* __BABL_GIT_VERSION_H__ */
diff --git a/meson.build b/meson.build
index f010eec..00f2b11 100644
--- a/meson.build
+++ b/meson.build
@@ -1,7 +1,7 @@
 project('babl', 'c',
   license: 'LGPL3+',
-  version: '0.1.80',
-  meson_version: '>=0.54.0',
+  version: '0.1.106',
+  meson_version: '>=0.55.0',
   default_options: [
     'buildtype=debugoptimized'
   ],
@@ -30,6 +30,9 @@ buildtype = get_option('buildtype')
 babl_prefix = get_option('prefix')
 babl_libdir = join_paths(babl_prefix, get_option('libdir'))
 
+project_build_root = meson.current_build_dir()
+project_source_root = meson.current_source_dir()
+
 ################################################################################
 # Projects infos
 
@@ -52,7 +55,7 @@ lib_name    = meson.project_name() + '-' + api_version
 stability_version_number = (major_version != 0 ? minor_version : micro_version)
 stable = (stability_version_number % 2 == 0)
 
-conf.set10('BABL_UNSTABLE', not stable, Description:
+conf.set10('BABL_UNSTABLE', not stable, description:
   'Define to 1 if this is an unstable version of BABL.')
 
 conf.set       ('BABL_MAJOR_VERSION',    '@0@'.format(major_version))
@@ -90,8 +93,15 @@ elif host_cpu_family == 'ppc64'
   have_ppc = true
   conf.set10('ARCH_PPC',    true)
   conf.set10('ARCH_PPC64',  true)
+elif host_cpu_family == 'arm'
+  have_arm = true
+  conf.set10('ARCH_ARM',  true)
+elif host_cpu_family == 'aarch64'
+  have_aarch64 = true
+  conf.set10('ARCH_AARCH64',  true)
 endif
 
+
 host_os = host_machine.system()
 message('Host os: ' + host_os)
 
@@ -156,9 +166,10 @@ if buildtype == 'debugoptimized' or buildtype == 'release'
   common_c_flags += cc.get_supported_arguments(['-Ofast'])
 endif
 common_c_flags += cc.get_supported_arguments(
-  ['-fno-unsafe-math-optimizations']
+  ['-fno-unsafe-math-optimizations','-ftree-vectorize']
 )
 
+
 extra_warnings_list = [
   '-Wdeclaration-after-statement',
   '-Winit-self',
@@ -178,6 +189,19 @@ else
   no_undefined = []
 endif
 
+if host_cpu_family == 'x86_64'
+  x86_64_v2_flags = cc.get_supported_arguments(['-march=x86-64','-msse2', '-msse2','-msse4.1','-msse4.2','-mpopcnt','-mssse3'])
+  x86_64_v3_flags = x86_64_v2_flags + cc.get_supported_arguments(['-mavx','-mavx2','-mf16c','-mfma','-mmovbe', '-mbmi', '-mbmi2'])
+
+  x86_64_v2_flags += '-DX86_64_V2'
+  x86_64_v3_flags += '-DX86_64_V3'
+
+elif host_cpu_family == 'arm'
+  arm_neon_flags = cc.get_supported_arguments(['-mfpu=neon-vfpv4'])
+  arm_neon_flags += '-DARM_NEON'
+elif host_cpu_family == 'aarch64'
+  common_c_flags += cc.get_supported_arguments(['-mfpu=neon-fp-armv8'])
+endif
 
 ################################################################################
 # Check for compiler CPU extensions
@@ -409,18 +433,59 @@ endif
 ################################################################################
 # Configuration files
 
+# This should not be made visible in babl_dep due to possible name clash
+# when built as a sub-project.
+rootInclude = include_directories('.')
+
 # config.h
 configure_file(
   output: 'config.h',
   configuration: conf
 )
 
+# If git is available, always check if git-version.h should be
+# updated. If git is not available, don't do anything if git-version.h
+# already exists because then we are probably working with a tarball
+# in which case the git-version.h we ship is correct.
+if git_bin.found() and run_command(
+    git_bin, 'rev-parse', '--is-inside-work-tree',
+    # A non-zero exit only means "not a git checkout"; never abort configure.
+    check: false,
+).returncode() == 0
+  git_version_h = vcs_tag(
+    input : 'git-version.h.in',
+    output: 'git-version.h',
+    replace_string: '@BABL_GIT_VERSION@',
+    command: [ git_bin.full_path(), 'describe', '--always' ],
+  )
+
+  if not meson.is_subproject()
+    meson.add_dist_script(
+      [ 'ninja', 'git-version.h', ],
+    )
+    meson.add_dist_script(
+      [ 'sh', '-c', ' '.join(
+      [ 'cp', git_version_h.full_path(), '${MESON_DIST_ROOT}' ]
+      )]
+    )
+  endif
+else
+  git_version_h = files('git-version.h')
+endif
+
+################################################################################
+# Global variables
+
+xml_insert_file = files('tools' / 'xml-insert.py')
+authors_file = files('AUTHORS')
+news_file = files('NEWS')
+todo_file = files('TODO')
+export_symbols_file = files('export-symbols')
+gen_babl_map_file = files('gen_babl_map.py')
 
 ################################################################################
 # Subdirs
 
-rootInclude = include_directories('.')
-
 subdir('babl')
 subdir('extensions')
 subdir('tests')
@@ -428,6 +493,7 @@ subdir('tools')
 if build_docs
   subdir('docs')
 endif
+subdir('bin')
 
 # Create README file from web page
 if w3m_bin.found() and build_docs
@@ -447,18 +513,33 @@ endif
 
 
 # pkg-config file
-pkgconfig.generate(filebase: 'babl',
+pkgconfig.generate(
+  babl,
+  filebase: 'babl-' + api_version,
   name: 'babl',
   description: 'Pixel encoding and color space conversion engine.',
   version: meson.project_version(),
-  libraries: [ babl ],
-  libraries_private: [
-    '-lm',
+  subdirs: lib_name,
+  uninstalled_variables: [
+    'babl_path=@0@'.format(babl_extensions_build_dir),
+    'babl_libdir=@0@'.format(babl_library_build_dir),
   ],
-  subdirs: [
-    lib_name,
+)
+
+# dependency for wrap builds
+babl_dep = declare_dependency(
+  include_directories: bablInclude,
+  link_with : babl,
+  sources: [
+    babl_version_h,
+    build_gir ? babl_gir : []
   ],
+  variables: {
+    'babl_path'   : babl_extensions_build_dir,
+    'babl_libdir' : babl_library_build_dir,
+  },
 )
+meson.override_dependency('babl-' + api_version, babl_dep)
 
 ################################################################################
 # Build summary
diff --git a/subprojects/lcms2.wrap b/subprojects/lcms2.wrap
new file mode 100644
index 0000000..2cc69df
--- /dev/null
+++ b/subprojects/lcms2.wrap
@@ -0,0 +1,12 @@
+[wrap-file]
+directory = Little-CMS-2.12
+source_url = https://github.com/mm2/Little-CMS/archive/refs/tags/2.12.tar.gz
+source_filename = lcms2-2.12.tar.gz
+source_hash = e501f1482fc424550ef3abbf86bf1c66090e1661249e89552d39ed5bf935df66
+patch_filename = lcms2_2.12-2_patch.zip
+patch_url = https://wrapdb.mesonbuild.com/v2/lcms2_2.12-2/get_patch
+patch_hash = 3ac6944ac4b8d8507b85961d98cb287532945183d0e8f094c77810e793b6bebe
+
+[provide]
+lcms2 = liblcms2_dep
+
diff --git a/tests/alpha_symmetric_transform.c b/tests/alpha_symmetric_transform.c
index a42709a..aff23ce 100644
--- a/tests/alpha_symmetric_transform.c
+++ b/tests/alpha_symmetric_transform.c
@@ -100,8 +100,7 @@ test (void)
 }
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   babl_init ();
   if (test ())
diff --git a/tests/babl_class_name.c b/tests/babl_class_name.c
index c0724f0..caec910 100644
--- a/tests/babl_class_name.c
+++ b/tests/babl_class_name.c
@@ -63,8 +63,7 @@ test (void)
 }
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   babl_init ();
   if (test ())
diff --git a/tests/cairo_cmyk_hack.c b/tests/cairo_cmyk_hack.c
index 1f308fe..f68cd9a 100644
--- a/tests/cairo_cmyk_hack.c
+++ b/tests/cairo_cmyk_hack.c
@@ -69,8 +69,7 @@ test (void)
 }
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   babl_init ();
   if (test ())
diff --git a/tests/chromaticities.c b/tests/chromaticities.c
index a57cead..963650c 100644
--- a/tests/chromaticities.c
+++ b/tests/chromaticities.c
@@ -66,8 +66,7 @@ test (void)
 }
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   babl_init ();
   if (test ())
diff --git a/tests/cmyk.c b/tests/cmyk.c
index 3b11e44..cc29963 100644
--- a/tests/cmyk.c
+++ b/tests/cmyk.c
@@ -23,8 +23,7 @@
 
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   int OK = 1;
 
diff --git a/tests/common.inc b/tests/common.inc
index bca5056..2016d04 100644
--- a/tests/common.inc
+++ b/tests/common.inc
@@ -1,26 +1,23 @@
-
+#include <stdlib.h>
 #include <math.h>
 #include "babl/babl-introspect.h"
 
 #define CHECK_CONV(test_name, componenttype, src_fmt, dst_fmt, src_pix, expected_pix) \
   {       \
-  const Babl *fish;       \
-  int i;       \
-  fish = babl_fish (src_fmt, dst_fmt);       \
+  const Babl *fish = babl_fish (src_fmt, dst_fmt);       \
   if (!fish)       \
     {       \
       printf ("  %s failed to make fish\n", test_name);       \
       OK = 0;       \
     }       \
-  for (i = 0; i < sizeof(src_pix)/sizeof(src_pix[0]); i ++)       \
+  for (size_t i = 0; i < sizeof(src_pix)/sizeof(src_pix[0]); i ++)       \
     {       \
-      int c;\
       componenttype result[10];       \
       babl_process (fish, src_pix[i], result, 1);       \
-      for (c = 0; c < sizeof(expected_pix[i])/sizeof(expected_pix[i][0]); c++) \
+      for (size_t c = 0; c < sizeof(expected_pix[i])/sizeof(expected_pix[i][0]); c++) \
       if (result[c] != expected_pix[i][c])       \
         {       \
-          printf (" %s failed #%i[%i]  got %i expected %i\n", test_name, i, c, result[c], expected_pix[i][c]);       \
+          printf (" %s failed #%zu[%zu]  got %i expected %i\n", test_name, i, c, result[c], expected_pix[i][c]); /* %zu: i and c are size_t */      \
           OK = 0;          \
           babl_introspect((Babl *)fish); \
         }       \
@@ -29,23 +26,20 @@
 
 #define CHECK_CONV_FLOAT(test_name, componenttype, max_error, src_fmt, dst_fmt, src_pix, expected_pix) \
   {       \
-  const Babl *fish;       \
-  int i;       \
-  fish = babl_fish (src_fmt, dst_fmt);       \
+  const Babl *fish = babl_fish (src_fmt, dst_fmt);       \
   if (!fish)       \
     {       \
       printf ("  %s failed to make fish\n", test_name);       \
       OK = 0;       \
     }       \
-  for (i = 0; i < sizeof(src_pix)/sizeof(src_pix[0]); i ++)       \
+  for (size_t i = 0; i < sizeof(src_pix)/sizeof(src_pix[0]); i ++)       \
     {       \
-      int c;\
       componenttype result[10];       \
       babl_process (fish, src_pix[i], result, 1);       \
-      for (c = 0; c < sizeof(expected_pix[i])/sizeof(expected_pix[i][0]); c++) \
+      for (size_t c = 0; c < sizeof(expected_pix[i])/sizeof(expected_pix[i][0]); c++) \
       if (fabs(result[c] - expected_pix[i][c]) > max_error)       \
         {       \
-          printf (" %s failed #%i[%i]  got %lf expected %lf\n", test_name, i, c, result[c], expected_pix[i][c]);       \
+          printf (" %s failed #%zu[%zu]  got %lf expected %lf\n", test_name, i, c, result[c], expected_pix[i][c]); /* %zu: i and c are size_t */      \
           OK = 0;          \
           babl_introspect((Babl *)fish); \
         }       \
diff --git a/tests/concurrency-stress-test.c b/tests/concurrency-stress-test.c
index a02a519..bf0ffff 100644
--- a/tests/concurrency-stress-test.c
+++ b/tests/concurrency-stress-test.c
@@ -49,8 +49,7 @@ babl_fish_path_stress_test_thread_func (void *not_used)
 }
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   pthread_t threads[N_THREADS];
   int       i;
diff --git a/tests/conversions.c b/tests/conversions.c
index 9503d04..d699b21 100644
--- a/tests/conversions.c
+++ b/tests/conversions.c
@@ -55,14 +55,13 @@
   };
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   putenv ("BABL_DEBUG_CONVERSIONS" "=" "1");
   putenv ("BABL_DEBUG_MISSING" "=" "1");
   babl_init ();
   
-  for (int i = 0; i < sizeof (fishes)/sizeof(fishes[0]);i ++)
+  for (size_t i = 0; i < sizeof (fishes)/sizeof(fishes[0]);i ++)
   {
     babl_fish (babl_format (fishes[i].from_format),
                babl_format (fishes[i].to_format));
diff --git a/tests/extract.c b/tests/extract.c
index ffd9f5c..06f143e 100644
--- a/tests/extract.c
+++ b/tests/extract.c
@@ -24,8 +24,7 @@
 #include "common.inc"
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   int OK = 1;
   babl_init ();
diff --git a/tests/float-to-8bit.c b/tests/float-to-8bit.c
index 1a3cafc..01d5997 100644
--- a/tests/float-to-8bit.c
+++ b/tests/float-to-8bit.c
@@ -26,8 +26,7 @@
 
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   int OK = 1;
 
diff --git a/tests/floatclamp.c b/tests/floatclamp.c
index 7960e13..7878e2e 100644
--- a/tests/floatclamp.c
+++ b/tests/floatclamp.c
@@ -26,8 +26,7 @@
 
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   int OK = 1;
   babl_init ();
diff --git a/tests/format_with_space.c b/tests/format_with_space.c
index 34f5332..c26384c 100644
--- a/tests/format_with_space.c
+++ b/tests/format_with_space.c
@@ -96,8 +96,7 @@ test3 (void)
 }
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   babl_init ();
   if (test2 ())
diff --git a/tests/grayscale_to_rgb.c b/tests/grayscale_to_rgb.c
index c3ec7c7..cd4063e 100644
--- a/tests/grayscale_to_rgb.c
+++ b/tests/grayscale_to_rgb.c
@@ -72,8 +72,7 @@ test (void)
 }
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   babl_init ();
   if (test ())
diff --git a/tests/hsl.c b/tests/hsl.c
index aaa1855..54659cf 100644
--- a/tests/hsl.c
+++ b/tests/hsl.c
@@ -23,8 +23,7 @@
 
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   int OK = 1;
 
diff --git a/tests/hsva.c b/tests/hsva.c
index 66b93b8..c2a224b 100644
--- a/tests/hsva.c
+++ b/tests/hsva.c
@@ -32,8 +32,7 @@
 
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   int OK = 1;
 
diff --git a/tests/meson.build b/tests/meson.build
index 56f5812..c8f0e51 100644
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -25,6 +25,7 @@ test_names = [
   'transparent',
   'alpha_symmetric_transform',
   'types',
+  'xyz_to_lab'
 ]
 if platform_unix
   test_names += [
@@ -41,7 +42,7 @@ foreach test_name : test_names
     test_name + '.c',
     include_directories: [rootInclude, bablInclude],
     link_with: babl,
-    dependencies: thread,
+    dependencies: [thread, lcms],
     export_dynamic: true,
     install: false,
   )
diff --git a/tests/n_components.c b/tests/n_components.c
index afe5d25..9cb8935 100644
--- a/tests/n_components.c
+++ b/tests/n_components.c
@@ -105,8 +105,7 @@ test (void)
 }
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   babl_init ();
   if (test ())
diff --git a/tests/n_components_cast.c b/tests/n_components_cast.c
index 86c6437..f654aa7 100644
--- a/tests/n_components_cast.c
+++ b/tests/n_components_cast.c
@@ -24,8 +24,7 @@
 #include "common.inc"
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   int OK = 1;
   babl_init ();
diff --git a/tests/nop.c b/tests/nop.c
index 0ea6fe0..1d8bbd2 100644
--- a/tests/nop.c
+++ b/tests/nop.c
@@ -20,8 +20,7 @@
 #include "babl.h"
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   babl_init ();
   babl_exit ();
diff --git a/tests/palette-concurrency-stress-test.c b/tests/palette-concurrency-stress-test.c
index a42b15d..9dee768 100644
--- a/tests/palette-concurrency-stress-test.c
+++ b/tests/palette-concurrency-stress-test.c
@@ -52,8 +52,7 @@ thread_proc (void *data)
 }
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   const Babl    *pal;
   const Babl    *pal_format;
diff --git a/tests/palette.c b/tests/palette.c
index 92651d0..3fd64c4 100644
--- a/tests/palette.c
+++ b/tests/palette.c
@@ -24,8 +24,7 @@
 #include "common.inc"
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   int OK = 1;
   babl_init ();
diff --git a/tests/rgb_to_bgr.c b/tests/rgb_to_bgr.c
index 4e63222..3759673 100644
--- a/tests/rgb_to_bgr.c
+++ b/tests/rgb_to_bgr.c
@@ -79,8 +79,7 @@ test (void)
 }
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   babl_init ();
   if (test ())
diff --git a/tests/rgb_to_ycbcr.c b/tests/rgb_to_ycbcr.c
index c02f743..6022211 100644
--- a/tests/rgb_to_ycbcr.c
+++ b/tests/rgb_to_ycbcr.c
@@ -86,8 +86,7 @@ test (void)
 }
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   babl_init ();
   if (test ())
diff --git a/tests/sanity.c b/tests/sanity.c
index 28158ee..b84c299 100644
--- a/tests/sanity.c
+++ b/tests/sanity.c
@@ -20,8 +20,7 @@
 #include "babl-internal.h"
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   babl_init ();
   if (!babl_sanity ())
diff --git a/tests/srgb_to_lab_u8.c b/tests/srgb_to_lab_u8.c
index b99538f..f8b04fc 100644
--- a/tests/srgb_to_lab_u8.c
+++ b/tests/srgb_to_lab_u8.c
@@ -66,8 +66,7 @@ test (void)
 }
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   babl_init ();
   if (test ())
diff --git a/tests/transparent.c b/tests/transparent.c
index 7cbc6ea..dd66366 100644
--- a/tests/transparent.c
+++ b/tests/transparent.c
@@ -45,8 +45,7 @@ clear_fish_db (void)
 }
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   int OK = 1;
   int i;
diff --git a/tests/xyz_to_lab.c b/tests/xyz_to_lab.c
new file mode 100644
index 0000000..c8e0b81
--- /dev/null
+++ b/tests/xyz_to_lab.c
@@ -0,0 +1,74 @@
+/* babl - dynamically extendable universal pixel conversion library.
+ * Copyright (C) 2005, Øyvind Kolås.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, see
+ * <https://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include <babl/babl.h>
+#include <stdio.h>
+
+#define PIXELS       4
+#define TOLERANCE    0.05
+
+float source_buf [PIXELS * 3] =
+{ 0.950, 1.000, 1.089,
+  1.000, 0.000, 0.000,
+  0.000, 1.000, 0.000,
+  0.000, 0.000, 1.000
+};
+
+float reference_buf [PIXELS * 3] =
+{ 100.00, -2.467186, -19.400648,
+  0.0, 437.147125, 0.0,
+  100.0, -431.034485, 172.4137,
+  0.0, 0.0, -185.6406,
+};
+
+float destination_buf [PIXELS * 3];
+
+static int
+test (void)
+{
+  int i;
+  int OK = 1;
+  /* Convert all PIXELS entries of source_buf from "CIE XYZ float" to "CIE Lab float" in one call. */
+  babl_process (babl_fish ("CIE XYZ float", "CIE Lab float"),
+                source_buf, destination_buf,
+                PIXELS);
+  /* Check each of the PIXELS * 3 output components against reference_buf. */
+  for (i = 0; i < PIXELS * 3; i++)
+    {
+      if (fabs (1.0 * destination_buf[i] - reference_buf[i]) > TOLERANCE)
+        {
+          fprintf (stderr, "%2i (component: %2i%%3=%i, test no: %2i/3=%i) is %f should be %f\n",
+                    i, i, i % 3, i, i / 3, destination_buf[i], reference_buf[i]);
+          OK = 0;  /* no early exit: every out-of-tolerance component gets reported */
+        }
+    }
+  if (!OK)
+    return -1;  /* at least one component differed by more than TOLERANCE */
+  return 0;
+}
+
+int
+main (void)
+{
+  babl_init ();
+  if (test ())  /* test () returns non-zero when any component is out of tolerance */
+    return -1;  /* failure path returns without calling babl_exit () */
+  babl_exit ();
+  return 0;
+}
diff --git a/tools/babl-benchmark.c b/tools/babl-benchmark.c
index 1fd9e78..2310185 100644
--- a/tools/babl-benchmark.c
+++ b/tools/babl-benchmark.c
@@ -25,15 +25,28 @@
 #define random  rand
 #endif
 
-int ITERATIONS = 1;
-#define  N_PIXELS (512*1024)  // a too small batch makes the test set live
+#ifdef _WIN32
+/* On Windows setenv() does not exist, using _putenv_s() instead. The overwrite
+ * arg is ignored (i.e. same as always 1).
+ */
+#define setenv(name,value,overwrite) _putenv_s(name, value)
+#endif
+
+int ITERATIONS = 4;
+#define  N_PIXELS (1024*1024)  // a too small batch makes the test set live
                                // in l2 cache skewing results
 
                                // we could also add a cache purger..
 
+int unit_pixels = 1; // use megapixels per second instead of bytes
+
+int global_relative_scale = 1;
 
+int exclude_identity = 1;
 #define  N_BYTES  N_PIXELS * (4 * 8)
 
+#define BAR_WIDTH 40
+
 static const char *
 unicode_hbar (int    width, 
               double fraction)
@@ -61,122 +74,328 @@ unicode_hbar (int    width,
   return ret;
 }
 
+int show_details = 0;
+int progress = 1;
+
+#include <stdio.h>
+#include <stdint.h>
+
+#if 0
+ // more accurate, the 2100 constant is roughly
+ // what is needed on my system to convert to 1.5ghz
+ // constant clock performance
+
+inline uint64_t bench_ticks (void) {
+    uint32_t lo, hi;
+    __asm__ __volatile__ (
+      "xorl %%eax, %%eax\n"
+      "cpuid\n"
+      "rdtsc\n"
+      : "=a" (lo), "=d" (hi)
+      :
+      : "%ebx", "%ecx");
+    return ((uint64_t)hi << 32 | lo) / 2100;
+}
+#else
+static inline uint64_t bench_ticks (void) { return babl_ticks();} /* benchmark timestamp source: forwards to babl_ticks () */
+#endif
+
+#if 0
+main()
+{
+    unsigned long long x;
+    unsigned long long y;
+    x = rdtsc();
+    printf("%lld\n",x);
+    y = rdtsc();
+    printf("%lld\n",y);
+    printf("it took this long to call printf: %lld\n",y-x);
+}
+#endif
+
 static int
-test (void)
+test (int set_no)
 {
   int i, j;
   int OK = 1;
 
+  //printf("\e[3g");
+  //printf("     \eH       \eH    \eH      \eH ");
+
   char *src_data = babl_malloc (N_BYTES);
   char *dst_data = babl_malloc (N_BYTES);
-  double sum = 0;
+
+#define default_set(space, out_space) \
+       babl_format_with_space("RGBA float", babl_space(space)), \
+       babl_format_with_space("RaGaBaA float", babl_space(space)), \
+       babl_format_with_space("R'G'B'A float", babl_space(space)), \
+       babl_format_with_space("RGBA u16", babl_space(out_space)), \
+       babl_format_with_space("R'G'B'A u8", babl_space(out_space)) 
+
+#define cmyk_set(space, out_space) \
+       babl_format_with_space("cmykA float", babl_space(space)), \
+       babl_format_with_space("camayakaA float", babl_space(space)), \
+       babl_format_with_space("RGBA u16", babl_space(out_space)), \
+       babl_format_with_space("R'G'B'A u8", babl_space(out_space)) 
+
+  const Babl **formats=NULL;
+  const Babl *format_sets[][20]={
+        { babl_format_with_space("R'G'B' u8", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("R'G'B' u16", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("R'G'B' half", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("R'G'B' float", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("RGB u8", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("RGB u16", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("RGB half", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("RGB float", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("R'G'B'A u8", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("R'G'B'A u16", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("R'G'B'A half", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("R'G'B'A float", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("RGBA u8", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("RGBA u16", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("RGBA half", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("RGBA float", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("Y u8", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("Y u16", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("Y half", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("Y float", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("Y' u8", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("Y' u16", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("Y' half", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("Y' float", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("Y'A u8", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("Y'A u16", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("Y'A half", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("Y'A float", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("YA u8", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("YA u16", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("YA half", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("YA float", babl_space("sRGB")), default_set("sRGB", "sRGB"), NULL },
 
 
-  const Babl *formats[]={
-#if 0
-     babl_format("R'G'B'A u8"),
-     babl_format("Y float"),
-     babl_format("R'G'B'A u16"),
-     babl_format_with_space("RGBA float",     babl_space("ProPhoto")),
-     babl_format_with_space("R'G'B' u16",     babl_space("ProPhoto")),
-#endif
-       //babl_format("R'G'B'A u8"),
-       babl_format("R'G'B'A u16"),
-       //babl_format_with_space("R'G'B'A u8", babl_space("ProPhoto")),
-       //babl_format_with_space("Y'A u8", babl_space("ProPhoto")),
-       babl_format_with_space("Y'A u16", babl_space("ProPhoto")),
-       babl_format_with_space("Y' u16", babl_space("ProPhoto")),
-       //babl_format_with_space("Y' u8", babl_space("ProPhoto")),
-       babl_format_with_space("Y float", babl_space("ProPhoto")),
-       babl_format_with_space("YaA float", babl_space("ProPhoto")),
-       babl_format_with_space("YA float", babl_space("ProPhoto")),
-       //babl_format_with_space("YA u16", babl_space("ProPhoto")),
-       //babl_format_with_space("R'G'B'A half", babl_space("ProPhoto")),
-       babl_format_with_space("R'G'B'A float", babl_space("ProPhoto")),
-       babl_format_with_space("RaGaBaA float", babl_space("ProPhoto")),
-       babl_format_with_space("cairo-RGB24", babl_space("Adobe")),
-       babl_format_with_space("cairo-ARGB32", babl_space("Adobe")),
+
+
+
+        { babl_format_with_space("R'G'B' u8", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("R'G'B' u16", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("R'G'B' half", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("R'G'B' float", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("RGB u8", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("RGB u16", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("RGB half", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("RGB float", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("R'G'B'A u8", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("R'G'B'A u16", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("R'G'B'A half", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("R'G'B'A float", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("RGBA u8", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("RGBA u16", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("RGBA half", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("RGBA float", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("Y u8", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("Y u16", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("Y half", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("Y float", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("Y' u8", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("Y' u16", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("Y' half", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("Y' float", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("Y'A u8", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("Y'A u16", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("Y'A half", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("Y'A float", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("YA u8", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("YA u16", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("YA half", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+        { babl_format_with_space("YA float", babl_space("ProPhoto")), default_set("ProPhoto", "Rec2020"), NULL },
+
+        { babl_format_with_space("R'G'B' u8", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("R'G'B' u16", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("R'G'B' half", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("R'G'B' float", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("RGB u8", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("RGB u16", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("RGB half", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("RGB float", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("R'G'B'A u8", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("R'G'B'A u16", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("R'G'B'A half", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("R'G'B'A float", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("RGBA u8", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("RGBA u16", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("RGBA half", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("RGBA float", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("Y u8", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("Y u16", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("Y half", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("Y float", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("Y' u8", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("Y' u16", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("Y' half", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("Y' float", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("Y'A u8", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("Y'A u16", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("Y'A half", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("Y'A float", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("YA u8", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("YA u16", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("YA half", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        { babl_format_with_space("YA float", babl_space("sRGB")), default_set("sRGB", "Rec2020"), NULL },
+        
+        
+        { babl_format_with_space("CMYKA float", babl_space("sRGB")), cmyk_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("CMYKA u16", babl_space("sRGB")), cmyk_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("CMYKA u8", babl_space("sRGB")), cmyk_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("cmykA float", babl_space("sRGB")), cmyk_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("cmykA u16", babl_space("sRGB")), cmyk_set("sRGB", "sRGB"), NULL },
+        { babl_format_with_space("cmykA u8", babl_space("sRGB")), cmyk_set("sRGB", "sRGB"), NULL },
 
      };
-  int n_formats = sizeof (formats) / sizeof (formats[0]);
+
+  int n_formats = 0;
+  int n_sets = sizeof(format_sets)/sizeof(format_sets[0]);
+
   const Babl *fishes[50 * 50];
   double mbps[50 * 50] = {0,};
   long n;
+
+  int set_iter = 0;
+  int first_run = 1;
+  float max_throughput = 0;
   double max = 0.0;
 
-  assert (n_formats < 50);
+  if (set_no > n_sets-1) set_no = n_sets-1;
+
+  while (set_iter < n_sets)
+  {
+  double sum = 0;
+          n_formats = 0;
+  if (set_no >= 0)
+    formats=&format_sets[set_no][0];
+  else
+    formats=&format_sets[set_iter][0];
+
 
  for (i = 0; i < N_BYTES; i++)
    src_data[i] = random();
 
+ fprintf (stdout, "\n\n");
+ //fprintf (stdout, "set %i:\n", set_iter);
+ for (i = 0; formats[i]; i++)
+ {
+  // fprintf (stdout, "  %s\n", babl_get_name (formats[i]));
+   n_formats++;
+ }
+ //fprintf (stdout, "\n");
 
- fprintf (stdout,"%i iterations of %i pixels, mb/s is for sum of source and destinations bytes\n", ITERATIONS, N_PIXELS);
+ //fprintf (stdout,"%i iterations of %i pixels, mp/s is for sum of source and destinations bytes\n", ITERATIONS, N_PIXELS);
+ 
 
  n = 0;
- for (i = 0; i < n_formats; i++)
-   for (j = 0; j < n_formats; j++)
-   if (i != j)
+ for (i = 0; formats[i]; i++)
+   for (j = 0; formats[j]; j++)
+   //if (i != j && i != (n_formats - 1) && (i==0 || j!=n_formats-1))
+   if (i != j && i != (n_formats - 1) && (i==0 || j!=n_formats-1) && (j==0 || i==0) && (!exclude_identity || formats[i] != formats[j]))
    {
       const Babl *fish = babl_fish (formats[i], formats[j]);
       long end, start;
       int iters = ITERATIONS;
+      if (progress)
+      fprintf (stderr, "%s to %s               \r", babl_get_name (formats[i]),
+                                                   babl_get_name (formats[j]));
 
-      fprintf (stderr, "%s to %s          \r", babl_get_name (formats[i]),
-                                               babl_get_name (formats[j]));
-      fflush (0);
-
-      /* a quarter round of warmup */
-      babl_process (fish, src_data, dst_data, N_PIXELS * 0.25);
-      start = babl_ticks ();
+      /* a round of warmup */
+      babl_process (fish, src_data, dst_data, N_PIXELS/4);
+      start = bench_ticks ();
       while (iters--)
       {
         babl_process (fish, src_data, dst_data, N_PIXELS);
       }
-      end = babl_ticks ();
+      end = bench_ticks ();
       fishes[n] = fish;
-      mbps [n] = (babl_format_get_bytes_per_pixel (formats[i]) +
-                           babl_format_get_bytes_per_pixel (formats[j])) *
-              (N_PIXELS * ITERATIONS / 1024.0 / 1024.0) / ((end-start)/(1000.0*1000.0));
+      mbps [n] = (N_PIXELS * ITERATIONS / 1000.0 / 1000.0) / ((end-start)/(1000.0*1000.0));
+      if (!unit_pixels)
+        mbps [n] *= (babl_format_get_bytes_per_pixel (formats[i]) +
+		      babl_format_get_bytes_per_pixel (formats[j]));
 
       sum += mbps[n];
-      if (mbps[n] > max)
+#if 1
+      if (mbps[n] > max && first_run)
         max = mbps[n];
+#endif
       n++;
    }
 
+  if (progress)
+  fprintf (stderr, "                                                       \r");
+
+  {
+  float throughput  = sum / n;
+  if (throughput > max_throughput)
+     max_throughput = throughput;
+  fprintf (stdout, "%s %03.3f mp/s\tWorkflow: %s to %s\n\n",
+                      unicode_hbar(BAR_WIDTH, throughput / max_throughput), throughput,
+                      babl_get_name (formats[0]),
+                      babl_get_name (formats[n_formats-1]));
+  }
+
+      if (mbps[n] > max && first_run)
+        max = mbps[n];
+
+  if (global_relative_scale) max = max_throughput;
+
  n = 0;
- for (i = 0; i < n_formats; i++)
-   for (j = 0; j < n_formats; j++)
-   if (i != j)
+ for (i = 0; formats[i]; i++)
+   for (j = 0; formats[j]; j++)
+   //if (i != j && i != (n_formats - 1) && (i==0 || j!=n_formats-1))
+   if (i != j && i != (n_formats - 1) && (i==0 || j!=n_formats-1) && (j==0 || i==0) && (!exclude_identity || formats[i] != formats[j]))
    {
-      fprintf (stdout, "%s %03.1f mb/s\t%s to %s %.9f",
-                      unicode_hbar(16, mbps[n] / max),
+      fprintf (stdout, "%s %03.3f m%s/s\t",
+                      unicode_hbar(BAR_WIDTH, mbps[n] / max),
                       mbps[n],
-                      babl_get_name (formats[i]),
-                      babl_get_name (formats[j]),
-                      fishes[n]->fish.error);
+		      unit_pixels?"p":"b");
+
+
       if (fishes[n]->class_type == BABL_FISH_REFERENCE)
       {
-        fprintf (stdout, "[R]");
+        fprintf (stdout, "REF ");
       }
       else if (fishes[n]->class_type == BABL_FISH_PATH)
       {
-        int k;
-        //fprintf (stdout, "[%d]", fishes[n]->fish_path.conversion_list->count);
-        for (k = 0; k < fishes[n]->fish_path.conversion_list->count; k++)
-        {
-          fprintf (stdout, "\n\t\t\t\t%s", babl_get_name (
-                   fishes[n]->fish_path.conversion_list->items[k]));
-        }
+#if BABL_MICRO_VERSION>=89
+        if (fishes[n]->fish_path.u8_lut)
+          fprintf (stdout, "LUT ");
+        else
+
+#endif
+        fprintf (stdout, "  %d ", fishes[n]->fish_path.conversion_list->count);
+      }
+
+      fprintf (stdout, "%.9f %s to %s ",
+                      fishes[n]->fish.error,
+                      babl_get_name (formats[i]),
+                      babl_get_name (formats[j]));
+
+      if (fishes[n]->class_type == BABL_FISH_PATH && show_details)
+      {
+      for (int k = 0; k < fishes[n]->fish_path.conversion_list->count; k++)
+      {
+        fprintf (stdout, "\n  %s", babl_get_name (
+                 fishes[n]->fish_path.conversion_list->items[k]));
+      }
+      fprintf (stdout, "\n");
       }
       fprintf (stdout, "\n");
       n++;
    }
-  fprintf (stdout, "\n%s %03.1f mb/s\taverage\n",
-                      unicode_hbar(16, sum / (n_formats * n_formats - n_formats) / max),
-                      sum / (n_formats * n_formats - n_formats));
 
   fflush (0);
+  set_iter++;
+  first_run = 0;
+  if (set_no>=0)
+          return !OK;
+  }
 
   if (!OK)
     return -1;
@@ -187,10 +406,22 @@ int
 main (int    argc,
       char **argv)
 {
-  if (argv[1]) ITERATIONS = atoi (argv[1]);
+  //if (argv[1]) ITERATIONS = atoi (argv[1]);
+  setenv ("BABL_INHIBIT_CACHE", "1", 1);
   babl_init ();
-  if (test ())
-    return -1;
+  if (argv[1] && argv[2]) show_details = 1;
+  if (argv[1])
+  {
+    if (test (atoi(argv[1])))
+      return -1;
+ // if (test (atoi(argv[1])))
+ //   return -1;
+  }
+  else
+  {
+    test (-1);
+//  test (-1);
+  }
   babl_exit ();
   return 0;
 }
diff --git a/tools/babl-lut-verify.c b/tools/babl-lut-verify.c
new file mode 100644
index 0000000..c9f9616
--- /dev/null
+++ b/tools/babl-lut-verify.c
@@ -0,0 +1,286 @@
+#include <stdlib.h>
+#include <stdint.h>
+#include "config.h"
+#include <math.h>
+#include "babl-internal.h"
+
+#define PIXELS (127*256) /* less than the threshold for generating (presumably a LUT; confirm) */
+
+#ifndef HAVE_SRANDOM
+#define random rand
+#endif
+
+#ifdef _WIN32
+/* On Windows setenv() does not exist, using _putenv_s() instead. The overwrite
+ * arg is ignored (i.e. same as always 1).
+ */
+#define setenv(name,value,overwrite) _putenv_s(name, value)
+#endif
+
+/* Converts random pixels source -> dest four times; returns the summed RGB
+ * distance between first and last result, both viewed as R'G'B'A u8. */
+static double
+test_generic (const Babl *source, const Babl *dest)
+{
+  uint8_t *src = malloc (PIXELS*16);
+  uint8_t *dst = malloc (PIXELS*16);
+  uint8_t *dst2 = malloc (PIXELS*16);
+  uint8_t *dstb = malloc (PIXELS*16);
+  uint8_t *dst2b = malloc (PIXELS*16);
+  double error = 0.0;
+
+  for (int i = 0; i < PIXELS * 16; i++)
+      src[i] = random();
+
+  babl_process ( babl_fish (source, dest), src, dst, PIXELS);
+  babl_process ( babl_fish (source, dest), src, dst2, PIXELS);
+  babl_process ( babl_fish (source, dest), src, dst2, PIXELS);
+  babl_process ( babl_fish (source, dest), src, dst2, PIXELS);
+  babl_process ( babl_fish (dest, babl_format_with_space ("R'G'B'A u8", dest)), dst2, dst2b, PIXELS);
+  babl_process ( babl_fish (dest, babl_format_with_space ("R'G'B'A u8", dest)), dst, dstb, PIXELS);
+
+  for (int i = 0; i < PIXELS; i++)
+  {
+    /* both operands come from the u8 renditions; alpha is not compared */
+    int dr = dstb[i*4+0] - dst2b[i*4+0];
+    int dg = dstb[i*4+1] - dst2b[i*4+1];
+    int db = dstb[i*4+2] - dst2b[i*4+2];
+    error += sqrt (dr*dr + dg*dg + db*db);
+  }
+
+  free (src);
+  free (dst);
+  free (dst2);
+  free (dstb);
+  free (dst2b);
+  return error;
+}
+
+/* Runs test_generic on every format pair below and prints one result line each. */
+int main (void)
+{
+  double error = 0;
+  setenv ("BABL_INHIBIT_CACHE", "1", 1);
+  babl_init ();
+  {
+  /* rows are { source format, destination format } pairs, grouped by source */
+  const Babl *format_sets[][2]={
+          {
+           babl_format_with_space ("R'G'B' u8", babl_space("ProPhoto")),
+           babl_format_with_space ("R'G'B' u8", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("R'G'B' u8", babl_space("ProPhoto")),
+           babl_format_with_space ("R'G'B'A u8", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("R'G'B' u8", babl_space("ProPhoto")),
+           babl_format_with_space ("R'G'B'A half", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("R'G'B' u8", babl_space("ProPhoto")),
+           babl_format_with_space ("R'G'B'A u16", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("R'G'B' u8", babl_space("ProPhoto")),
+           babl_format_with_space ("R'G'B'A float", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("R'G'B' u8", babl_space("ProPhoto")),
+           babl_format_with_space ("Y'A float", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("R'G'B' u8", babl_space("ProPhoto")),
+           babl_format_with_space ("Y'A u16", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("R'G'B' u8", babl_space("ProPhoto")),
+           babl_format_with_space ("Y'A u8", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("R'G'B' u8", babl_space("ProPhoto")),
+           babl_format_with_space ("Y' float", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("R'G'B' u8", babl_space("ProPhoto")),
+           babl_format_with_space ("Y' u16", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("R'G'B' u8", babl_space("ProPhoto")),
+           babl_format_with_space ("Y' u8", babl_space("Rec2020"))
+          },
+          /* R'G'B'A u8 sources */
+          {
+           babl_format_with_space ("R'G'B'A u8", babl_space("ProPhoto")),
+           babl_format_with_space ("R'G'B'A u8", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("R'G'B'A u8", babl_space("ProPhoto")),
+           babl_format_with_space ("R'G'B'A half", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("R'G'B'A u8", babl_space("ProPhoto")),
+           babl_format_with_space ("R'G'B'A u16", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("R'G'B'A u8", babl_space("ProPhoto")),
+           babl_format_with_space ("R'G'B'A float", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("R'G'B'A u8", babl_space("ProPhoto")),
+           babl_format_with_space ("Y'A float", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("R'G'B'A u8", babl_space("ProPhoto")),
+           babl_format_with_space ("Y'A u16", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("R'G'B'A u8", babl_space("ProPhoto")),
+           babl_format_with_space ("Y'A half", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("R'G'B'A u8", babl_space("ProPhoto")),
+           babl_format_with_space ("Y'A u8", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("R'G'B'A u8", babl_space("ProPhoto")),
+           babl_format_with_space ("Y' float", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("R'G'B'A u8", babl_space("ProPhoto")),
+           babl_format_with_space ("Y' u16", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("R'G'B'A u8", babl_space("ProPhoto")),
+           babl_format_with_space ("Y' u8", babl_space("Rec2020"))
+          },
+          /* YaA half sources */
+          {
+           babl_format_with_space ("YaA half", babl_space("ProPhoto")),
+           babl_format_with_space ("R'G'B'A u8", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("YaA half", babl_space("ProPhoto")),
+           babl_format_with_space ("R'G'B'A half", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("YaA half", babl_space("ProPhoto")),
+           babl_format_with_space ("R'G'B'A u16", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("YaA half", babl_space("ProPhoto")),
+           babl_format_with_space ("R'G'B'A float", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("YaA half", babl_space("ProPhoto")),
+           babl_format_with_space ("Y'A float", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("YaA half", babl_space("ProPhoto")),
+           babl_format_with_space ("Y'A u16", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("YaA half", babl_space("ProPhoto")),
+           babl_format_with_space ("Y'A u8", babl_space("Rec2020"))
+          },
+          /* YA half sources */
+          {
+           babl_format_with_space ("YA half", babl_space("ProPhoto")),
+           babl_format_with_space ("R'G'B'A u8", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("YA half", babl_space("ProPhoto")),
+           babl_format_with_space ("R'G'B'A half", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("YA half", babl_space("ProPhoto")),
+           babl_format_with_space ("R'G'B'A u16", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("YA half", babl_space("ProPhoto")),
+           babl_format_with_space ("R'G'B'A float", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("YA half", babl_space("ProPhoto")),
+           babl_format_with_space ("Y'A float", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("YA half", babl_space("ProPhoto")),
+           babl_format_with_space ("Y'A u16", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("YA half", babl_space("ProPhoto")),
+           babl_format_with_space ("Y'A u8", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("YA half", babl_space("ProPhoto")),
+           babl_format_with_space ("Y' float", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("YA half", babl_space("ProPhoto")),
+           babl_format_with_space ("Y' u16", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("YA half", babl_space("ProPhoto")),
+           babl_format_with_space ("Y' u8", babl_space("Rec2020"))
+          },
+          /* Y half sources */
+          {
+           babl_format_with_space ("Y half", babl_space("ProPhoto")),
+           babl_format_with_space ("R'G'B'A u8", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("Y half", babl_space("ProPhoto")),
+           babl_format_with_space ("R'G'B'A half", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("Y half", babl_space("ProPhoto")),
+           babl_format_with_space ("R'G'B'A u16", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("Y half", babl_space("ProPhoto")),
+           babl_format_with_space ("R'G'B'A float", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("Y half", babl_space("ProPhoto")),
+           babl_format_with_space ("Y'A float", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("Y half", babl_space("ProPhoto")),
+           babl_format_with_space ("Y'A u16", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("Y half", babl_space("ProPhoto")),
+           babl_format_with_space ("Y'A u8", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("Y half", babl_space("ProPhoto")),
+           babl_format_with_space ("Y' float", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("Y half", babl_space("ProPhoto")),
+           babl_format_with_space ("Y' u16", babl_space("Rec2020"))
+          },
+          {
+           babl_format_with_space ("Y half", babl_space("ProPhoto")),
+           babl_format_with_space ("Y' u8", babl_space("Rec2020"))
+          }
+
+  };
+
+  /* report "OK" for a zero error, otherwise the error divided by PIXELS*4 */
+  for (size_t i = 0; i < sizeof (format_sets) / sizeof(format_sets[0]); i++)
+  {
+    fprintf (stdout, "%s to %s: ", babl_get_name (format_sets[i][0]),
+                                  babl_get_name (format_sets[i][1]));
+    error = test_generic (format_sets[i][0], format_sets[i][1]);
+    if (error != 0.0)
+      fprintf (stdout, "%.20f\n", error/(PIXELS*4));
+    else
+      fprintf (stdout, "OK\n");
+  }
+  }
+
+  babl_exit ();
+  return 0;
+}
diff --git a/tools/babl-verify.c b/tools/babl-verify.c
index de3bbd1..b76f707 100644
--- a/tools/babl-verify.c
+++ b/tools/babl-verify.c
@@ -4,11 +4,23 @@
 #include "babl/babl-internal.h"
 
 //#define SPACE1 babl_space("ProPhoto")
-#define SPACE1 babl_space("Apple")
-//#define SPACE1 babl_space("sRGB")
+//#define SPACE1 babl_space("Apple")
+#define SPACE1 babl_space("sRGB")
 //#define SPACE2 babl_space("Apple")
 
-static int
+#ifdef _WIN32
+/* On Windows setenv() does not exist, so _putenv_s() is used instead. The
+ * overwrite argument is ignored (_putenv_s() behaves as if it were always 1).
+ */
+#define setenv(name,value,overwrite) _putenv_s(name, value)
+#endif
+
+int
+file_get_contents (const char  *path,
+                         char       **contents,
+                         long        *length,
+                         void        *error);
+int
 file_get_contents (const char  *path,
                          char       **contents,
                          long        *length,
@@ -62,7 +74,7 @@ main (int    argc,
   int final = 0;
   const Babl *fish;
   const Babl *SPACE2 = NULL;
-
+  setenv ("BABL_INHIBIT_CACHE", "1", 1);
 
   if (argc < 3)
   {
@@ -80,7 +92,7 @@ main (int    argc,
 
   babl_init ();
 
-#define ICC_PATH "/tmp/my.icc"
+//#define ICC_PATH "/tmp/my.icc"
 //#define ICC_PATH "/usr/share/color/icc/colord/AppleRGB.icc"
 //#define ICC_PATH "/tmp/ACEScg-elle-V2-labl.icc"
 //#define ICC_PATH "/tmp/ACEScg-elle-V2-g10.icc"
@@ -89,11 +101,11 @@ main (int    argc,
 
 
   {
-    char *icc_data = NULL;
-    long     length = 0;
-    file_get_contents (ICC_PATH, &icc_data, &length, NULL);
-    SPACE2 = babl_space_from_icc (icc_data, length, BABL_ICC_INTENT_RELATIVE_COLORIMETRIC, NULL);
-    //SPACE2 = babl_space ("sRGB");
+    //char *icc_data = NULL;
+    //long     length = 0;
+    //file_get_contents (ICC_PATH, &icc_data, &length, NULL);
+    //SPACE2 = babl_space_from_icc (icc_data, length, BABL_ICC_INTENT_RELATIVE_COLORIMETRIC, NULL);
+    SPACE2 = babl_space ("sRGB");
   }
 
   fish = babl_fish (babl_format_with_space(argv[1], SPACE1), babl_format_with_space (argv[2], SPACE2));
diff --git a/tools/babl_fish_path_fitness.c b/tools/babl_fish_path_fitness.c
index 9b0cfc5..8a955be 100644
--- a/tools/babl_fish_path_fitness.c
+++ b/tools/babl_fish_path_fitness.c
@@ -79,8 +79,8 @@ static void
 init_test_pixels (void)
 {
   static int done = 0;
-  int i = 0;
-  int pix_no = 0;
+  size_t i = 0;
+  size_t pix_no = 0;
   srandom (111);
 
   if (done)
diff --git a/tools/introspect.c b/tools/introspect.c
index 5cd667f..26a32d8 100644
--- a/tools/introspect.c
+++ b/tools/introspect.c
@@ -20,8 +20,7 @@
 #include "babl-internal.h"
 
 int
-main (int    argc,
-      char **argv)
+main (void)
 {
   babl_init ();
   babl_introspect (NULL);
diff --git a/tools/meson.build b/tools/meson.build
index 12180de..89ccf40 100644
--- a/tools/meson.build
+++ b/tools/meson.build
@@ -1,6 +1,7 @@
 
 tool_names = [
   'babl_fish_path_fitness',
+  'babl-lut-verify',
   'babl-benchmark',
   'babl-html-dump',
   'babl-icc-dump',
@@ -17,7 +18,7 @@ foreach tool_name : tool_names
     tool_name + '.c',
     include_directories: [rootInclude, bablInclude],
     link_with: babl,
-    dependencies: [math, thread],
+    dependencies: [math, thread, lcms],
     install: false,
   )
 
diff --git a/tools/trc-validator.c b/tools/trc-validator.c
index 7bf1eaa..2e03381 100644
--- a/tools/trc-validator.c
+++ b/tools/trc-validator.c
@@ -185,8 +185,7 @@ to_linear_ref (float x)
 }
 
 int 
-main (int    argc, 
-      char **argv)
+main (void)
 {
   int i;
   float max_diff = 0.0;
author	Jeremy Bícha <jbicha@ubuntu.com>	2023-06-12 18:06:10 +0100
committer	Jeremy Bícha <jbicha@ubuntu.com>	2023-06-12 18:06:10 +0100
commit	b996e42137121c616e778befb4aab16bfc633d7b (patch)
tree	d2ed9b4a33ee787f2dc82964c43d45447766c7d6
parent	ca2ad4d784f40fc6a4afde8ec0b46cb92ad72176 (diff)
parent	6c53911389104733e23c52f55d002d11ed6b9458 (diff)