/* babl - dynamically extendable universal pixel conversion library.
 * Copyright (C) 2019 Ell
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 3 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General
 * Public License along with this library; if not, see
 * <https://www.gnu.org/licenses/>.
 */

#include "config.h"

#if defined(USE_AVX2)

/* AVX 2 */
#include <immintrin.h>

#include <stdint.h>
#include <stdlib.h>

#include "babl.h"
#include "babl-cpuaccel.h"
#include "extensions/util.h"
#include "extensions/avx2-int8-tables.h"

/* Number of entries in the linear_to_gamma lookup table. */
#define TABLE_SIZE (sizeof (linear_to_gamma) / sizeof (linear_to_gamma[0]))

/* Factor mapping a float in [0, 1] onto a linear_to_gamma index, so that
 * 1.0 lands on the last entry.
 */
#define SCALE      ((float) (TABLE_SIZE - 1))

/* Scalar float -> u8 conversion of one color sample through the
 * linear_to_gamma table.  Negative input yields 0; input above 1.0, as well
 * as NaN (which fails both comparisons), yields 255.  Advances both `src` and
 * `dst` by one element.
 */
#define CVT1(src, dst)                                                 \
  do                                                                   \
    {                                                                  \
      float x = *src;                                                  \
                                                                       \
      if (x < 0.0f)                                                    \
        *dst = 0;                                                      \
      else if (x <= 1.0f)                                              \
        *dst = linear_to_gamma[(int) (SCALE * x + 0.5f)];              \
      else /* x > 1.0f || isnan (x) */                                 \
        *dst = 255;                                                    \
                                                                       \
      src++;                                                           \
      dst++;                                                           \
    }                                                                  \
  while (0)

/* Scalar float -> u8 conversion of one alpha sample: no table lookup, just
 * scaling by 255 with rounding (add 0.5, then truncate on assignment).
 * Out-of-range and NaN input is handled as in CVT1.  Advances both `src` and
 * `dst` by one element.
 */
#define CVTA1(src, dst)                                                \
  do                                                                   \
    {                                                                  \
      float x = *src;                                                  \
                                                                       \
      if (x < 0.0f)                                                    \
        *dst = 0;                                                      \
      else if (x <= 1.0f)                                              \
        *dst = 255.0f * x + 0.5f;                                      \
      else /* x > 1.0f || isnan (x) */                                 \
        *dst = 255;                                                    \
                                                                       \
      src++;                                                           \
      dst++;                                                           \
    }                                                                  \
  while (0)

/* Converts `samples` single-component float samples at `src` to u8 at `dst`
 * through the linear_to_gamma table.
 *
 * conversion: not used by this function.
 * src:        input floats.
 * dst:        output bytes, one per input float.
 * samples:    number of floats to convert.
 *
 * Samples are converted one at a time until `src` reaches a 32-byte boundary,
 * then 32 at a time with AVX2, and the remainder one at a time again.
 */
static inline void
conv_yF_linear_y8_gamma (const Babl  *conversion,
                         const float *src,
                         uint8_t     *dst,
                         long         samples)
{
  const __v8sf *src_vec;
  __m256i      *dst_vec;
  const __v8sf  scale = _mm256_set1_ps (SCALE);
  const __v8sf  zero  = _mm256_setzero_ps ();
  const __v8sf  half  = _mm256_set1_ps (0.5f);

  /* scalar head: the vector loop reads `src` through a __v8sf pointer */
  while ((uintptr_t) src % 32 && samples > 0)
    {
      CVT1 (src, dst);

      samples--;
    }

  src_vec = (const __v8sf *) src;
  dst_vec = (__m256i      *) dst;

  while (samples >= 32)
    {
      __m256i i32_0, i32_1, i32_2, i32_3;
      __m256i i16_01, i16_23;
      __m256i i8_0123;

      /* Turns 8 floats into 8 table indices clamped to [0, SCALE], then
       * replaces each index with the table entry it selects.
       *
       * With a NaN operand, max/min return their second argument.  Passing
       * the value second to max and first to min therefore lets NaN end up
       * as `scale`, i.e. 255, the same result CVT1 gives.
       */
#define CVT8(i)                                                        \
      do                                                               \
        {                                                              \
          __v8sf yyyyyyyy;                                             \
                                                                       \
          yyyyyyyy  = scale * src_vec[i] + half;                       \
          yyyyyyyy  = _mm256_max_ps (zero, yyyyyyyy);                  \
          yyyyyyyy  = _mm256_min_ps (yyyyyyyy, scale);                 \
          i32_##i   = _mm256_cvttps_epi32 (yyyyyyyy);                  \
          i32_##i   = _mm256_i32gather_epi32 (linear_to_gamma,         \
                                              i32_##i, 4);             \
        }                                                              \
      while (0)

      CVT8 (0);
      CVT8 (1);

      i16_01 = _mm256_packus_epi32 (i32_0, i32_1);

      CVT8 (2);
      CVT8 (3);

      i16_23 = _mm256_packus_epi32 (i32_2, i32_3);

      i8_0123 = _mm256_packus_epi16 (i16_01, i16_23);

      /* the packs work per 128-bit lane and leave the 4-byte groups
       * interleaved; put them back in source order
       */
      i8_0123 = _mm256_permutevar8x32_epi32 (
        i8_0123,
        _mm256_setr_epi32 (0, 4, 1, 5, 2, 6, 3, 7));

      _mm256_storeu_si256 (dst_vec, i8_0123);

#undef CVT8

      src_vec += 4;
      dst_vec++;

      samples -= 32;
    }

  src = (const float *) src_vec;
  dst = (uint8_t     *) dst_vec;

  /* scalar tail */
  while (samples > 0)
    {
      CVT1 (src, dst);

      samples--;
    }
}

/* Converts `samples` two-component (color, alpha) float samples at `src` to
 * u8 pairs at `dst`.  The color component goes through the linear_to_gamma
 * table, the alpha component is only scaled to [0, 255].
 *
 * conversion: not used by this function.
 * src:        input floats, 2 per sample.
 * dst:        output bytes, 2 per sample.
 * samples:    number of samples to convert.
 *
 * The AVX2 path handles 16 samples per iteration; whatever it does not cover
 * is converted by the scalar loop at the end.
 */
static inline void
conv_yaF_linear_ya8_gamma (const Babl  *conversion,
                           const float *src,
                           uint8_t     *dst,
                           long         samples)
{
  /* The head loop below advances `src` in whole-sample (2-float) steps, so
   * it can only reach a 32-byte boundary when `src` starts on a multiple of
   * 8; otherwise everything is left to the scalar loop.
   */
  if ((uintptr_t) src % 8 == 0)
    {
      const __v8sf  *src_vec;
      __m256i       *dst_vec;
      const __v8sf   scale = _mm256_setr_ps (SCALE, 255.0f, SCALE, 255.0f,
                                             SCALE, 255.0f, SCALE, 255.0f);
      const __v8sf   zero  = _mm256_setzero_ps ();
      const __v8sf   half  = _mm256_set1_ps (0.5f);
      /* selects the color lanes: only those are looked up in the table */
      const __m256i  mask  = _mm256_setr_epi32 (-1, 0, -1, 0,
                                                -1, 0, -1, 0);

      while ((uintptr_t) src % 32 && samples > 0)
        {
          CVT1  (src, dst);
          CVTA1 (src, dst);

          samples--;
        }

      src_vec = (const __v8sf *) src;
      dst_vec = (__m256i      *) dst;

      while (samples >= 16)
        {
          __m256i i32_0, i32_1, i32_2, i32_3;
          __m256i i16_01, i16_23;
          __m256i i8_0123;

          /* Scales and clamps 8 floats per lane-specific factor; the color
           * lanes are then replaced by table entries, while the alpha lanes
           * keep the clamped value, which is already the output.
           *
           * With a NaN operand, max/min return their second argument.
           * Passing the value second to max and first to min therefore lets
           * NaN end up as `scale`, i.e. 255, the same result CVT1 and CVTA1
           * give.
           */
#define CVT8(i)                                                        \
          do                                                           \
            {                                                          \
              __v8sf yayayaya;                                         \
                                                                       \
              yayayaya  = scale * src_vec[i] + half;                   \
              yayayaya  = _mm256_max_ps (zero, yayayaya);              \
              yayayaya  = _mm256_min_ps (yayayaya, scale);             \
              i32_##i   = _mm256_cvttps_epi32 (yayayaya);              \
              i32_##i   = _mm256_mask_i32gather_epi32 (i32_##i,        \
                                                       linear_to_gamma,\
                                                       i32_##i, mask,  \
                                                       4);             \
            }                                                          \
          while (0)

          CVT8 (0);
          CVT8 (1);

          i16_01 = _mm256_packus_epi32 (i32_0, i32_1);

          CVT8 (2);
          CVT8 (3);

          i16_23 = _mm256_packus_epi32 (i32_2, i32_3);

          i8_0123 = _mm256_packus_epi16 (i16_01, i16_23);

          /* undo the per-lane interleaving introduced by the packs */
          i8_0123 = _mm256_permutevar8x32_epi32 (
            i8_0123,
            _mm256_setr_epi32 (0, 4, 1, 5, 2, 6, 3, 7));

          _mm256_storeu_si256 (dst_vec, i8_0123);

#undef CVT8

          src_vec += 4;
          dst_vec++;

          samples -= 16;
        }

      src = (const float *) src_vec;
      dst = (uint8_t     *) dst_vec;
    }

  /* scalar tail (or the whole buffer, when the vector path was skipped) */
  while (samples > 0)
    {
      CVT1  (src, dst);
      CVTA1 (src, dst);

      samples--;
    }
}

/* Converts `samples` three-component float samples to u8.  All three
 * components are treated alike, so this simply runs the single-component
 * conversion over 3 * samples values.
 */
static void
conv_rgbF_linear_rgb8_gamma (const Babl  *conversion,
                             const float *src,
                             uint8_t     *dst,
                             long         samples)
{
  conv_yF_linear_y8_gamma (conversion, src, dst, 3 * samples);
}

/* Converts `samples` four-component (3 color + alpha) float samples at `src`
 * to u8 quadruples at `dst`.  The color components go through the
 * linear_to_gamma table, the alpha component is only scaled to [0, 255].
 *
 * conversion: not used by this function.
 * src:        input floats, 4 per sample.
 * dst:        output bytes, 4 per sample.
 * samples:    number of samples to convert.
 *
 * The AVX2 path handles 8 samples per iteration; whatever it does not cover
 * is converted by the scalar loop at the end.
 */
static inline void
conv_rgbaF_linear_rgba8_gamma (const Babl  *conversion,
                               const float *src,
                               uint8_t     *dst,
                               long         samples)
{
  /* The head loop below advances `src` in whole-sample (4-float) steps, so
   * it can only reach a 32-byte boundary when `src` starts on a multiple of
   * 16; otherwise everything is left to the scalar loop.
   */
  if ((uintptr_t) src % 16 == 0)
    {
      const __v8sf  *src_vec;
      __m256i       *dst_vec;
      const __v8sf   scale = _mm256_setr_ps (SCALE, SCALE, SCALE, 255.0f,
                                             SCALE, SCALE, SCALE, 255.0f);
      const __v8sf   zero  = _mm256_setzero_ps ();
      const __v8sf   half  = _mm256_set1_ps (0.5f);
      /* selects the color lanes: only those are looked up in the table */
      const __m256i  mask  = _mm256_setr_epi32 (-1, -1, -1, 0,
                                                -1, -1, -1, 0);

      while ((uintptr_t) src % 32 && samples > 0)
        {
          CVT1  (src, dst);
          CVT1  (src, dst);
          CVT1  (src, dst);
          CVTA1 (src, dst);

          samples--;
        }

      src_vec = (const __v8sf *) src;
      dst_vec = (__m256i      *) dst;

      while (samples >= 8)
        {
          __m256i i32_0, i32_1, i32_2, i32_3;
          __m256i i16_01, i16_23;
          __m256i i8_0123;

          /* Scales and clamps 8 floats per lane-specific factor; the color
           * lanes are then replaced by table entries, while the alpha lanes
           * keep the clamped value, which is already the output.
           *
           * With a NaN operand, max/min return their second argument.
           * Passing the value second to max and first to min therefore lets
           * NaN end up as `scale`, i.e. 255, the same result CVT1 and CVTA1
           * give.
           */
#define CVT8(i)                                                        \
          do                                                           \
            {                                                          \
              __v8sf rgbargba;                                         \
                                                                       \
              rgbargba  = scale * src_vec[i] + half;                   \
              rgbargba  = _mm256_max_ps (zero, rgbargba);              \
              rgbargba  = _mm256_min_ps (rgbargba, scale);             \
              i32_##i   = _mm256_cvttps_epi32 (rgbargba);              \
              i32_##i   = _mm256_mask_i32gather_epi32 (i32_##i,        \
                                                       linear_to_gamma,\
                                                       i32_##i, mask,  \
                                                       4);             \
            }                                                          \
          while (0)

          CVT8 (0);
          CVT8 (1);

          i16_01 = _mm256_packus_epi32 (i32_0, i32_1);

          CVT8 (2);
          CVT8 (3);

          i16_23 = _mm256_packus_epi32 (i32_2, i32_3);

          i8_0123 = _mm256_packus_epi16 (i16_01, i16_23);

          /* undo the per-lane interleaving introduced by the packs */
          i8_0123 = _mm256_permutevar8x32_epi32 (
            i8_0123,
            _mm256_setr_epi32 (0, 4, 1, 5, 2, 6, 3, 7));

          _mm256_storeu_si256 (dst_vec, i8_0123);

#undef CVT8

          src_vec += 4;
          dst_vec++;

          samples -= 8;
        }

      src = (const float *) src_vec;
      dst = (uint8_t     *) dst_vec;
    }

  /* scalar tail (or the whole buffer, when the vector path was skipped) */
  while (samples > 0)
    {
      CVT1  (src, dst);
      CVT1  (src, dst);
      CVT1  (src, dst);
      CVTA1 (src, dst);

      samples--;
    }
}

#undef CVT1
#undef CVTA1

/* Scalar u8 -> float conversion of one color sample: a lookup in the first
 * 256 entries of gamma_to_linear.  Advances both pointers.
 */
#define CVT1(src, dst) \
  (*dst++ = gamma_to_linear[*src++])

/* Scalar u8 -> float conversion of one alpha sample: a lookup in the
 * gamma_to_linear entries starting at index 256.  Advances both pointers.
 */
#define CVTA1(src, dst) \
  (*dst++ = gamma_to_linear[*src++ + 256])

/* Converts `samples` single-component u8 samples at `src` to float at `dst`
 * through the gamma_to_linear table.
 *
 * conversion: not used by this function.
 * src:        input bytes.
 * dst:        output floats, one per input byte.
 * samples:    number of bytes to convert.
 *
 * Samples are converted one at a time until `dst` reaches a 32-byte boundary
 * (the vector loop stores through a __v8sf pointer), then 16 at a time with
 * AVX2, and the remainder one at a time again.
 */
static inline void
conv_y8_gamma_yF_linear (const Babl    *conversion,
                         const uint8_t *src,
                         float         *dst,
                         long           samples)
{
  const __m128i *src_vec;
  __v8sf        *dst_vec;

  while ((uintptr_t) dst % 32 && samples > 0)
    {
      CVT1 (src, dst);

      samples--;
    }

  src_vec = (const __m128i *) src;
  dst_vec = (__v8sf        *) dst;

  while (samples >= 16)
    {
      __m128i i8_01;
      __m256i i32_0;

      i8_01 = _mm_loadu_si128 (src_vec++);

      /* low 8 bytes -> 8 table indices -> 8 floats */
      i32_0     = _mm256_cvtepu8_epi32 (i8_01);
      *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);

      /* swap the 64-bit halves to bring the high 8 bytes down */
      i8_01 = _mm_shuffle_epi32 (i8_01, _MM_SHUFFLE (1, 0, 3, 2));

      i32_0     = _mm256_cvtepu8_epi32 (i8_01);
      *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);

      samples -= 16;
    }

  src = (const uint8_t *) src_vec;
  dst = (float         *) dst_vec;

  while (samples > 0)
    {
      CVT1 (src, dst);

      samples--;
    }
}

/* Converts `samples` two-component (color, alpha) u8 samples at `src` to
 * float pairs at `dst` through the gamma_to_linear table; alpha uses the
 * table entries starting at index 256.
 *
 * conversion: not used by this function.
 * src:        input bytes, 2 per sample.
 * dst:        output floats, 2 per sample.
 * samples:    number of samples to convert.
 *
 * Samples are converted one at a time until `dst` reaches a 32-byte boundary,
 * then 8 at a time with AVX2, and the remainder one at a time again.
 */
static inline void
conv_ya8_gamma_yaF_linear (const Babl    *conversion,
                           const uint8_t *src,
                           float         *dst,
                           long           samples)
{
  const __m128i *src_vec;
  __v8sf        *dst_vec;
  /* added to the indices so that the alpha lanes read from entry 256 on */
  const __m256i  offset = _mm256_setr_epi32 (0, 256, 0, 256,
                                             0, 256, 0, 256);

  while ((uintptr_t) dst % 32 && samples > 0)
    {
      CVT1  (src, dst);
      CVTA1 (src, dst);

      samples--;
    }

  src_vec = (const __m128i *) src;
  dst_vec = (__v8sf        *) dst;

  while (samples >= 8)
    {
      __m128i i8_01;
      __m256i i32_0;

      i8_01 = _mm_loadu_si128 (src_vec++);

      /* low 8 bytes -> 8 table indices -> 8 floats */
      i32_0      = _mm256_cvtepu8_epi32 (i8_01);
      i32_0      = _mm256_add_epi32 (i32_0, offset);
      *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);

      /* swap the 64-bit halves to bring the high 8 bytes down */
      i8_01 = _mm_shuffle_epi32 (i8_01, _MM_SHUFFLE (1, 0, 3, 2));

      i32_0      = _mm256_cvtepu8_epi32 (i8_01);
      i32_0      = _mm256_add_epi32 (i32_0, offset);
      *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);

      samples -= 8;
    }

  src = (const uint8_t *) src_vec;
  dst = (float         *) dst_vec;

  while (samples > 0)
    {
      CVT1  (src, dst);
      CVTA1 (src, dst);

      samples--;
    }
}

/* Converts `samples` three-component u8 samples to float.  All three
 * components are treated alike, so this simply runs the single-component
 * conversion over 3 * samples values.
 */
static inline void
conv_rgb8_gamma_rgbF_linear (const Babl    *conversion,
                             const uint8_t *src,
                             float         *dst,
                             long           samples)
{
  conv_y8_gamma_yF_linear (conversion, src, dst, 3 * samples);
}

/* Converts `samples` four-component (3 color + alpha) u8 samples at `src` to
 * float quadruples at `dst` through the gamma_to_linear table; alpha uses the
 * table entries starting at index 256.
 *
 * conversion: not used by this function.
 * src:        input bytes, 4 per sample.
 * dst:        output floats, 4 per sample.
 * samples:    number of samples to convert.
 *
 * Samples are converted one at a time until `dst` reaches a 32-byte boundary,
 * then 4 at a time with AVX2, and the remainder one at a time again.
 */
static inline void
conv_rgba8_gamma_rgbaF_linear (const Babl    *conversion,
                               const uint8_t *src,
                               float         *dst,
                               long           samples)
{
  const __m128i *src_vec;
  __v8sf        *dst_vec;
  /* added to the indices so that the alpha lanes read from entry 256 on */
  const __m256i  offset = _mm256_setr_epi32 (0, 0, 0, 256,
                                             0, 0, 0, 256);

  while ((uintptr_t) dst % 32 && samples > 0)
    {
      CVT1  (src, dst);
      CVT1  (src, dst);
      CVT1  (src, dst);
      CVTA1 (src, dst);

      samples--;
    }

  src_vec = (const __m128i *) src;
  dst_vec = (__v8sf        *) dst;

  while (samples >= 4)
    {
      __m128i i8_01;
      __m256i i32_0;

      i8_01 = _mm_loadu_si128 (src_vec++);

      /* low 8 bytes -> 8 table indices -> 8 floats */
      i32_0      = _mm256_cvtepu8_epi32 (i8_01);
      i32_0      = _mm256_add_epi32 (i32_0, offset);
      *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);

      /* swap the 64-bit halves to bring the high 8 bytes down */
      i8_01 = _mm_shuffle_epi32 (i8_01, _MM_SHUFFLE (1, 0, 3, 2));

      i32_0      = _mm256_cvtepu8_epi32 (i8_01);
      i32_0      = _mm256_add_epi32 (i32_0, offset);
      *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);

      samples -= 4;
    }

  src = (const uint8_t *) src_vec;
  dst = (float         *) dst_vec;

  while (samples > 0)
    {
      CVT1  (src, dst);
      CVT1  (src, dst);
      CVT1  (src, dst);
      CVTA1 (src, dst);

      samples--;
    }
}

#undef CVT1
#undef CVTA1

#endif /* defined(USE_AVX2) */

int init (void);

/* Extension entry point.  When built with USE_AVX2 and the running CPU
 * reports AVX2 support, registers the float <-> u8 conversions defined above
 * for the Y, YA, RGB and RGBA layouts, in both directions.  Always returns 0.
 */
int
init (void)
{
#if defined(USE_AVX2)

  const Babl *yF_linear = babl_format_new (
    babl_model ("Y"),
    babl_type ("float"),
    babl_component ("Y"),
    NULL);
  const Babl *y8_gamma = babl_format_new (
    babl_model ("Y'"),
    babl_type ("u8"),
    babl_component ("Y'"),
    NULL);
  const Babl *yaF_linear = babl_format_new (
    babl_model ("YA"),
    babl_type ("float"),
    babl_component ("Y"),
    babl_component ("A"),
    NULL);
  const Babl *ya8_gamma = babl_format_new (
    babl_model ("Y'A"),
    babl_type ("u8"),
    babl_component ("Y'"),
    babl_component ("A"),
    NULL);
  const Babl *rgbF_linear = babl_format_new (
    babl_model ("RGB"),
    babl_type ("float"),
    babl_component ("R"),
    babl_component ("G"),
    babl_component ("B"),
    NULL);
  const Babl *rgb8_gamma = babl_format_new (
    babl_model ("R'G'B'"),
    babl_type ("u8"),
    babl_component ("R'"),
    babl_component ("G'"),
    babl_component ("B'"),
    NULL);
  const Babl *rgbaF_linear = babl_format_new (
    babl_model ("RGBA"),
    babl_type ("float"),
    babl_component ("R"),
    babl_component ("G"),
    babl_component ("B"),
    babl_component ("A"),
    NULL);
  const Babl *rgba8_gamma = babl_format_new (
    babl_model ("R'G'B'A"),
    babl_type ("u8"),
    babl_component ("R'"),
    babl_component ("G'"),
    babl_component ("B'"),
    babl_component ("A"),
    NULL);

  /* Registers the <src>_linear -> <dst>_gamma conversion and its inverse,
   * each under the "linear" key.
   */
#define CONV(src, dst)                                                \
  do                                                                  \
    {                                                                 \
      babl_conversion_new (src ## _linear,                            \
                           dst ## _gamma,                             \
                           "linear",                                  \
                           conv_ ## src ## _linear_ ## dst ## _gamma, \
                           NULL);                                     \
                                                                      \
      babl_conversion_new (dst ## _gamma,                             \
                           src ## _linear,                            \
                           "linear",                                  \
                           conv_ ## dst ## _gamma_ ## src ## _linear, \
                           NULL);                                     \
    }                                                                 \
  while (0)

  /* the code may be compiled in yet run on a CPU without AVX2 */
  if ((babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_AVX2))
    {
      CONV (yF,    y8);
      CONV (yaF,   ya8);
      CONV (rgbF,  rgb8);
      CONV (rgbaF, rgba8);
    }

#endif /* defined(USE_AVX2) */

  return 0;
}