From 1eaef2065fd65c2a926f031d80577235593106f2 Mon Sep 17 00:00:00 2001 From: Kyle Siefring Date: Fri, 1 Jun 2018 21:01:58 -0400 Subject: Add missing subpel variance functions for x86. Add the by 128 functions that were missing. While we are at it, fill out rectangular avx2 functions. Change-Id: If990ce92d4c23d6225cd11d3815d600e819a8e2c --- aom_dsp/aom_dsp_rtcd_defs.pl | 38 +++++---- aom_dsp/x86/variance_avx2.c | 138 ++++++++++++++++-------------- aom_dsp/x86/variance_sse2.c | 199 ++++++++++++++++++++++--------------------- 3 files changed, 197 insertions(+), 178 deletions(-) (limited to 'aom_dsp') diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index 9ba5496f7..670e34a6e 100755 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl @@ -950,11 +950,14 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_variance4x8 sse2 msa/; specialize qw/aom_variance4x4 sse2 msa/; + specialize qw/aom_sub_pixel_variance128x128 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance128x64 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance64x128 avx2 sse2 ssse3/; specialize qw/aom_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/; - specialize qw/aom_sub_pixel_variance64x32 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x64 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance64x32 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x64 avx2 msa sse2 ssse3/; specialize qw/aom_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x16 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x16 avx2 msa sse2 ssse3/; specialize qw/aom_sub_pixel_variance16x32 msa sse2 ssse3/; specialize qw/aom_sub_pixel_variance16x16 neon msa sse2 ssse3/; specialize qw/aom_sub_pixel_variance16x8 msa sse2 ssse3/; @@ -964,19 +967,22 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_sub_pixel_variance4x8 msa sse2 ssse3/; specialize qw/aom_sub_pixel_variance4x4 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x64 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x16 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance128x128 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance128x64 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x128 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x32 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x64 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x16 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/; + specialize 
qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/; specialize qw/aom_variance4x16 sse2/; specialize qw/aom_variance16x4 sse2 avx2/; diff --git a/aom_dsp/x86/variance_avx2.c b/aom_dsp/x86/variance_avx2.c index 3cf55323a..7d6b7d287 100644 --- a/aom_dsp/x86/variance_avx2.c +++ b/aom_dsp/x86/variance_avx2.c @@ -240,71 +240,83 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2( const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, int height, unsigned int *sseptr); -unsigned int aom_sub_pixel_variance64x64_avx2(const uint8_t *src, - int src_stride, int x_offset, - int y_offset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - unsigned int sse1; - const int se1 = aom_sub_pixel_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1); - unsigned int sse2; - const int se2 = - aom_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset, - dst + 32, dst_stride, 64, &sse2); - const int se = se1 + se2; - unsigned int variance; - *sse = sse1 + sse2; - - variance = *sse - (uint32_t)(((int64_t)se * se) >> 12); - _mm256_zeroupper(); - return variance; -} - -unsigned int aom_sub_pixel_variance32x32_avx2(const uint8_t *src, - int src_stride, int x_offset, - int y_offset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - const int se = aom_sub_pixel_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse); - - const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10); - _mm256_zeroupper(); - return variance; -} +#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, wlog2, hlog2) \ + unsigned int aom_sub_pixel_variance##w##x##h##_avx2( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_variance##wf##xh_avx2( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \ + &sse2); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \ + } -unsigned int aom_sub_pixel_avg_variance64x64_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { - unsigned int sse1; - const int se1 = aom_sub_pixel_avg_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1); - unsigned int sse2; - const int se2 = aom_sub_pixel_avg_variance32xh_avx2( - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32, - 64, 64, &sse2); - const int se = se1 + se2; - unsigned int variance; - - *sse = sse1 + sse2; - - variance = *sse - (uint32_t)(((int64_t)se * se) >> 12); - _mm256_zeroupper(); - return variance; -} +AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 7, 7); 
+AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 7, 6); +AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 6, 7); +AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 6, 6); +AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 6, 5); +AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6); +AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5); +AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4); + +#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2) \ + unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \ + const uint8_t *sec) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + const uint8_t *sec_ptr = sec; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_avg_variance##wf##xh_avx2( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ + sec_ptr, w, hf, &sse2); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + sec_ptr += hf * w; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + sec += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \ + } -unsigned int aom_sub_pixel_avg_variance32x32_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { - // Process 32 elements in parallel. - const int se = aom_sub_pixel_avg_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse); - - const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10); - _mm256_zeroupper(); - return variance; -} +AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 7, 7); +AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 7, 6); +AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 6, 7); +AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 6, 6); +AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 6, 5); +AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 64, 32, 5, 6); +AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 32, 32, 5, 5); +AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 16, 32, 5, 4); static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) { const __m256i d = diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c index 4ef1682c2..c8c90a7dc 100644 --- a/aom_dsp/x86/variance_sse2.c +++ b/aom_dsp/x86/variance_sse2.c @@ -344,57 +344,56 @@ DECLS(ssse3); #undef DECLS #undef DECL -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ - unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \ - const uint8_t *src, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ - unsigned int sse; \ - int se = aom_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \ - y_offset, dst, dst_stride, \ - h, &sse, NULL, NULL); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = aom_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = aom_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = aom_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - 
*sse_ptr = sse; \ - return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \ + &sse2, NULL, NULL); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } -#define FNS(opt) \ - FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ - FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ - FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ - FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ - FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ - FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ - FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ - FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \ - FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \ - FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \ - FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \ - FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \ - FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)); \ - FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ - FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ - FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \ - FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \ - FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \ +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \ + FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \ + FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \ + FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \ + FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \ + FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \ + FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)); \ + FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ + FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ + FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \ + FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \ FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) FNS(sse2); @@ -420,58 +419,60 @@ DECLS(ssse3); #undef DECL #undef DECLS -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ - unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \ - const uint8_t *src, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, int dst_stride, unsigned int *sseptr, \ - const uint8_t *sec) { \ - unsigned int sse; \ - int se = 
aom_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ - NULL, NULL); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ - sec + 16, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ - sec + 32, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ - sec + 48, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sseptr = sse; \ - return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \ + const uint8_t *sec) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + const uint8_t *sec_ptr = sec; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ + sec_ptr, w, hf, &sse2, NULL, NULL); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + sec_ptr += hf * w; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + sec += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } -#define FNS(opt) \ - FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ - FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ - FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ - FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ - FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ - FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ - FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ - FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \ - FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \ - FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \ - FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \ - FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \ - FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)); \ - FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ - FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ - FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \ - FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \ - FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \ +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \ + FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \ + FN(8, 16, 8, 3, 4, opt, (uint32_t), 
(int32_t)); \ + FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \ + FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \ + FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \ + FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)); \ + FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ + FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ + FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \ + FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \ FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) FNS(sse2); -- cgit v1.2.3
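
For reference, below is a minimal plain-C sketch (not part of the patch) of the tiling arithmetic that the new AOM_SUB_PIXEL_VAR_AVX2 / FN macros generate. column_diff_sums_c is a hypothetical scalar stand-in for the 32-wide sub-pel helper (it does no sub-pel filtering); variance_128x128_tiled mirrors what AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 7, 7) would expand to, including the AOMMIN(h, 64) height cap that the patch's "Avoid overflow in helper by capping height" comment refers to, and the final variance = sse - se*se/(w*h) step done as a shift by (wlog2 + hlog2).

#include <stdint.h>

#define AOMMIN(a, b) ((a) < (b) ? (a) : (b))

/* Hypothetical scalar stand-in for the 32-wide AVX2 helper: returns the
 * signed sum of (src - dst) over a 32-wide column of `h` rows and writes the
 * sum of squared differences to *sse.  No sub-pel interpolation is done
 * here; only the tiling/combination arithmetic is illustrated. */
static int column_diff_sums_c(const uint8_t *src, int src_stride,
                              const uint8_t *dst, int dst_stride, int h,
                              unsigned int *sse) {
  int sum = 0;
  unsigned int s = 0;
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < 32; ++c) {
      const int d = src[r * src_stride + c] - dst[r * dst_stride + c];
      sum += d;
      s += (unsigned int)(d * d);
    }
  }
  *sse = s;
  return sum;
}

/* Sketch of the expansion of AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 7, 7):
 * walk the block in 32-wide, at-most-64-tall tiles, accumulate se/sse,
 * then variance = sse - se^2 / (128 * 128), with the division implemented
 * as a right shift by (wlog2 + hlog2) = 14. */
static unsigned int variance_128x128_tiled(const uint8_t *src, int src_stride,
                                           const uint8_t *dst, int dst_stride,
                                           unsigned int *sse_ptr) {
  const int w = 128, h = 128, wf = 32, wlog2 = 7, hlog2 = 7;
  const int hf = AOMMIN(h, 64); /* cap tile height, as in the patch */
  unsigned int sse = 0;
  int se = 0;
  for (int i = 0; i < w / wf; ++i) {
    const uint8_t *src_ptr = src + i * wf;
    const uint8_t *dst_ptr = dst + i * wf;
    for (int j = 0; j < h / hf; ++j) {
      unsigned int sse2;
      const int se2 = column_diff_sums_c(src_ptr, src_stride, dst_ptr,
                                         dst_stride, hf, &sse2);
      src_ptr += hf * src_stride;
      dst_ptr += hf * dst_stride;
      se += se2;
      sse += sse2;
    }
  }
  *sse_ptr = sse;
  return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2));
}

With identical src and dst the function returns 0; the macro-generated AVX2 versions compute the same quantities, but 32 columns at a time via the aom_sub_pixel_variance32xh_avx2 helper, and the avg variants additionally average the prediction with sec before differencing.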