summaryrefslogtreecommitdiff
path: root/aom_dsp
diff options
context:
space:
mode:
authorKyle Siefring <kylesiefring@gmail.com>2018-06-01 21:01:58 -0400
committerKyle Siefring <kylesiefring@gmail.com>2018-06-09 03:35:30 +0000
commit1eaef2065fd65c2a926f031d80577235593106f2 (patch)
tree3b804b03043e20f130a883e499202e11bdc922d8 /aom_dsp
parent4efdfd9fd17cb58d022d22dddc03d35d73597a6b (diff)
Add missing subpel variance functions for x86.
Add the by 128 functions that were missing. While we are at it, fill out rectangular avx2 functions. Change-Id: If990ce92d4c23d6225cd11d3815d600e819a8e2c
Diffstat (limited to 'aom_dsp')
-rwxr-xr-xaom_dsp/aom_dsp_rtcd_defs.pl38
-rw-r--r--aom_dsp/x86/variance_avx2.c138
-rw-r--r--aom_dsp/x86/variance_sse2.c199
3 files changed, 197 insertions, 178 deletions
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 9ba5496f7..670e34a6e 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -950,11 +950,14 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_variance4x8 sse2 msa/;
specialize qw/aom_variance4x4 sse2 msa/;
+ specialize qw/aom_sub_pixel_variance128x128 avx2 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance128x64 avx2 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance64x128 avx2 sse2 ssse3/;
specialize qw/aom_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance64x32 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance32x64 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance64x32 avx2 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance32x64 avx2 msa sse2 ssse3/;
specialize qw/aom_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance32x16 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance32x16 avx2 msa sse2 ssse3/;
specialize qw/aom_sub_pixel_variance16x32 msa sse2 ssse3/;
specialize qw/aom_sub_pixel_variance16x16 neon msa sse2 ssse3/;
specialize qw/aom_sub_pixel_variance16x8 msa sse2 ssse3/;
@@ -964,19 +967,22 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_sub_pixel_variance4x8 msa sse2 ssse3/;
specialize qw/aom_sub_pixel_variance4x4 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance32x64 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance32x16 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance128x128 avx2 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance128x64 avx2 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance64x128 avx2 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance64x32 avx2 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x64 avx2 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x16 avx2 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
specialize qw/aom_variance4x16 sse2/;
specialize qw/aom_variance16x4 sse2 avx2/;
diff --git a/aom_dsp/x86/variance_avx2.c b/aom_dsp/x86/variance_avx2.c
index 3cf55323a..7d6b7d287 100644
--- a/aom_dsp/x86/variance_avx2.c
+++ b/aom_dsp/x86/variance_avx2.c
@@ -240,71 +240,83 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2(
const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
int height, unsigned int *sseptr);
-unsigned int aom_sub_pixel_variance64x64_avx2(const uint8_t *src,
- int src_stride, int x_offset,
- int y_offset, const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- unsigned int sse1;
- const int se1 = aom_sub_pixel_variance32xh_avx2(
- src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1);
- unsigned int sse2;
- const int se2 =
- aom_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset,
- dst + 32, dst_stride, 64, &sse2);
- const int se = se1 + se2;
- unsigned int variance;
- *sse = sse1 + sse2;
-
- variance = *sse - (uint32_t)(((int64_t)se * se) >> 12);
- _mm256_zeroupper();
- return variance;
-}
-
-unsigned int aom_sub_pixel_variance32x32_avx2(const uint8_t *src,
- int src_stride, int x_offset,
- int y_offset, const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- const int se = aom_sub_pixel_variance32xh_avx2(
- src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
-
- const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10);
- _mm256_zeroupper();
- return variance;
-}
+#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, wlog2, hlog2) \
+ unsigned int aom_sub_pixel_variance##w##x##h##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
+ /*Avoid overflow in helper by capping height.*/ \
+ const int hf = AOMMIN(h, 64); \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_variance##wf##xh_avx2( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
+ &sse2); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \
+ }
-unsigned int aom_sub_pixel_avg_variance64x64_avx2(
- const uint8_t *src, int src_stride, int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
- unsigned int sse1;
- const int se1 = aom_sub_pixel_avg_variance32xh_avx2(
- src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1);
- unsigned int sse2;
- const int se2 = aom_sub_pixel_avg_variance32xh_avx2(
- src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32,
- 64, 64, &sse2);
- const int se = se1 + se2;
- unsigned int variance;
-
- *sse = sse1 + sse2;
-
- variance = *sse - (uint32_t)(((int64_t)se * se) >> 12);
- _mm256_zeroupper();
- return variance;
-}
+AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 7, 7);
+AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 7, 6);
+AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 6, 7);
+AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 6, 6);
+AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 6, 5);
+AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6);
+AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5);
+AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4);
+
+#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2) \
+ unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \
+ const uint8_t *sec) { \
+ /*Avoid overflow in helper by capping height.*/ \
+ const int hf = AOMMIN(h, 64); \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ const uint8_t *sec_ptr = sec; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_avg_variance##wf##xh_avx2( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
+ sec_ptr, w, hf, &sse2); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ sec_ptr += hf * w; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ sec += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \
+ }
-unsigned int aom_sub_pixel_avg_variance32x32_avx2(
- const uint8_t *src, int src_stride, int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
- // Process 32 elements in parallel.
- const int se = aom_sub_pixel_avg_variance32xh_avx2(
- src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse);
-
- const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10);
- _mm256_zeroupper();
- return variance;
-}
+AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 7, 7);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 7, 6);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 6, 7);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 6, 6);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 6, 5);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 64, 32, 5, 6);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 32, 32, 5, 5);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 16, 32, 5, 4);
static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) {
const __m256i d =
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index 4ef1682c2..c8c90a7dc 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -344,57 +344,56 @@ DECLS(ssse3);
#undef DECLS
#undef DECL
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
- unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \
- const uint8_t *src, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
- unsigned int sse; \
- int se = aom_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
- y_offset, dst, dst_stride, \
- h, &sse, NULL, NULL); \
- if (w > wf) { \
- unsigned int sse2; \
- int se2 = aom_sub_pixel_variance##wf##xh_##opt( \
- src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
- &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = aom_sub_pixel_variance##wf##xh_##opt( \
- src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
- &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- se2 = aom_sub_pixel_variance##wf##xh_##opt( \
- src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
- &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- *sse_ptr = sse; \
- return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+ unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
+ /*Avoid overflow in helper by capping height.*/ \
+ const int hf = AOMMIN(h, 64); \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
+ &sse2, NULL, NULL); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
}
-#define FNS(opt) \
- FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
- FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \
- FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \
- FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \
- FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \
- FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \
- FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
- FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \
- FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \
- FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \
- FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \
- FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \
- FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)); \
- FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \
- FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \
- FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \
- FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \
- FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
+ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \
+ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \
+ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
+ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \
+ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \
+ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \
+ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \
+ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \
+ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
+ FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \
+ FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \
+ FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \
+ FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \
+ FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \
+ FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)); \
+ FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \
+ FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \
+ FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \
+ FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \
+ FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \
FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
FNS(sse2);
@@ -420,58 +419,60 @@ DECLS(ssse3);
#undef DECL
#undef DECLS
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
- unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \
- const uint8_t *src, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst, int dst_stride, unsigned int *sseptr, \
- const uint8_t *sec) { \
- unsigned int sse; \
- int se = aom_sub_pixel_avg_variance##wf##xh_##opt( \
- src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
- NULL, NULL); \
- if (w > wf) { \
- unsigned int sse2; \
- int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \
- sec + 16, w, h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \
- sec + 32, w, h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \
- sec + 48, w, h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- *sseptr = sse; \
- return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+ unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \
+ const uint8_t *sec) { \
+ /*Avoid overflow in helper by capping height.*/ \
+ const int hf = AOMMIN(h, 64); \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ const uint8_t *sec_ptr = sec; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
+ sec_ptr, w, hf, &sse2, NULL, NULL); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ sec_ptr += hf * w; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ sec += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
}
-#define FNS(opt) \
- FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
- FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \
- FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \
- FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \
- FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \
- FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \
- FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
- FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \
- FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \
- FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \
- FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \
- FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \
- FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)); \
- FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \
- FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \
- FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \
- FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \
- FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
+ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \
+ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \
+ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
+ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \
+ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \
+ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \
+ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \
+ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \
+ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
+ FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \
+ FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \
+ FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \
+ FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \
+ FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \
+ FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)); \
+ FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \
+ FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \
+ FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \
+ FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \
+ FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \
FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
FNS(sse2);