author    Neil Birkbeck <birkbeck@google.com>    2018-06-12 08:03:48 -0700
committer Yaowu Xu <yaowu@google.com>            2018-06-15 17:01:39 +0000
commit    04721792760c4cdf52b39b579391441a0927e349 (patch)
tree      43b59ebde47ccbf66b4463709356cc8f3229805f /aom_dsp
parent    58f109babf7af8bfe5f0f9b50ae78da4fe7f809a (diff)
Add 2d Wiener denoiser for regrainer

This also adds support for a 2d FFT (float) to be used for both denoising
and noise power spectral density estimation.

Change-Id: Ie95b44280bb301dfd3f0cf06d139e307d2f4e11b
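For orientation, a minimal sketch of how the new transforms are meant to be
driven from a denoiser (the function and gain buffer here are hypothetical;
the real call sites are in noise_model.c and noise_util.c):

/* Editorial sketch, not part of this change. Prototypes for the
 * aom_fft8x8_float/aom_ifft8x8_float RTCD entry points come from the
 * generated aom_dsp_rtcd.h (assumed). */
static void wiener_filter_block_8x8(const float *block, const float *gain,
                                    float *out) {
  float temp[2 * 8 * 8];  /* scratch; sized for the inverse pass too */
  float freq[2 * 8 * 8];  /* interleaved (real, imag) coefficient pairs */
  aom_fft8x8_float(block, temp, freq);
  for (int i = 0; i < 8 * 8; ++i) {
    freq[2 * i + 0] *= gain[i];  /* scale real part of bin i */
    freq[2 * i + 1] *= gain[i];  /* scale imaginary part of bin i */
  }
  aom_ifft8x8_float(freq, temp, out);
  /* Assuming neither direction normalizes, undo the n * n scaling. */
  for (int i = 0; i < 8 * 8; ++i) out[i] /= 8 * 8;
}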
Diffstat (limited to 'aom_dsp')
-rw-r--r-- | aom_dsp/aom_dsp.cmake        |    4
-rwxr-xr-x | aom_dsp/aom_dsp_rtcd_defs.pl |   28
-rw-r--r-- | aom_dsp/fft.c                |  219
-rw-r--r-- | aom_dsp/fft_common.h         | 1050
-rw-r--r-- | aom_dsp/noise_model.c        |  190
-rw-r--r-- | aom_dsp/noise_model.h        |   19
-rw-r--r-- | aom_dsp/noise_util.c         |  111
-rw-r--r-- | aom_dsp/noise_util.h         |   38
-rw-r--r-- | aom_dsp/x86/fft_avx2.c       |   73
-rw-r--r-- | aom_dsp/x86/fft_sse2.c       |  166
10 files changed, 1898 insertions, 0 deletions
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index a49177906..768875f7d 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -26,6 +26,8 @@ list(APPEND AOM_DSP_COMMON_SOURCES
"${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
"${AOM_ROOT}/aom_dsp/entcode.c"
"${AOM_ROOT}/aom_dsp/entcode.h"
+ "${AOM_ROOT}/aom_dsp/fft.c"
+ "${AOM_ROOT}/aom_dsp/fft_common.h"
"${AOM_ROOT}/aom_dsp/intrapred.c"
"${AOM_ROOT}/aom_dsp/intrapred_common.h"
"${AOM_ROOT}/aom_dsp/loopfilter.c"
@@ -54,6 +56,7 @@ list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
"${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
"${AOM_ROOT}/aom_dsp/x86/convolve.h"
"${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c"
@@ -81,6 +84,7 @@ list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
"${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
"${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h"
+ "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c")
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 0b23b954b..38084d4ae 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -484,6 +484,34 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes"){
add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/aom_highbd_fdct8x8 sse2/;
+ # FFT/IFFT (float) only used for denoising (and noise power spectral density estimation)
+ add_proto qw/void aom_fft2x2_float/, "const float *input, float *temp, float *output";
+
+ add_proto qw/void aom_fft4x4_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_fft4x4_float sse2/;
+
+ add_proto qw/void aom_fft8x8_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_fft8x8_float avx2 sse2/;
+
+ add_proto qw/void aom_fft16x16_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_fft16x16_float avx2 sse2/;
+
+ add_proto qw/void aom_fft32x32_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_fft32x32_float avx2 sse2/;
+
+ add_proto qw/void aom_ifft2x2_float/, "const float *input, float *temp, float *output";
+
+ add_proto qw/void aom_ifft4x4_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_ifft4x4_float sse2/;
+
+ add_proto qw/void aom_ifft8x8_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_ifft8x8_float avx2 sse2/;
+
+ add_proto qw/void aom_ifft16x16_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_ifft16x16_float avx2 sse2/;
+
+ add_proto qw/void aom_ifft32x32_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_ifft32x32_float avx2 sse2/;
} # CONFIG_AV1_ENCODER
#
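Each proto/specialize pair above is consumed by the RTCD generator: every
aom_fftNxN_float symbol becomes a dispatch target resolved to the C, SSE2,
or AVX2 body at runtime. A minimal caller sketch (the header name and the
aom_dsp_rtcd() initializer follow the usual RTCD convention and are
assumptions here):

#include "aom_dsp_rtcd.h" /* generated from aom_dsp_rtcd_defs.pl (assumed) */

void run_fft32(const float *in, float *temp, float *out) {
  aom_dsp_rtcd();                    /* resolve c/sse2/avx2 from CPU flags */
  aom_fft32x32_float(in, temp, out); /* calls the specialized body */
}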
diff --git a/aom_dsp/fft.c b/aom_dsp/fft.c
new file mode 100644
index 000000000..0ba71cfb3
--- /dev/null
+++ b/aom_dsp/fft.c
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+static INLINE void simple_transpose(const float *A, float *B, int n) {
+ for (int y = 0; y < n; y++) {
+ for (int x = 0; x < n; x++) {
+ B[y * n + x] = A[x * n + y];
+ }
+ }
+}
+
+// The 1d transform is real to complex and packs the complex results in
+// a way to take advantage of conjugate symmetry (i.e., the n/2 + 1 real
+// components, followed by the n/2 - 1 imaginary components). After the
+// transform is done on the rows, the first n/2 + 1 columns are real, and
+// the remaining are the imaginary components. After the transform on the
+// columns, the region of [0, n/2]x[0, n/2] contains the real part of
+// fft of the real columns. The real part of the 2d fft also includes the
+// imaginary part of transformed imaginary columns. This function assembles
+// the correct outputs while putting the real and imaginary components
+// next to each other.
+static INLINE void unpack_2d_output(const float *col_fft, float *output,
+ int n) {
+ for (int y = 0; y <= n / 2; ++y) {
+ const int y2 = y + n / 2;
+ const int y_extra = y2 > n / 2 && y2 < n;
+
+ for (int x = 0; x <= n / 2; ++x) {
+ const int x2 = x + n / 2;
+ const int x_extra = x2 > n / 2 && x2 < n;
+ output[2 * (y * n + x)] =
+ col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
+ output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) +
+ (x_extra ? col_fft[y * n + x2] : 0);
+ if (y_extra) {
+ output[2 * ((n - y) * n + x)] =
+ col_fft[y * n + x] +
+ (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
+ output[2 * ((n - y) * n + x) + 1] =
+ -(y_extra ? col_fft[y2 * n + x] : 0) +
+ (x_extra ? col_fft[y * n + x2] : 0);
+ }
+ }
+ }
+}
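/* Editorial illustration, not part of the patch: for n = 4, each 1d row
 * transform emits { Re(X0), Re(X1), Re(X2), Im(X1) }, i.e. n/2 + 1 = 3
 * real components followed by n/2 - 1 = 1 imaginary component. After the
 * column pass, unpack_2d_output() interleaves the full complex grid, so
 * bin (y, x) lands at:
 *   Re: output[2 * (y * n + x)]    Im: output[2 * (y * n + x) + 1]
 */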
+
+void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
+ aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
+ aom_fft_unpack_func_t unpack, int vec_size) {
+ for (int x = 0; x < n; x += vec_size) {
+ tform(input + x, output + x, n);
+ }
+ transpose(output, temp, n);
+
+ for (int x = 0; x < n; x += vec_size) {
+ tform(temp + x, output + x, n);
+ }
+ transpose(output, temp, n);
+
+ unpack(temp, output, n);
+}
+
+static INLINE void store_float(float *output, float input) { *output = input; }
+static INLINE float add_float(float a, float b) { return a + b; }
+static INLINE float sub_float(float a, float b) { return a - b; }
+static INLINE float mul_float(float a, float b) { return a * b; }
+
+GEN_FFT_2(void, float, float, float, *, store_float);
+GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float,
+ sub_float);
+GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float);
+GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float);
+GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float);
+
+void aom_fft2x2_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+void aom_fft4x4_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+void aom_fft8x8_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+void aom_fft16x16_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+void aom_fft32x32_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
+ aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
+ aom_fft_1d_func_t ifft_multi,
+ aom_fft_transpose_func_t transpose, int vec_size) {
+ // Column 0 and n/2 have conjugate symmetry, so we can directly do the ifft
+ // and get real outputs.
+ for (int y = 0; y <= n / 2; ++y) {
+ output[y * n] = input[2 * y * n];
+ output[y * n + 1] = input[2 * (y * n + n / 2)];
+ }
+ for (int y = n / 2 + 1; y < n; ++y) {
+ output[y * n] = input[2 * (y - n / 2) * n + 1];
+ output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1];
+ }
+
+ for (int i = 0; i < 2; i += vec_size) {
+ ifft_multi(output + i, temp + i, n);
+ }
+
+ // For the other columns, since we don't have a full ifft for complex inputs
+ // we have to split them into the real and imaginary counterparts.
+ // Pack the real component, then the imaginary components.
+ for (int y = 0; y < n; ++y) {
+ for (int x = 1; x < n / 2; ++x) {
+ output[y * n + (x + 1)] = input[2 * (y * n + x)];
+ }
+ for (int x = 1; x < n / 2; ++x) {
+ output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1];
+ }
+ }
+ for (int y = 2; y < vec_size; y++) {
+ fft_single(output + y, temp + y, n);
+ }
+ // This is the part that can be sped up with SIMD
+ for (int y = AOMMAX(2, vec_size); y < n; y += vec_size) {
+ fft_multi(output + y, temp + y, n);
+ }
+
+  // Put the 0th and (n/2)th results in the correct place.
+ for (int x = 0; x < n; ++x) {
+ output[x] = temp[x * n];
+ output[(n / 2) * n + x] = temp[x * n + 1];
+ }
+ // This rearranges and transposes.
+ for (int y = 1; y < n / 2; ++y) {
+ // Fill in the real columns
+ for (int x = 0; x <= n / 2; ++x) {
+ output[x + y * n] =
+ temp[(y + 1) + x * n] +
+ ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0);
+ }
+ for (int x = n / 2 + 1; x < n; ++x) {
+ output[x + y * n] = temp[(y + 1) + (n - x) * n] -
+ temp[(y + n / 2) + ((n - x) + n / 2) * n];
+ }
+ // Fill in the imag columns
+ for (int x = 0; x <= n / 2; ++x) {
+ output[x + (y + n / 2) * n] =
+ temp[(y + n / 2) + x * n] -
+ ((x > 0 && x < n / 2) ? temp[(y + 1) + (x + n / 2) * n] : 0);
+ }
+ for (int x = n / 2 + 1; x < n; ++x) {
+ output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] +
+ temp[(y + n / 2) + (n - x) * n];
+ }
+ }
+ for (int y = 0; y < n; y += vec_size) {
+ ifft_multi(output + y, temp + y, n);
+ }
+ transpose(temp, output, n);
+}
+
+GEN_IFFT_2(void, float, float, float, *, store_float);
+GEN_IFFT_4(void, float, float, float, *, store_float, (float), add_float,
+ sub_float);
+GEN_IFFT_8(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float);
+GEN_IFFT_16(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float);
+GEN_IFFT_32(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float);
+
+void aom_ifft2x2_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, aom_fft1d_2_float,
+ aom_ifft1d_2_float, simple_transpose, 1);
+}
+
+void aom_ifft4x4_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_float,
+ aom_ifft1d_4_float, simple_transpose, 1);
+}
+
+void aom_ifft8x8_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_float,
+ aom_ifft1d_8_float, simple_transpose, 1);
+}
+
+void aom_ifft16x16_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+ aom_fft1d_16_float, aom_ifft1d_16_float, simple_transpose, 1);
+}
+
+void aom_ifft32x32_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+ aom_fft1d_32_float, aom_ifft1d_32_float, simple_transpose, 1);
+}
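A quick sanity check for the transform pairs above (a test sketch, assuming
ifft(fft(x)) returns (n * n) * x, i.e. neither direction normalizes):

#include <assert.h>
#include <math.h>
#include <stdlib.h>

/* Editorial sketch: round-trip the 4x4 C transforms defined above. */
static void check_fft4x4_roundtrip(void) {
  float in[4 * 4], temp[2 * 4 * 4], freq[2 * 4 * 4], out[4 * 4];
  for (int i = 0; i < 16; ++i) in[i] = (float)(rand() % 256);
  aom_fft4x4_float_c(in, temp, freq);   /* freq: 2 * n * n packed floats */
  aom_ifft4x4_float_c(freq, temp, out); /* out: n * n real samples */
  for (int i = 0; i < 16; ++i) {
    /* Assumes the unnormalized inverse: out == 16 * in up to float error. */
    assert(fabsf(out[i] / 16.0f - in[i]) < 1e-3f);
  }
}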
diff --git a/aom_dsp/fft_common.h b/aom_dsp/fft_common.h
new file mode 100644
index 000000000..2f3cd5fdc
--- /dev/null
+++ b/aom_dsp/fft_common.h
@@ -0,0 +1,1050 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_FFT_COMMON_H_
+#define AOM_DSP_FFT_COMMON_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief A function pointer for computing 1d fft and ifft.
+ *
+ * The function will point to an implementation for a specific transform size,
+ * and may perform the transforms using vectorized instructions.
+ *
+ * For a non-vectorized forward transform of size n, the input and output
+ * buffers will be size n. The output takes advantage of conjugate symmetry and
+ * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where
+ * (r_{j}, i_{j}) is the complex output for index j.
+ *
+ * An inverse transform will assume that the complex "input" is packed
+ * similarly. Its output will be real.
+ *
+ * Non-vectorized transforms (e.g., on a single row) would use a stride of 1.
+ *
+ * Vectorized implementations are parallelized along the columns so that the fft
+ * can be performed on multiple columns at a time. In such cases the data block
+ * for input and output is typically square (n x n) and the stride will
+ * correspond to the spacing between rows. At minimum, the input size must be
+ * n x simd_vector_length.
+ *
+ * \param[in] input Input buffer. See above for size restrictions.
+ * \param[out] output Output buffer. See above for size restrictions.
+ * \param[in] stride The spacing in number of elements between rows
+ * (or elements)
+ */
+typedef void (*aom_fft_1d_func_t)(const float *input, float *output,
+ int stride);
+
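/* Editorial illustration: on a row-major n x n block, samples of one
 * column are n floats apart, so a column pass calls the 1d kernel with
 * stride = n. A 4-wide vectorized kernel (e.g. SSE2 floats, assumed
 * here) transforms 4 adjacent columns per call:
 *   for (int x = 0; x < n; x += 4) tform(input + x, output + x, n);
 */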
+// Declare some of the forward non-vectorized transforms which are used in some
+// of the vectorized implementations
+void aom_fft1d_4_float(const float *input, float *output, int stride);
+void aom_fft1d_8_float(const float *input, float *output, int stride);
+void aom_fft1d_16_float(const float *input, float *output, int stride);
+void aom_fft1d_32_float(const float *input, float *output, int stride);
+
+/*!\brief Function pointer for transposing a matrix of floats.
+ *
+ * \param[in] input Input buffer (size n x n)
+ * \param[out] output Output buffer (size n x n)
+ * \param[in] n Extent of one dimension of the square matrix.
+ */
+typedef void (*aom_fft_transpose_func_t)(const float *input, float *output,
+ int n);
+
+/*!\brief Function pointer for re-arranging intermediate 2d transform results.
+ *
+ * After re-arrangement, the real and imaginary components will be packed
+ * tightly next to each other.
+ *
+ * \param[in] input Input buffer (size n x n)
+ * \param[out] output Output buffer (size 2 x n x n)
+ * \param[in] n Extent of one dimension of the square matrix.
+ */
+typedef void (*aom_fft_unpack_func_t)(const float *input, float *output, int n);
+
+/*!\brief Performs a 2d fft with the given functions.
+ *
+ * This generator function allows for multiple different implementations of 2d
+ * fft with different vector operations, without having to redefine the main
+ * body multiple times.
+ *
+ * \param[in] input Input buffer to run the transform on (size n x n)
+ * \param[out] temp Working buffer for computing the transform (size n x n)
+ * \param[out] output Output buffer (size 2 x n x n)
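+ * \param[in] n Extent of one dimension of the square (n x n) transform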
+ * \param[in] tform Forward transform function
+ * \param[in] transpose Transpose function (for n x n matrix)
+ * \param[in] unpack Unpack function used to massage outputs to correct form
+ * \param[in] vec_size Vector size (the transform is done vec_size units at
+ * a time)
+ */
+void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
+ aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
+ aom_fft_unpack_func_t unpack, int vec_size);
+
+/*!\brief Perform a 2d inverse fft with the given helper functions
+ *
+ * \param[in] input Input buffer to run the transform on (size 2 x n x n)
+ * \param[out] temp Working buffer for computations (size 2 x n x n)
+ * \param[out] output Output buffer (size n x n)
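+ * \param[in] n Extent of one dimension of the square (n x n) transform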
+ * \param[in] fft_single Forward transform function (non vectorized)
+ * \param[in] fft_multi Forward transform function (vectorized)
+ * \param[in] ifft_multi Inverse transform function (vectorized)
+ * \param[in] transpose Transpose function (for n x n matrix)
+ * \param[in] vec_size Vector size (the transform is done vec_size
+ * units at a time)
+ */
+void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
+ aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
+ aom_fft_1d_func_t ifft_multi,
+ aom_fft_transpose_func_t transpose, int vec_size);
+#ifdef __cplusplus
+}
+#endif
+
+// The macros below define 1D fft/ifft for different data types and for
+// different simd vector intrinsic types.
+
+#define GEN_FFT_2(ret, suffix, T, T_VEC, load, store) \
+ ret aom_fft1d_2_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ store(output + 0 * stride, i0 + i1); \
+ store(output + 1 * stride, i0 - i1); \
+ }
+
+#define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
+ ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC w0 = add(i0, i2); \
+ const T_VEC w1 = sub(i0, i2); \
+ const T_VEC w2 = add(i1, i3); \
+ const T_VEC w3 = sub(i1, i3); \
+ store(output + 0 * stride, add(w0, w2)); \
+ store(output + 1 * stride, w1); \
+ store(output + 2 * stride, sub(w0, w2)); \
+ store(output + 3 * stride, sub(kWeight0, w3)); \
+ }
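/* Editorial check, not from the source: with w0 = x0 + x2, w1 = x0 - x2,
 * w2 = x1 + x3, w3 = x1 - x3, the DFT X_k = sum_n x_n e^{-2 pi j k n / 4}
 * gives
 *   X_0 = w_0 + w_2           -> output[0]
 *   X_1 = w_1 - j w_3         -> output[1] = w_1, output[3] = -w_3
 *   X_2 = w_0 - w_2           -> output[2]
 * which matches the packed [r_0, r_1, r_2, i_1] layout described above.
 */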
+
+#define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \
+ ret aom_fft1d_8_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC w0 = add(i0, i4); \
+ const T_VEC w1 = sub(i0, i4); \
+ const T_VEC w2 = add(i2, i6); \
+ const T_VEC w3 = sub(i2, i6); \
+ const T_VEC w4 = add(w0, w2); \
+ const T_VEC w5 = sub(w0, w2); \
+ const T_VEC w7 = add(i1, i5); \
+ const T_VEC w8 = sub(i1, i5); \
+ const T_VEC w9 = add(i3, i7); \
+ const T_VEC w10 = sub(i3, i7); \
+ const T_VEC w11 = add(w7, w9); \
+ const T_VEC w12 = sub(w7, w9); \
+ store(output + 0 * stride, add(w4, w11)); \
+ store(output + 1 * stride, add(w1, mul(kWeight2, sub(w8, w10)))); \
+ store(output + 2 * stride, w5); \
+ store(output + 3 * stride, sub(w1, mul(kWeight2, sub(w8, w10)))); \
+ store(output + 4 * stride, sub(w4, w11)); \
+ store(output + 5 * stride, \
+ sub(sub(kWeight0, w3), mul(kWeight2, add(w10, w8)))); \
+ store(output + 6 * stride, sub(kWeight0, w12)); \
+ store(output + 7 * stride, sub(w3, mul(kWeight2, add(w10, w8)))); \
+ }
+
+#define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) \
+ ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC kWeight3 = constant(0.92388f); \
+ const T_VEC kWeight4 = constant(0.382683f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC i8 = load(input + 8 * stride); \
+ const T_VEC i9 = load(input + 9 * stride); \
+ const T_VEC i10 = load(input + 10 * stride); \
+ const T_VEC i11 = load(input + 11 * stride); \
+ const T_VEC i12 = load(input + 12 * stride); \
+ const T_VEC i13 = load(input + 13 * stride); \
+ const T_VEC i14 = load(input + 14 * stride); \
+ const T_VEC i15 = load(input + 15 * stride); \
+ const T_VEC w0 = add(i0, i8); \
+ const T_VEC w1 = sub(i0, i8); \
+ const T_VEC w2 = add(i4, i12); \
+ const T_VEC w3 = sub(i4, i12); \
+ const T_VEC w4 = add(w0, w2); \
+ const T_VEC w5 = sub(w0, w2); \
+ const T_VEC w7 = add(i2, i10); \
+ const T_VEC w8 = sub(i2, i10); \
+ const T_VEC w9 = add(i6, i14); \
+ const T_VEC w10 = sub(i6, i14); \
+ const T_VEC w11 = add(w7, w9); \
+ const T_VEC w12 = sub(w7, w9); \
+ const T_VEC w14 = add(w4, w11); \
+ const T_VEC w15 = sub(w4, w11); \
+ const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \
+ sub(sub(kWeight0, w3), \
+ mul(kWeight2, add(w10, w8))) }; \
+ const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \
+ sub(w3, mul(kWeight2, add(w10, w8))) }; \
+ const T_VEC w19 = add(i1, i9); \
+ const T_VEC w20 = sub(i1, i9); \
+ const T_VEC w21 = add(i5, i13); \
+ const T_VEC w22 = sub(i5, i13); \
+ const T_VEC w23 = add(w19, w21); \
+ const T_VEC w24 = sub(w19, w21); \
+ const T_VEC w26 = add(i3, i11); \
+ const T_VEC w27 = sub(i3, i11); \
+ const T_VEC w28 = add(i7, i15); \
+ const T_VEC w29 = sub(i7, i15); \
+ const T_VEC w30 = add(w26, w28); \
+ const T_VEC w31 = sub(w26, w28); \
+ const T_VEC w33 = add(w23, w30); \
+ const T_VEC w34 = sub(w23, w30); \
+ const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \
+ sub(sub(kWeight0, w22), \
+ mul(kWeight2, add(w29, w27))) }; \
+ const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \
+ sub(w22, mul(kWeight2, add(w29, w27))) }; \
+ store(output + 0 * stride, add(w14, w33)); \
+ store(output + 1 * stride, \
+ add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1])))); \
+ store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31)))); \
+ store(output + 3 * stride, \
+ add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1])))); \
+ store(output + 4 * stride, w15); \
+ store(output + 5 * stride, \
+ add(w18[0], sub(sub(kWeight0, mul(kWeight4, w37[0])), \
+ mul(kWeight3, w37[1])))); \
+ store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31)))); \
+ store(output + 7 * stride, \
+ add(w16[0], sub(sub(kWeight0, mul(kWeight3, w35[0])), \
+ mul(kWeight4, w35[1])))); \
+ store(output + 8 * stride, sub(w14, w33)); \
+ store(output + 9 * stride, \
+ add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))); \
+ store(output + 10 * stride, \
+ sub(sub(kWeight0, w12), mul(kWeight2, add(w31, w24)))); \
+ store(output + 11 * stride, \
+ add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))); \
+ store(output + 12 * stride, sub(kWeight0, w34)); \
+ store(output + 13 * stride, \
+ sub(sub(kWeight0, w18[1]), \
+ sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1])))); \
+ store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24)))); \
+ store(output + 15 * stride, \
+ sub(sub(kWeight0, w16[1]), \
+ sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1])))); \
+ }
+
+#define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) \
+ ret aom_fft1d_32_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC kWeight3 = constant(0.92388f); \
+ const T_VEC kWeight4 = constant(0.382683f); \
+ const T_VEC kWeight5 = constant(0.980785f); \
+ const T_VEC kWeight6 = constant(0.19509f); \
+ const T_VEC kWeight7 = constant(0.83147f); \
+ const T_VEC kWeight8 = constant(0.55557f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC i8 = load(input + 8 * stride); \
+ const T_VEC i9 = load(input + 9 * stride); \
+ const T_VEC i10 = load(input + 10 * stride); \
+ const T_VEC i11 = load(input + 11 * stride); \
+ const T_VEC i12 = load(input + 12 * stride); \
+ const T_VEC i13 = load(input + 13 * stride); \
+ const T_VEC i14 = load(input + 14 * stride); \
+ const T_VEC i15 = load(input + 15 * stride); \
+ const T_VEC i16 = load(input + 16 * stride); \
+ const T_VEC i17 = load(input + 17 * stride); \
+ const T_VEC i18 = load(input + 18 * stride); \
+ const T_VEC i19 = load(input + 19 * stride); \
+ const T_VEC i20 = load(input + 20 * stride); \
+ const T_VEC i21 = load(input + 21 * stride); \
+ const T_VEC i22 = load(input + 22 * stride); \
+ const T_VEC i23 = load(input + 23 * stride); \
+ const T_VEC i24 = load(input + 24 * stride); \
+ const T_VEC i25 = load(input + 25 * stride); \
+ const T_VEC i26 = load(input + 26 * stride); \
+ const T_VEC i27 = load(input + 27 * stride); \
+ const T_VEC i28 = load(input + 28 * stride); \
+ const T_VEC i29 = load(input + 29 * stride); \
+ const T_VEC i30 = load(input + 30 * stride); \
+ const T_VEC i31 = load(input + 31 * stride); \
+ const T_VEC w0 = add(i0, i16); \
+ const T_VEC w1 = sub(i0, i16); \
+ const T_VEC w2 = add(i8, i24); \
+ const T_VEC w3 = sub(i8, i24); \
+ const T_VEC w4 = add(w0, w2); \
+ const T_VEC w5 = sub(w0, w2); \
+ const T_VEC w7 = add(i4, i20); \
+ const T_VEC w8 = sub(i4, i20); \
+ const T_VEC w9 = add(i12, i28); \
+ const T_VEC w10 = sub(i12, i28); \
+ const T_VEC w11 = add(w7, w9); \
+ const T_VEC w12 = sub(w7, w9); \
+ const T_VEC w14 = add(w4, w11); \
+ const T_VEC w15 = sub(w4, w11); \
+ const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \
+ sub(sub(kWeight0, w3), \
+ mul(kWeight2, add(w10, w8))) }; \
+ const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \
+ sub(w3, mul(kWeight2, add(w10, w8))) }; \
+ const T_VEC w19 = add(i2, i18); \
+ const T_VEC w20 = sub(i2, i18); \
+ const T_VEC w21 = add(i10, i26); \
+ const T_VEC w22 = sub(i10, i26); \
+ const T_VEC w23 = add(w19, w21); \
+ const T_VEC w24 = sub(w19, w21); \
+ const T_VEC w26 = add(i6, i22); \
+ const T_VEC w27 = sub(i6, i22); \
+ const T_VEC w28 = add(i14, i30); \
+ const T_VEC w29 = sub(i14, i30); \
+ const T_VEC w30 = add(w26, w28); \
+ const T_VEC w31 = sub(w26, w28); \
+ const T_VEC w33 = add(w23, w30); \
+ const T_VEC w34 = sub(w23, w30); \
+ const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \
+ sub(sub(kWeight0, w22), \
+ mul(kWeight2, add(w29, w27))) }; \
+ const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \
+ sub(w22, mul(kWeight2, add(w29, w27))) }; \
+ const T_VEC w38 = add(w14, w33); \
+ const T_VEC w39 = sub(w14, w33); \
+ const T_VEC w40[2] = { \
+ add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1]))), \
+ add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0]))) \
+ }; \
+ const T_VEC w41[2] = { add(w5, mul(kWeight2, sub(w24, w31))), \
+ sub(sub(kWeight0, w12), \
+ mul(kWeight2, add(w31, w24))) }; \
+ const T_VEC w42[2] = { \
+ add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1]))), \
+ add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0]))) \
+ }; \
+ const T_VEC w44[2] = { \
+ add(w18[0], \
+ sub(sub(kWeight0, mul(kWeight4, w37[0])), mul(kWeight3, w37[1]))), \
+ sub(sub(kWeight0, w18[1]), \
+ sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))) \
+ }; \
+ const T_VEC w45[2] = { sub(w5, mul(kWeight2, sub(w24, w31))), \
+ sub(w12, mul(kWeight2, add(w31, w24))) }; \
+ const T_VEC w46[2] = { \
+ add(w16[0], \
+ sub(sub(kWeight0, mul(kWeight3, w35[0])), mul(kWeight4, w35[1]))), \
+ sub(sub(kWeight0, w16[1]), \
+ sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))) \
+ }; \
+ const T_VEC w47 = add(i1, i17); \
+ const T_VEC w48 = sub(i1, i17); \
+ const T_VEC w49 = add(i9, i25); \
+ const T_VEC w50 = sub(i9, i25); \
+ const T_VEC w51 = add(w47, w49); \
+ const T_VEC w52 = sub(w47, w49); \
+ const T_VEC w54 = add(i5, i21); \
+ const T_VEC w55 = sub(i5, i21); \
+ const T_VEC w56 = add(i13, i29); \
+ const T_VEC w57 = sub(i13, i29); \
+ const T_VEC w58 = add(w54, w56); \
+ const T_VEC w59 = sub(w54, w56); \
+ const T_VEC w61 = add(w51, w58); \
+ const T_VEC w62 = sub(w51, w58); \
+ const T_VEC w63[2] = { add(w48, mul(kWeight2, sub(w55, w57))), \
+ sub(sub(kWeight0, w50), \
+ mul(kWeight2, add(w57, w55))) }; \
+ const T_VEC w65[2] = { sub(w48, mul(kWeight2, sub(w55, w57))), \
+ sub(w50, mul(kWeight2, add(w57, w55))) }; \
+ const T_VEC w66 = add(i3, i19); \
+ const T_VEC w67 = sub(i3, i19); \
+ const T_VEC w68 = add(i11, i27); \
+ const T_VEC w69 = sub(i11, i27); \
+ const T_VEC w70 = add(w66, w68); \
+ const T_VEC w71 = sub(w66, w68); \
+ const T_VEC w73 = add(i7, i23); \
+ const T_VEC w74 = sub(i7, i23); \
+ const T_VEC w75 = add(i15, i31); \
+ const T_VEC w76 = sub(i15, i31); \
+ const T_VEC w77 = add(w73, w75); \
+ const T_VEC w78 = sub(w73, w75); \
+ const T_VEC w80 = add(w70, w77); \
+ const T_VEC w81 = sub(w70, w77); \
+ const T_VEC w82[2] = { add(w67, mul(kWeight2, sub(w74, w76))), \
+ sub(sub(kWeight0, w69), \
+ mul(kWeight2, add(w76, w74))) }; \
+ const T_VEC w84[2] = { sub(w67, mul(kWeight2, sub(w74, w76))), \
+ sub(w69, mul(kWeight2, add(w76, w74))) }; \
+ const T_VEC w85 = add(w61, w80); \
+ const T_VEC w86 = sub(w61, w80); \
+ const T_VEC w87[2] = { \
+ add(w63[0], add(mul(kWeight3, w82[0]), mul(kWeight4, w82[1]))), \
+ add(w63[1], sub(mul(kWeight3, w82[1]), mul(kWeight4, w82[0]))) \
+ }; \
+ const T_VEC w88[2] = { add(w52, mul(kWeight2, sub(w71, w78))), \
+ sub(sub(kWeight0, w59), \
+ mul(kWeight2, add(w78, w71))) }; \
+ const T_VEC w89[2] = { \
+ add(w65[0], add(mul(kWeight4, w84[0]), mul(kWeight3, w84[1]))), \
+ add(w65[1], sub(mul(kWeight4, w84[1]), mul(kWeight3, w84[0]))) \
+ }; \
+ const T_VEC w91[2] = { \
+ add(w65[0], \
+ sub(sub(kWeight0, mul(kWeight4, w84[0])), mul(kWeight3, w84[1]))), \
+ sub(sub(kWeight0, w65[1]), \
+ sub(mul(kWeight3, w84[0]), mul(kWeight4, w84[1]))) \
+ }; \
+ const T_VEC w92[2] = { sub(w52, mul(kWeight2, sub(w71, w78))), \
+ sub(w59, mul(kWeight2, add(w78, w71))) }; \
+ const T_VEC w93[2] = { \
+ add(w63[0], \
+ sub(sub(kWeight0, mul(kWeight3, w82[0])), mul(kWeight4, w82[1]))), \
+ sub(sub(kWeight0, w63[1]), \
+ sub(mul(kWeight4, w82[0]), mul(kWeight3, w82[1]))) \
+ }; \
+ store(output + 0 * stride, add(w38, w85)); \
+ store(output + 1 * stride, \
+ add(w40[0], add(mul(kWeight5, w87[0]), mul(kWeight6, w87[1])))); \
+ store(output + 2 * stride, \
+ add(w41[0], add(mul(kWeight3, w88[0]), mul(kWeight4, w88[1])))); \
+ store(output + 3 * stride, \
+ add(w42[0], add(mul(kWeight7, w89[0]), mul(kWeight8, w89[1])))); \
+ store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81)))); \
+ store(output + 5 * stride, \
+ add(w44[0], add(mul(kWeight8, w91[0]), mul(kWeight7, w91[1])))); \
+ store(output + 6 * stride, \
+ add(w45[0], add(mul(kWeight4, w92[0]), mul(kWeight3, w92[1])))); \
+ store(output + 7 * stride, \
+ add(w46[0], add(mul(kWeight6, w93[0]), mul(kWeight5, w93[1])))); \
+ store(output + 8 * stride, w39); \
+ store(output + 9 * stride, \
+ add(w46[0], sub(sub(kWeight0, mul(kWeight6, w93[0])), \
+ mul(kWeight5, w93[1])))); \
+ store(output + 10 * stride, \
+ add(w45[0], sub(sub(kWeight0, mul(kWeight4, w92[0])), \
+ mul(kWeight3, w92[1])))); \
+ store(output + 11 * stride, \
+ add(w44[0], sub(sub(kWeight0, mul(kWeight8, w91[0])), \
+ mul(kWeight7, w91[1])))); \
+ store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81)))); \
+ store(output + 13 * stride, \
+ add(w42[0], sub(sub(kWeight0, mul(kWeight7, w89[0])), \
+ mul(kWeight8, w89[1])))); \
+ store(output + 14 * stride, \
+ add(w41[0], sub(sub(kWeight0, mul(kWeight3, w88[0])), \
+ mul(kWeight4, w88[1])))); \
+ store(output + 15 * stride, \
+ add(w40[0], sub(sub(kWeight0, mul(kWeight5, w87[0])), \
+ mul(kWeight6, w87[1])))); \
+ store(output + 16 * stride, sub(w38, w85)); \
+ store(output + 17 * stride, \
+ add(w40[1], sub(mul(kWeight5, w87[1]), mul(kWeight6, w87[0])))); \
+ store(output + 18 * stride, \
+ add(w41[1], sub(mul(kWeight3, w88[1]), mul(kWeight4, w88[0])))); \
+ store(output + 19 * stride, \
+ add(w42[1], sub(mul(kWeight7, w89[1]), mul(kWeight8, w89[0])))); \
+ store(output + 20 * stride, \
+ sub(sub(kWeight0, w34), mul(kWeight2, add(w81, w62)))); \
+ store(output + 21 * stride, \
+ add(w44[1], sub(mul(kWeight8, w91[1]), mul(kWeight7, w91[0])))); \
+ store(output + 22 * stride, \
+ add(w45[1], sub(mul(kWeight4, w92[1]), mul(kWeight3, w92[0])))); \
+ store(output + 23 * stride, \
+ add(w46[1], sub(mul(kWeight6, w93[1]), mul(kWeight5, w93[0])))); \
+ store(output + 24 * stride, sub(kWeight0, w86)); \
+ store(output + 25 * stride, \
+ sub(sub(kWeight0, w46[1]), \
+ sub(mul(kWeight5, w93[0]), mul(kWeight6, w93[1])))); \
+ store(output + 26 * stride, \
+ sub(sub(kWeight0, w45[1]), \
+ sub(mul(kWeight3, w92[0]), mul(kWeight4, w92[1])))); \
+ store(output + 27 * stride, \
+ sub(sub(kWeight0, w44[1]), \
+ sub(mul(kWeight7, w91[0]), mul(kWeight8, w91[1])))); \
+ store(output + 28 * stride, sub(w34, mul(kWeight2, add(w81, w62)))); \
+ store(output + 29 * stride, \
+ sub(sub(kWeight0, w42[1]), \
+ sub(mul(kWeight8, w89[0]), mul(kWeight7, w89[1])))); \
+ store(output + 30 * stride, \
+ sub(sub(kWeight0, w41[1]), \
+ sub(mul(kWeight4, w88[0]), mul(kWeight3, w88[1])))); \
+ store(output + 31 * stride, \
+ sub(sub(kWeight0, w40[1]), \
+ sub(mul(kWeight6, w87[0]), mul(kWeight5, w87[1])))); \
+ }
+
+#define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store) \
+ ret aom_ifft1d_2_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ store(output + 0 * stride, i0 + i1); \
+ store(output + 1 * stride, i0 - i1); \
+ }
+
+#define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
+ ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC w2 = add(i0, i2); \
+ const T_VEC w3 = sub(i0, i2); \
+ const T_VEC w4[2] = { add(i1, i1), sub(i3, i3) }; \
+ const T_VEC w5[2] = { sub(i1, i1), sub(sub(kWeight0, i3), i3) }; \
+ store(output + 0 * stride, add(w2, w4[0])); \
+ store(output + 1 * stride, add(w3, w5[1])); \
+ store(output + 2 * stride, sub(w2, w4[0])); \
+ store(output + 3 * stride, sub(w3, w5[1])); \
+ }
+
+#define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) \
+ ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC w6 = add(i0, i4); \
+ const T_VEC w7 = sub(i0, i4); \
+ const T_VEC w8[2] = { add(i2, i2), sub(i6, i6) }; \
+ const T_VEC w9[2] = { sub(i2, i2), sub(sub(kWeight0, i6), i6) }; \
+ const T_VEC w10[2] = { add(w6, w8[0]), w8[1] }; \
+ const T_VEC w11[2] = { sub(w6, w8[0]), sub(kWeight0, w8[1]) }; \
+ const T_VEC w12[2] = { add(w7, w9[1]), sub(kWeight0, w9[0]) }; \
+ const T_VEC w13[2] = { sub(w7, w9[1]), w9[0] }; \
+ const T_VEC w14[2] = { add(i1, i3), sub(i7, i5) }; \
+ const T_VEC w15[2] = { sub(i1, i3), sub(sub(kWeight0, i5), i7) }; \
+ const T_VEC w16[2] = { add(i3, i1), sub(i5, i7) }; \
+ const T_VEC w17[2] = { sub(i3, i1), sub(sub(kWeight0, i7), i5) }; \
+ const T_VEC w18[2] = { add(w14[0], w16[0]), add(w14[1], w16[1]) }; \
+ const T_VEC w19[2] = { sub(w14[0], w16[0]), sub(w14[1], w16[1]) }; \
+ const T_VEC w20[2] = { add(w15[0], w17[1]), sub(w15[1], w17[0]) }; \
+ const T_VEC w21[2] = { sub(w15[0], w17[1]), add(w15[1], w17[0]) }; \
+ store(output + 0 * stride, add(w10[0], w18[0])); \
+ store(output + 1 * stride, \
+ add(w12[0], mul(kWeight2, add(w20[0], w20[1])))); \
+ store(output + 2 * stride, add(w11[0], w19[1])); \
+ store(output + 3 * stride, \
+ sub(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \
+ store(output + 4 * stride, sub(w10[0], w18[0])); \
+ store(output + 5 * stride, \
+ add(w12[0], sub(sub(kWeight0, mul(kWeight2, w20[0])), \
+ mul(kWeight2, w20[1])))); \
+ store(output + 6 * stride, sub(w11[0], w19[1])); \
+ store(output + 7 * stride, \
+ add(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \
+ }
+
+#define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) \
+ ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC kWeight3 = constant(0.92388f); \
+ const T_VEC kWeight4 = constant(0.382683f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC i8 = load(input + 8 * stride); \
+ const T_VEC i9 = load(input + 9 * stride); \
+ const T_VEC i10 = load(input + 10 * stride); \
+ const T_VEC i11 = load(input + 11 * stride); \
+ const T_VEC i12 = load(input + 12 * stride); \
+ const T_VEC i13 = load(input + 13 * stride); \
+ const T_VEC i14 = load(input + 14 * stride); \
+ const T_VEC i15 = load(input + 15 * stride); \
+ const T_VEC w14 = add(i0, i8); \
+ const T_VEC w15 = sub(i0, i8); \
+ const T_VEC w16[2] = { add(i4, i4), sub(i12, i12) }; \
+ const T_VEC w17[2] = { sub(i4, i4), sub(sub(kWeight0, i12), i12) }; \
+ const T_VEC w18[2] = { add(w14, w16[0]), w16[1] }; \
+ const T_VEC w19[2] = { sub(w14, w16[0]), sub(kWeight0, w16[1]) }; \
+ const T_VEC w20[2] = { add(w15, w17[1]), sub(kWeight0, w17[0]) }; \
+ const T_VEC w21[2] = { sub(w15, w17[1]), w17[0] }; \
+ const T_VEC w22[2] = { add(i2, i6), sub(i14, i10) }; \
+ const T_VEC w23[2] = { sub(i2, i6), sub(sub(kWeight0, i10), i14) }; \
+ const T_VEC w24[2] = { add(i6, i2), sub(i10, i14) }; \
+ const T_VEC w25[2] = { sub(i6, i2), sub(sub(kWeight0, i14), i10) }; \
+ const T_VEC w26[2] = { add(w22[0], w24[0]), add(w22[1], w24[1]) }; \
+ const T_VEC w27[2] = { sub(w22[0], w24[0]), sub(w22[1], w24[1]) }; \
+ const T_VEC w28[2] = { add(w23[0], w25[1]), sub(w23[1], w25[0]) }; \
+ const T_VEC w29[2] = { sub(w23[0], w25[1]), add(w23[1], w25[0]) }; \
+ const T_VEC w30[2] = { add(w18[0], w26[0]), add(w18[1], w26[1]) }; \
+ const T_VEC w31[2] = { sub(w18[0], w26[0]), sub(w18[1], w26[1]) }; \
+ const T_VEC w32[2] = { add(w20[0], mul(kWeight2, add(w28[0], w28[1]))), \
+ add(w20[1], mul(kWeight2, sub(w28[1], w28[0]))) }; \
+ const T_VEC w33[2] = { add(w20[0], \
+ sub(sub(kWeight0, mul(kWeight2, w28[0])), \
+ mul(kWeight2, w28[1]))), \
+ add(w20[1], mul(kWeight2, sub(w28[0], w28[1]))) }; \
+ const T_VEC w34[2] = { add(w19[0], w27[1]), sub(w19[1], w27[0]) }; \
+ const T_VEC w35[2] = { sub(w19[0], w27[1]), add(w19[1], w27[0]) }; \
+ const T_VEC w36[2] = { sub(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \
+ sub(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
+ const T_VEC w37[2] = { add(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \
+ add(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
+ const T_VEC w38[2] = { add(i1, i7), sub(i15, i9) }; \
+ const T_VEC w39[2] = { sub(i1, i7), sub(sub(kWeight0, i9), i15) }; \
+ const T_VEC w40[2] = { add(i5, i3), sub(i11, i13) }; \
+ const T_VEC w41[2] = { sub(i5, i3), sub(sub(kWeight0, i13), i11) }; \
+ const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \
+ const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \
+ const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \
+ const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \
+ const T_VEC w46[2] = { add(i3, i5), sub(i13, i11) }; \
+ const T_VEC w47[2] = { sub(i3, i5), sub(sub(kWeight0, i11), i13) }; \
+ const T_VEC w48[2] = { add(i7, i1), sub(i9, i15) }; \
+ const T_VEC w49[2] = { sub(i7, i1), sub(sub(kWeight0, i15), i9) }; \
+ const T_VEC w50[2] = { add(w46[0], w48[0]), add(w46[1], w48[1]) }; \
+ const T_VEC w51[2] = { sub(w46[0], w48[0]), sub(w46[1], w48[1]) }; \
+ const T_VEC w52[2] = { add(w47[0], w49[1]), sub(w47[1], w49[0]) }; \
+ const T_VEC w53[2] = { sub(w47[0], w49[1]), add(w47[1], w49[0]) }; \
+ const T_VEC w54[2] = { add(w42[0], w50[0]), add(w42[1], w50[1]) }; \
+ const T_VEC w55[2] = { sub(w42[0], w50[0]), sub(w42[1], w50[1]) }; \
+ const T_VEC w56[2] = { add(w44[0], mul(kWeight2, add(w52[0], w52[1]))), \
+ add(w44[1], mul(kWeight2, sub(w52[1], w52[0]))) }; \
+ const T_VEC w57[2] = { add(w44[0], \
+ sub(sub(kWeight0, mul(kWeight2, w52[0])), \
+ mul(kWeight2, w52[1]))), \
+ add(w44[1], mul(kWeight2, sub(w52[0], w52[1]))) }; \
+ const T_VEC w58[2] = { add(w43[0], w51[1]), sub(w43[1], w51[0]) }; \
+ const T_VEC w59[2] = { sub(w43[0], w51[1]), add(w43[1], w51[0]) }; \
+ const T_VEC w60[2] = { sub(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \
+ sub(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
+ const T_VEC w61[2] = { add(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \
+ add(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
+ store(output + 0 * stride, add(w30[0], w54[0])); \
+ store(output + 1 * stride, \
+ add(w32[0], add(mul(kWeight3, w56[0]), mul(kWeight4, w56[1])))); \
+ store(output + 2 * stride, \
+ add(w34[0], mul(kWeight2, add(w58[0], w58[1])))); \
+ store(output + 3 * stride, \
+ add(w36[0], add(mul(kWeight4, w60[0]), mul(kWeight3, w60[1])))); \
+ store(output + 4 * stride, add(w31[0], w55[1])); \
+ store(output + 5 * stride, \
+ sub(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \
+ store(output + 6 * stride, \
+ sub(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \
+ store(output + 7 * stride, \
+ sub(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \
+ store(output + 8 * stride, sub(w30[0], w54[0])); \
+ store(output + 9 * stride, \
+ add(w32[0], sub(sub(kWeight0, mul(kWeight3, w56[0])), \
+ mul(kWeight4, w56[1])))); \
+ store(output + 10 * stride, \
+ add(w34[0], sub(sub(kWeight0, mul(kWeight2, w58[0])), \
+ mul(kWeight2, w58[1])))); \
+ store(output + 11 * stride, \
+ add(w36[0], sub(sub(kWeight0, mul(kWeight4, w60[0])), \
+ mul(kWeight3, w60[1])))); \
+ store(output + 12 * stride, sub(w31[0], w55[1])); \
+ store(output + 13 * stride, \
+ add(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \
+ store(output + 14 * stride, \
+ add(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \
+ store(output + 15 * stride, \
+ add(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \
+ }
+#define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) \
+ ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC kWeight3 = constant(0.92388f); \
+ const T_VEC kWeight4 = constant(0.382683f); \
+ const T_VEC kWeight5 = constant(0.980785f); \
+ const T_VEC kWeight6 = constant(0.19509f); \
+ const T_VEC kWeight7 = constant(0.83147f); \
+ const T_VEC kWeight8 = constant(0.55557f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC i8 = load(input + 8 * stride); \
+ const T_VEC i9 = load(input + 9 * stride); \
+ const T_VEC i10 = load(input + 10 * stride); \
+ const T_VEC i11 = load(input + 11 * stride); \
+ const T_VEC i12 = load(input + 12 * stride); \
+ const T_VEC i13 = load(input + 13 * stride); \
+ const T_VEC i14 = load(input + 14 * stride); \
+ const T_VEC i15 = load(input + 15 * stride); \
+ const T_VEC i16 = load(input + 16 * stride); \
+ const T_VEC i17 = load(input + 17 * stride); \
+ const T_VEC i18 = load(input + 18 * stride); \
+ const T_VEC i19 = load(input + 19 * stride); \
+ const T_VEC i20 = load(input + 20 * stride); \
+ const T_VEC i21 = load(input + 21 * stride); \
+ const T_VEC i22 = load(input + 22 * stride); \
+ const T_VEC i23 = load(input + 23 * stride); \
+ const T_VEC i24 = load(input + 24 * stride); \
+ const T_VEC i25 = load(input + 25 * stride); \
+ const T_VEC i26 = load(input + 26 * stride); \
+ const T_VEC i27 = load(input + 27 * stride); \
+ const T_VEC i28 = load(input + 28 * stride); \
+ const T_VEC i29 = load(input + 29 * stride); \
+ const T_VEC i30 = load(input + 30 * stride); \
+ const T_VEC i31 = load(input + 31 * stride); \
+ const T_VEC w30 = add(i0, i16); \
+ const T_VEC w31 = sub(i0, i16); \
+ const T_VEC w32[2] = { add(i8, i8), sub(i24, i24) }; \
+ const T_VEC w33[2] = { sub(i8, i8), sub(sub(kWeight0, i24), i24) }; \
+ const T_VEC w34[2] = { add(w30, w32[0]), w32[1] }; \
+ const T_VEC w35[2] = { sub(w30, w32[0]), sub(kWeight0, w32[1]) }; \
+ const T_VEC w36[2] = { add(w31, w33[1]), sub(kWeight0, w33[0]) }; \
+ const T_VEC w37[2] = { sub(w31, w33[1]), w33[0] }; \
+ const T_VEC w38[2] = { add(i4, i12), sub(i28, i20) }; \
+ const T_VEC w39[2] = { sub(i4, i12), sub(sub(kWeight0, i20), i28) }; \
+ const T_VEC w40[2] = { add(i12, i4), sub(i20, i28) }; \
+ const T_VEC w41[2] = { sub(i12, i4), sub(sub(kWeight0, i28), i20) }; \
+ const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \
+ const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \
+ const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \
+ const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \
+ const T_VEC w46[2] = { add(w34[0], w42[0]), add(w34[1], w42[1]) }; \
+ const T_VEC w47[2] = { sub(w34[0], w42[0]), sub(w34[1], w42[1]) }; \
+ const T_VEC w48[2] = { add(w36[0], mul(kWeight2, add(w44[0], w44[1]))), \
+ add(w36[1], mul(kWeight2, sub(w44[1], w44[0]))) }; \
+ const T_VEC w49[2] = { add(w36[0], \
+ sub(sub(kWeight0, mul(kWeight2, w44[0])), \
+ mul(kWeight2, w44[1]))), \
+ add(w36[1], mul(kWeight2, sub(w44[0], w44[1]))) }; \
+ const T_VEC w50[2] = { add(w35[0], w43[1]), sub(w35[1], w43[0]) }; \
+ const T_VEC w51[2] = { sub(w35[0], w43[1]), add(w35[1], w43[0]) }; \
+ const T_VEC w52[2] = { sub(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \
+ sub(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \
+ const T_VEC w53[2] = { add(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \
+ add(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \
+ const T_VEC w54[2] = { add(i2, i14), sub(i30, i18) }; \
+ const T_VEC w55[2] = { sub(i2, i14), sub(sub(kWeight0, i18), i30) }; \
+ const T_VEC w56[2] = { add(i10, i6), sub(i22, i26) }; \
+ const T_VEC w57[2] = { sub(i10, i6), sub(sub(kWeight0, i26), i22) }; \
+ const T_VEC w58[2] = { add(w54[0], w56[0]), add(w54[1], w56[1]) }; \
+ const T_VEC w59[2] = { sub(w54[0], w56[0]), sub(w54[1], w56[1]) }; \
+ const T_VEC w60[2] = { add(w55[0], w57[1]), sub(w55[1], w57[0]) }; \
+ const T_VEC w61[2] = { sub(w55[0], w57[1]), add(w55[1], w57[0]) }; \
+ const T_VEC w62[2] = { add(i6, i10), sub(i26, i22) }; \
+ const T_VEC w63[2] = { sub(i6, i10), sub(sub(kWeight0, i22), i26) }; \
+ const T_VEC w64[2] = { add(i14, i2), sub(i18, i30) }; \
+ const T_VEC w65[2] = { sub(i14, i2), sub(sub(kWeight0, i30), i18) }; \
+ const T_VEC w66[2] = { add(w62[0], w64[0]), add(w62[1], w64[1]) }; \
+ const T_VEC w67[2] = { sub(w62[0], w64[0]), sub(w62[1], w64[1]) }; \
+ const T_VEC w68[2] = { add(w63[0], w65[1]), sub(w63[1], w65[0]) }; \
+ const T_VEC w69[2] = { sub(w63[0], w65[1]), add(w63[1], w65[0]) }; \
+ const T_VEC w70[2] = { add(w58[0], w66[0]), add(w58[1], w66[1]) }; \
+ const T_VEC w71[2] = { sub(w58[0], w66[0]), sub(w58[1], w66[1]) }; \
+ const T_VEC w72[2] = { add(w60[0], mul(kWeight2, add(w68[0], w68[1]))), \
+ add(w60[1], mul(kWeight2, sub(w68[1], w68[0]))) }; \
+ const T_VEC w73[2] = { add(w60[0], \
+ sub(sub(kWeight0, mul(kWeight2, w68[0])), \
+ mul(kWeight2, w68[1]))), \
+ add(w60[1], mul(kWeight2, sub(w68[0], w68[1]))) }; \
+ const T_VEC w74[2] = { add(w59[0], w67[1]), sub(w59[1], w67[0]) }; \
+ const T_VEC w75[2] = { sub(w59[0], w67[1]), add(w59[1], w67[0]) }; \
+ const T_VEC w76[2] = { sub(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \
+ sub(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \
+ const T_VEC w77[2] = { add(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \
+ add(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \
+ const T_VEC w78[2] = { add(w46[0], w70[0]), add(w46[1], w70[1]) }; \
+ const T_VEC w79[2] = { sub(w46[0], w70[0]), sub(w46[1], w70[1]) }; \
+ const T_VEC w80[2] = { \
+ add(w48[0], add(mul(kWeight3, w72[0]), mul(kWeight4, w72[1]))), \
+ add(w48[1], sub(mul(kWeight3, w72[1]), mul(kWeight4, w72[0]))) \
+ }; \
+ const T_VEC w81[2] = { \
+ add(w48[0], \
+ sub(sub(kWeight0, mul(kWeight3, w72[0])), mul(kWeight4, w72[1]))), \
+ add(w48[1], sub(mul(kWeight4, w72[0]), mul(kWeight3, w72[1]))) \
+ }; \
+ const T_VEC w82[2] = { add(w50[0], mul(kWeight2, add(w74[0], w74[1]))), \
+ add(w50[1], mul(kWeight2, sub(w74[1], w74[0]))) }; \
+ const T_VEC w83[2] = { add(w50[0], \
+ sub(sub(kWeight0, mul(kWeight2, w74[0])), \
+ mul(kWeight2, w74[1]))), \
+ add(w50[1], mul(kWeight2, sub(w74[0], w74[1]))) }; \
+ const T_VEC w84[2] = { \
+ add(w52[0], add(mul(kWeight4, w76[0]), mul(kWeight3, w76[1]))), \
+ add(w52[1], sub(mul(kWeight4, w76[1]), mul(kWeight3, w76[0]))) \
+ }; \
+ const T_VEC w85[2] = { \
+ add(w52[0], \
+ sub(sub(kWeight0, mul(kWeight4, w76[0])), mul(kWeight3, w76[1]))), \
+ add(w52[1], sub(mul(kWeight3, w76[0]), mul(kWeight4, w76[1]))) \
+ }; \
+ const T_VEC w86[2] = { add(w47[0], w71[1]), sub(w47[1], w71[0]) }; \
+ const T_VEC w87[2] = { sub(w47[0], w71[1]), add(w47[1], w71[0]) }; \
+ const T_VEC w88[2] = { \
+ sub(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \
+ add(w49[1], \
+ sub(sub(kWeight0, mul(kWeight4, w73[1])), mul(kWeight3, w73[0]))) \
+ }; \
+ const T_VEC w89[2] = { \
+ add(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \
+ add(w49[1], add(mul(kWeight4, w73[1]), mul(kWeight3, w73[0]))) \
+ }; \
+ const T_VEC w90[2] = { sub(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \
+ sub(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \
+ const T_VEC w91[2] = { add(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \
+ add(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \
+ const T_VEC w92[2] = { \
+ sub(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \
+ add(w53[1], \
+ sub(sub(kWeight0, mul(kWeight3, w77[1])), mul(kWeight4, w77[0]))) \
+ }; \
+ const T_VEC w93[2] = { \
+ add(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \
+ add(w53[1], add(mul(kWeight3, w77[1]), mul(kWeight4, w77[0]))) \
+ }; \
+ const T_VEC w94[2] = { add(i1, i15), sub(i31, i17) }; \
+ const T_VEC w95[2] = { sub(i1, i15), sub(sub(kWeight0, i17), i31) }; \
+ const T_VEC w96[2] = { add(i9, i7), sub(i23, i25) }; \
+ const T_VEC w97[2] = { sub(i9, i7), sub(sub(kWeight0, i25), i23) }; \
+ const T_VEC w98[2] = { add(w94[0], w96[0]), add(w94[1], w96[1]) }; \
+ const T_VEC w99[2] = { sub(w94[0], w96[0]), sub(w94[1], w96[1]) }; \
+ const T_VEC w100[2] = { add(w95[0], w97[1]), sub(w95[1], w97[0]) }; \
+ const T_VEC w101[2] = { sub(w95[0], w97[1]), add(w95[1], w97[0]) }; \
+ const T_VEC w102[2] = { add(i5, i11), sub(i27, i21) }; \
+ const T_VEC w103[2] = { sub(i5, i11), sub(sub(kWeight0, i21), i27) }; \
+ const T_VEC w104[2] = { add(i13, i3), sub(i19, i29) }; \
+ const T_VEC w105[2] = { sub(i13, i3), sub(sub(kWeight0, i29), i19) }; \
+ const T_VEC w106[2] = { add(w102[0], w104[0]), add(w102[1], w104[1]) }; \
+ const T_VEC w107[2] = { sub(w102[0], w104[0]), sub(w102[1], w104[1]) }; \
+ const T_VEC w108[2] = { add(w103[0], w105[1]), sub(w103[1], w105[0]) }; \
+ const T_VEC w109[2] = { sub(w103[0], w105[1]), add(w103[1], w105[0]) }; \
+ const T_VEC w110[2] = { add(w98[0], w106[0]), add(w98[1], w106[1]) }; \
+ const T_VEC w111[2] = { sub(w98[0], w106[0]), sub(w98[1], w106[1]) }; \
+ const T_VEC w112[2] = { \
+ add(w100[0], mul(kWeight2, add(w108[0], w108[1]))), \
+ add(w100[1], mul(kWeight2, sub(w108[1], w108[0]))) \
+ }; \
+ const T_VEC w113[2] = { \
+ add(w100[0], \
+ sub(sub(kWeight0, mul(kWeight2, w108[0])), mul(kWeight2, w108[1]))), \
+ add(w100[1], mul(kWeight2, sub(w108[0], w108[1]))) \
+ }; \
+ const T_VEC w114[2] = { add(w99[0], w107[1]), sub(w99[1], w107[0]) }; \
+ const T_VEC w115[2] = { sub(w99[0], w107[1]), add(w99[1], w107[0]) }; \
+ const T_VEC w116[2] = { \
+ sub(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \
+ sub(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \
+ }; \
+ const T_VEC w117[2] = { \
+ add(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \
+ add(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \
+ }; \
+ const T_VEC w118[2] = { add(i3, i13), sub(i29, i19) }; \
+ const T_VEC w119[2] = { sub(i3, i13), sub(sub(kWeight0, i19), i29) }; \
+ const T_VEC w120[2] = { add(i11, i5), sub(i21, i27) }; \
+ const T_VEC w121[2] = { sub(i11, i5), sub(sub(kWeight0, i27), i21) }; \
+ const T_VEC w122[2] = { add(w118[0], w120[0]), add(w118[1], w120[1]) }; \
+ const T_VEC w123[2] = { sub(w118[0], w120[0]), sub(w118[1], w120[1]) }; \
+ const T_VEC w124[2] = { add(w119[0], w121[1]), sub(w119[1], w121[0]) }; \
+ const T_VEC w125[2] = { sub(w119[0], w121[1]), add(w119[1], w121[0]) }; \
+ const T_VEC w126[2] = { add(i7, i9), sub(i25, i23) }; \
+ const T_VEC w127[2] = { sub(i7, i9), sub(sub(kWeight0, i23), i25) }; \
+ const T_VEC w128[2] = { add(i15, i1), sub(i17, i31) }; \
+ const T_VEC w129[2] = { sub(i15, i1), sub(sub(kWeight0, i31), i17) }; \
+ const T_VEC w130[2] = { add(w126[0], w128[0]), add(w126[1], w128[1]) }; \
+ const T_VEC w131[2] = { sub(w126[0], w128[0]), sub(w126[1], w128[1]) }; \
+ const T_VEC w132[2] = { add(w127[0], w129[1]), sub(w127[1], w129[0]) }; \
+ const T_VEC w133[2] = { sub(w127[0], w129[1]), add(w127[1], w129[0]) }; \
+ const T_VEC w134[2] = { add(w122[0], w130[0]), add(w122[1], w130[1]) }; \
+ const T_VEC w135[2] = { sub(w122[0], w130[0]), sub(w122[1], w130[1]) }; \
+ const T_VEC w136[2] = { \
+ add(w124[0], mul(kWeight2, add(w132[0], w132[1]))), \
+ add(w124[1], mul(kWeight2, sub(w132[1], w132[0]))) \
+ }; \
+ const T_VEC w137[2] = { \
+ add(w124[0], \
+ sub(sub(kWeight0, mul(kWeight2, w132[0])), mul(kWeight2, w132[1]))), \
+ add(w124[1], mul(kWeight2, sub(w132[0], w132[1]))) \
+ }; \
+ const T_VEC w138[2] = { add(w123[0], w131[1]), sub(w123[1], w131[0]) }; \
+ const T_VEC w139[2] = { sub(w123[0], w131[1]), add(w123[1], w131[0]) }; \
+ const T_VEC w140[2] = { \
+ sub(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \
+ sub(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \
+ }; \
+ const T_VEC w141[2] = { \
+ add(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \
+ add(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \
+ }; \
+ const T_VEC w142[2] = { add(w110[0], w134[0]), add(w110[1], w134[1]) }; \
+ const T_VEC w143[2] = { sub(w110[0], w134[0]), sub(w110[1], w134[1]) }; \
+ const T_VEC w144[2] = { \
+ add(w112[0], add(mul(kWeight3, w136[0]), mul(kWeight4, w136[1]))), \
+ add(w112[1], sub(mul(kWeight3, w136[1]), mul(kWeight4, w136[0]))) \
+ }; \
+ const T_VEC w145[2] = { \
+ add(w112[0], \
+ sub(sub(kWeight0, mul(kWeight3, w136[0])), mul(kWeight4, w136[1]))), \
+ add(w112[1], sub(mul(kWeight4, w136[0]), mul(kWeight3, w136[1]))) \
+ }; \
+ const T_VEC w146[2] = { \
+ add(w114[0], mul(kWeight2, add(w138[0], w138[1]))), \
+ add(w114[1], mul(kWeight2, sub(w138[1], w138[0]))) \
+ }; \
+ const T_VEC w147[2] = { \
+ add(w114[0], \
+ sub(sub(kWeight0, mul(kWeight2, w138[0])), mul(kWeight2, w138[1]))), \
+ add(w114[1], mul(kWeight2, sub(w138[0], w138[1]))) \
+ }; \
+ const T_VEC w148[2] = { \
+ add(w116[0], add(mul(kWeight4, w140[0]), mul(kWeight3, w140[1]))), \
+ add(w116[1], sub(mul(kWeight4, w140[1]), mul(kWeight3, w140[0]))) \
+ }; \
+ const T_VEC w149[2] = { \
+ add(w116[0], \
+ sub(sub(kWeight0, mul(kWeight4, w140[0])), mul(kWeight3, w140[1]))), \
+ add(w116[1], sub(mul(kWeight3, w140[0]), mul(kWeight4, w140[1]))) \
+ }; \
+ const T_VEC w150[2] = { add(w111[0], w135[1]), sub(w111[1], w135[0]) }; \
+ const T_VEC w151[2] = { sub(w111[0], w135[1]), add(w111[1], w135[0]) }; \
+ const T_VEC w152[2] = { \
+ sub(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \
+ add(w113[1], \
+ sub(sub(kWeight0, mul(kWeight4, w137[1])), mul(kWeight3, w137[0]))) \
+ }; \
+ const T_VEC w153[2] = { \
+ add(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \
+ add(w113[1], add(mul(kWeight4, w137[1]), mul(kWeight3, w137[0]))) \
+ }; \
+ const T_VEC w154[2] = { \
+ sub(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \
+ sub(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \
+ }; \
+ const T_VEC w155[2] = { \
+ add(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \
+ add(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \
+ }; \
+ const T_VEC w156[2] = { \
+ sub(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \
+ add(w117[1], \
+ sub(sub(kWeight0, mul(kWeight3, w141[1])), mul(kWeight4, w141[0]))) \
+ }; \
+ const T_VEC w157[2] = { \
+ add(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \
+ add(w117[1], add(mul(kWeight3, w141[1]), mul(kWeight4, w141[0]))) \
+ }; \
+ store(output + 0 * stride, add(w78[0], w142[0])); \
+ store(output + 1 * stride, \
+ add(w80[0], add(mul(kWeight5, w144[0]), mul(kWeight6, w144[1])))); \
+ store(output + 2 * stride, \
+ add(w82[0], add(mul(kWeight3, w146[0]), mul(kWeight4, w146[1])))); \
+ store(output + 3 * stride, \
+ add(w84[0], add(mul(kWeight7, w148[0]), mul(kWeight8, w148[1])))); \
+ store(output + 4 * stride, \
+ add(w86[0], mul(kWeight2, add(w150[0], w150[1])))); \
+ store(output + 5 * stride, \
+ add(w88[0], add(mul(kWeight8, w152[0]), mul(kWeight7, w152[1])))); \
+ store(output + 6 * stride, \
+ add(w90[0], add(mul(kWeight4, w154[0]), mul(kWeight3, w154[1])))); \
+ store(output + 7 * stride, \
+ add(w92[0], add(mul(kWeight6, w156[0]), mul(kWeight5, w156[1])))); \
+ store(output + 8 * stride, add(w79[0], w143[1])); \
+ store(output + 9 * stride, \
+ sub(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \
+ store(output + 10 * stride, \
+ sub(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \
+ store(output + 11 * stride, \
+ sub(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \
+ store(output + 12 * stride, \
+ sub(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \
+ store(output + 13 * stride, \
+ sub(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \
+ store(output + 14 * stride, \
+ sub(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \
+ store(output + 15 * stride, \
+ sub(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \
+ store(output + 16 * stride, sub(w78[0], w142[0])); \
+ store(output + 17 * stride, \
+ add(w80[0], sub(sub(kWeight0, mul(kWeight5, w144[0])), \
+ mul(kWeight6, w144[1])))); \
+ store(output + 18 * stride, \
+ add(w82[0], sub(sub(kWeight0, mul(kWeight3, w146[0])), \
+ mul(kWeight4, w146[1])))); \
+ store(output + 19 * stride, \
+ add(w84[0], sub(sub(kWeight0, mul(kWeight7, w148[0])), \
+ mul(kWeight8, w148[1])))); \
+ store(output + 20 * stride, \
+ add(w86[0], sub(sub(kWeight0, mul(kWeight2, w150[0])), \
+ mul(kWeight2, w150[1])))); \
+ store(output + 21 * stride, \
+ add(w88[0], sub(sub(kWeight0, mul(kWeight8, w152[0])), \
+ mul(kWeight7, w152[1])))); \
+ store(output + 22 * stride, \
+ add(w90[0], sub(sub(kWeight0, mul(kWeight4, w154[0])), \
+ mul(kWeight3, w154[1])))); \
+ store(output + 23 * stride, \
+ add(w92[0], sub(sub(kWeight0, mul(kWeight6, w156[0])), \
+ mul(kWeight5, w156[1])))); \
+ store(output + 24 * stride, sub(w79[0], w143[1])); \
+ store(output + 25 * stride, \
+ add(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \
+ store(output + 26 * stride, \
+ add(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \
+ store(output + 27 * stride, \
+ add(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \
+ store(output + 28 * stride, \
+ add(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \
+ store(output + 29 * stride, \
+ add(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \
+ store(output + 30 * stride, \
+ add(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \
+ store(output + 31 * stride, \
+ add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \
+ }
+
+#endif // AOM_DSP_FFT_COMMON_H_
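
The generator macros above take the arithmetic primitives as parameters, so the
same butterfly bodies serve scalar and SIMD builds alike. As orientation, here
is a minimal sketch of a scalar instantiation; the helper names are
illustrative, and the actual wrappers in the fft.c added by this change may
differ in detail:

    static INLINE float load_float(const float *a) { return *a; }
    static INLINE void store_float(float *a, float b) { *a = b; }
    static INLINE float set1_float(float a) { return a; }
    static INLINE float add_float(float a, float b) { return a + b; }
    static INLINE float sub_float(float a, float b) { return a - b; }
    static INLINE float mul_float(float a, float b) { return a * b; }

    // Expected to expand to:
    //   void aom_fft1d_32_float(const float *input, float *output, int stride)
    // Non-static, so the SIMD inverse-transform wrappers (see fft_avx2.c and
    // fft_sse2.c below, which reference aom_fft1d_*_float) can reuse the
    // scalar kernels.
    GEN_FFT_32(void, float, float, float, load_float, store_float, set1_float,
               add_float, sub_float, mul_float);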
diff --git a/aom_dsp/noise_model.c b/aom_dsp/noise_model.c
index 117185caf..6c0cf62df 100644
--- a/aom_dsp/noise_model.c
+++ b/aom_dsp/noise_model.c
@@ -18,6 +18,7 @@
#include "aom_dsp/noise_model.h"
#include "aom_dsp/noise_util.h"
#include "aom_mem/aom_mem.h"
+#include "av1/common/common.h"
#include "av1/encoder/mathutils.h"
#define kLowPolyNumParams 3
@@ -1268,3 +1269,192 @@ int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
film_grain->overlap_flag = 1;
return 1;
}
+
+static void pointwise_multiply(const float *a, float *b, int n) {
+ for (int i = 0; i < n; ++i) {
+ b[i] *= a[i];
+ }
+}
+
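+// Returns a block_size x block_size window that tapers smoothly toward zero
+// at the block borders. Note cos((t + .5) * PI / N - PI / 2) is just
+// sin((t + .5) * PI / N), so each tap is the product of half-period sine
+// windows in y and x.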
+static float *get_half_cos_window(int block_size) {
+ float *window_function =
+ (float *)aom_malloc(block_size * block_size * sizeof(*window_function));
+ for (int y = 0; y < block_size; ++y) {
+ const double cos_yd = cos((.5 + y) * PI / block_size - PI / 2);
+ for (int x = 0; x < block_size; ++x) {
+ const double cos_xd = cos((.5 + x) * PI / block_size - PI / 2);
+ window_function[y * block_size + x] = (float)(cos_yd * cos_xd);
+ }
+ }
+ return window_function;
+}
+
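+// Converts the accumulated float result back to integer pixel values,
+// clamping to [0, block_normalization] and diffusing each pixel's rounding
+// error into unvisited neighbors with the Floyd-Steinberg weights
+// (7/16, 3/16, 5/16, 1/16).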
+#define DITHER_AND_QUANTIZE(INT_TYPE, suffix) \
+ static void dither_and_quantize_##suffix( \
+ float *result, int result_stride, INT_TYPE *denoised, int w, int h, \
+ int stride, int chroma_sub_w, int chroma_sub_h, int block_size, \
+ float block_normalization) { \
+ for (int y = 0; y < (h >> chroma_sub_h); ++y) { \
+ for (int x = 0; x < (w >> chroma_sub_w); ++x) { \
+ const int result_idx = \
+ (y + (block_size >> chroma_sub_h)) * result_stride + x + \
+ (block_size >> chroma_sub_w); \
+ INT_TYPE new_val = (INT_TYPE)AOMMIN( \
+ AOMMAX(result[result_idx] * block_normalization + 0.5f, 0), \
+ block_normalization); \
+ const float err = \
+ -(((float)new_val) / block_normalization - result[result_idx]); \
+ denoised[y * stride + x] = new_val; \
+ if (x + 1 < (w >> chroma_sub_w)) { \
+ result[result_idx + 1] += err * 7.0f / 16.0f; \
+ } \
+ if (y + 1 < (h >> chroma_sub_h)) { \
+ if (x > 0) { \
+ result[result_idx + result_stride - 1] += err * 3.0f / 16.0f; \
+ } \
+ result[result_idx + result_stride] += err * 5.0f / 16.0f; \
+ if (x + 1 < (w >> chroma_sub_w)) { \
+ result[result_idx + result_stride + 1] += err * 1.0f / 16.0f; \
+ } \
+ } \
+ } \
+ } \
+ }
+
+DITHER_AND_QUANTIZE(uint8_t, lowbd);
+DITHER_AND_QUANTIZE(uint16_t, highbd);
+
+int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
+ int w, int h, int stride[3], int chroma_sub[2],
+ float *noise_psd[3], int block_size, int bit_depth,
+ int use_highbd) {
+ float *plane = NULL, *block = NULL, *window_full = NULL,
+ *window_chroma = NULL;
+ double *block_d = NULL, *plane_d = NULL;
+ struct aom_noise_tx_t *tx_full = NULL;
+ struct aom_noise_tx_t *tx_chroma = NULL;
+ const int num_blocks_w = (w + block_size - 1) / block_size;
+ const int num_blocks_h = (h + block_size - 1) / block_size;
+ const int result_stride = (num_blocks_w + 2) * block_size;
+ const int result_height = (num_blocks_h + 2) * block_size;
+ float *result = NULL;
+ int init_success = 1;
+ aom_flat_block_finder_t block_finder_full;
+ aom_flat_block_finder_t block_finder_chroma;
+ const float kBlockNormalization = (1 << bit_depth) - 1;
+ if (chroma_sub[0] != chroma_sub[1]) {
+ fprintf(stderr,
+ "aom_wiener_denoise_2d doesn't handle different chroma "
+ "subsampling");
+ return 0;
+ }
+ init_success &= aom_flat_block_finder_init(&block_finder_full, block_size,
+ bit_depth, use_highbd);
+  result = (float *)aom_malloc(result_height * result_stride *
+                               sizeof(*result));
+ plane = (float *)aom_malloc(block_size * block_size * sizeof(*plane));
+ block =
+ (float *)aom_memalign(32, 2 * block_size * block_size * sizeof(*block));
+ block_d = (double *)aom_malloc(block_size * block_size * sizeof(*block_d));
+ plane_d = (double *)aom_malloc(block_size * block_size * sizeof(*plane_d));
+ window_full = get_half_cos_window(block_size);
+ tx_full = aom_noise_tx_malloc(block_size);
+
+ if (chroma_sub[0] != 0) {
+ init_success &= aom_flat_block_finder_init(&block_finder_chroma,
+ block_size >> chroma_sub[0],
+ bit_depth, use_highbd);
+ window_chroma = get_half_cos_window(block_size >> chroma_sub[0]);
+ tx_chroma = aom_noise_tx_malloc(block_size >> chroma_sub[0]);
+ } else {
+ window_chroma = window_full;
+ tx_chroma = tx_full;
+ }
+
+ init_success &= (tx_full != NULL) && (tx_chroma != NULL) && (plane != NULL) &&
+ (plane_d != NULL) && (block != NULL) && (block_d != NULL) &&
+ (window_full != NULL) && (window_chroma != NULL) &&
+ (result != NULL);
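+  // If any of the allocations above failed, skip the per-plane loop entirely.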
+ for (int c = init_success ? 0 : 3; c < 3; ++c) {
+ float *window_function = c == 0 ? window_full : window_chroma;
+ aom_flat_block_finder_t *block_finder = &block_finder_full;
+ const int chroma_sub_h = c > 0 ? chroma_sub[1] : 0;
+ const int chroma_sub_w = c > 0 ? chroma_sub[0] : 0;
+ struct aom_noise_tx_t *tx =
+ (c > 0 && chroma_sub[0] > 0) ? tx_chroma : tx_full;
+ if (!data[c] || !denoised[c]) continue;
+ if (c > 0 && chroma_sub[0] != 0) {
+ block_finder = &block_finder_chroma;
+ }
+ memset(result, 0, sizeof(*result) * result_stride * result_height);
+ // Do overlapped block processing (half overlapped). The block rows can
+    // easily be done in parallel.
+ for (int offsy = 0; offsy < (block_size >> chroma_sub_h);
+ offsy += (block_size >> chroma_sub_h) / 2) {
+ for (int offsx = 0; offsx < (block_size >> chroma_sub_w);
+ offsx += (block_size >> chroma_sub_w) / 2) {
+ // Pad the boundary when processing each block-set.
+ for (int by = -1; by < num_blocks_h; ++by) {
+ for (int bx = -1; bx < num_blocks_w; ++bx) {
+ const int pixels_per_block =
+ (block_size >> chroma_sub_w) * (block_size >> chroma_sub_h);
+ aom_flat_block_finder_extract_block(
+ block_finder, data[c], w >> chroma_sub_w, h >> chroma_sub_h,
+ stride[c], bx * (block_size >> chroma_sub_w) + offsx,
+ by * (block_size >> chroma_sub_h) + offsy, plane_d, block_d);
+ for (int j = 0; j < pixels_per_block; ++j) {
+ block[j] = (float)block_d[j];
+ plane[j] = (float)plane_d[j];
+ }
+ pointwise_multiply(window_function, block, pixels_per_block);
+ aom_noise_tx_forward(tx, block);
+ aom_noise_tx_filter(tx, noise_psd[c]);
+ aom_noise_tx_inverse(tx, block);
+
+ // Apply window function to the plane approximation (we will apply
+ // it to the sum of plane + block when composing the results).
+ pointwise_multiply(window_function, plane, pixels_per_block);
+
+ for (int y = 0; y < (block_size >> chroma_sub_h); ++y) {
+ const int y_result =
+ y + (by + 1) * (block_size >> chroma_sub_h) + offsy;
+ for (int x = 0; x < (block_size >> chroma_sub_w); ++x) {
+ const int x_result =
+ x + (bx + 1) * (block_size >> chroma_sub_w) + offsx;
+ result[y_result * result_stride + x_result] +=
+ (block[y * (block_size >> chroma_sub_w) + x] +
+ plane[y * (block_size >> chroma_sub_w) + x]) *
+ window_function[y * (block_size >> chroma_sub_w) + x];
+ }
+ }
+ }
+ }
+ }
+ }
+ if (use_highbd) {
+ dither_and_quantize_highbd(result, result_stride, (uint16_t *)denoised[c],
+ w, h, stride[c], chroma_sub_w, chroma_sub_h,
+ block_size, kBlockNormalization);
+ } else {
+ dither_and_quantize_lowbd(result, result_stride, denoised[c], w, h,
+ stride[c], chroma_sub_w, chroma_sub_h,
+ block_size, kBlockNormalization);
+ }
+ }
+ aom_free(result);
+ aom_free(plane);
+ aom_free(block);
+ aom_free(plane_d);
+ aom_free(block_d);
+ aom_free(window_full);
+
+ aom_noise_tx_free(tx_full);
+
+ aom_flat_block_finder_free(&block_finder_full);
+ if (chroma_sub[0] != 0) {
+ aom_flat_block_finder_free(&block_finder_chroma);
+ aom_free(window_chroma);
+ aom_noise_tx_free(tx_chroma);
+ }
+ return init_success;
+}
diff --git a/aom_dsp/noise_model.h b/aom_dsp/noise_model.h
index 34a364de0..dabeacc14 100644
--- a/aom_dsp/noise_model.h
+++ b/aom_dsp/noise_model.h
@@ -261,6 +261,25 @@ void aom_noise_model_save_latest(aom_noise_model_t *noise_model);
int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
aom_film_grain_t *film_grain);
+/*!\brief Perform 2D Wiener-filter denoising using the provided noise PSD.
+ *
+ * \param[in] data Raw frame data
+ * \param[out] denoised Denoised frame data
+ * \param[in] w Frame width
+ * \param[in] h Frame height
+ * \param[in] stride Stride of the planes
+ * \param[in]     chroma_sub_log2 Log2 chroma subsampling for planes != 0.
+ * \param[in] noise_psd The power spectral density of the noise
+ * \param[in] block_size The size of blocks
+ * \param[in] bit_depth Bit depth of the image
+ * \param[in] use_highbd If true, uint8 pointers are interpreted as
+ * uint16 and stride is measured in uint16.
+ * This must be true when bit_depth >= 10.
+ *
+ * \return 1 on success, 0 on failure (invalid arguments or failed
+ *         allocation).
+ */
+int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
+ int w, int h, int stride[3], int chroma_sub_log2[2],
+ float *noise_psd[3], int block_size, int bit_depth,
+ int use_highbd);
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
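
To make the calling convention concrete, here is a hedged sketch of driving the
new entry point for an 8-bit 4:2:0 frame. The wrapper name is hypothetical, and
noise_psd would normally be filled from aom_noise_psd_get_default_value or from
estimates accumulated with aom_noise_tx_add_energy:

    #include <stdint.h>
    #include "aom_dsp/noise_model.h"

    // Hypothetical helper, not part of this change.
    static int denoise_420_frame(const uint8_t *const data[3],
                                 uint8_t *denoised[3], int w, int h,
                                 int stride[3], float *noise_psd[3]) {
      int chroma_sub_log2[2] = { 1, 1 };  // 4:2:0: chroma halved in x and y
      return aom_wiener_denoise_2d(data, denoised, w, h, stride,
                                   chroma_sub_log2, noise_psd,
                                   /*block_size=*/32, /*bit_depth=*/8,
                                   /*use_highbd=*/0);
    }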
diff --git a/aom_dsp/noise_util.c b/aom_dsp/noise_util.c
index c81874c6e..db4ca07c7 100644
--- a/aom_dsp/noise_util.c
+++ b/aom_dsp/noise_util.c
@@ -13,9 +13,120 @@
#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
#include "aom_dsp/noise_util.h"
+#include "aom_dsp/fft_common.h"
#include "aom_mem/aom_mem.h"
+#include "config/aom_dsp_rtcd.h"
+
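+// For example, block_size = 32 with a moderate factor of 2.5 yields a flat
+// per-coefficient PSD value of (2.5 * 2.5 / 10000) * 32 * 32 / 8 = 0.08.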
+float aom_noise_psd_get_default_value(int block_size, float factor) {
+ return (factor * factor / 10000) * block_size * block_size / 8;
+}
+
+// Internal representation of noise transform. It keeps track of the
+// transformed data and a temporary working buffer to use during the
+// transform.
+struct aom_noise_tx_t {
+ float *tx_block;
+ float *temp;
+ int block_size;
+ void (*fft)(const float *, float *, float *);
+ void (*ifft)(const float *, float *, float *);
+};
+
+struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size) {
+ struct aom_noise_tx_t *noise_tx =
+ (struct aom_noise_tx_t *)aom_malloc(sizeof(struct aom_noise_tx_t));
+ if (!noise_tx) return NULL;
+ memset(noise_tx, 0, sizeof(*noise_tx));
+ switch (block_size) {
+ case 2:
+ noise_tx->fft = aom_fft2x2_float;
+ noise_tx->ifft = aom_ifft2x2_float;
+ break;
+ case 4:
+ noise_tx->fft = aom_fft4x4_float;
+ noise_tx->ifft = aom_ifft4x4_float;
+ break;
+ case 8:
+ noise_tx->fft = aom_fft8x8_float;
+ noise_tx->ifft = aom_ifft8x8_float;
+ break;
+ case 16:
+ noise_tx->fft = aom_fft16x16_float;
+ noise_tx->ifft = aom_ifft16x16_float;
+ break;
+ case 32:
+ noise_tx->fft = aom_fft32x32_float;
+ noise_tx->ifft = aom_ifft32x32_float;
+ break;
+ default:
+ aom_free(noise_tx);
+ fprintf(stderr, "Unsupported block size %d\n", block_size);
+ return NULL;
+ }
+ noise_tx->block_size = block_size;
+ noise_tx->tx_block = (float *)aom_memalign(
+ 32, 2 * sizeof(*noise_tx->tx_block) * block_size * block_size);
+ noise_tx->temp = (float *)aom_memalign(
+ 32, 2 * sizeof(*noise_tx->temp) * block_size * block_size);
+ if (!noise_tx->tx_block || !noise_tx->temp) {
+ aom_noise_tx_free(noise_tx);
+ return NULL;
+ }
+ return noise_tx;
+}
+
+void aom_noise_tx_forward(struct aom_noise_tx_t *noise_tx, const float *data) {
+ noise_tx->fft(data, noise_tx->temp, noise_tx->tx_block);
+}
+
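+// Applies per-coefficient Wiener shrinkage: a coefficient with power
+// p = re * re + im * im is scaled by (p - psd) / p when p clearly exceeds
+// the expected noise power (p > kBeta * psd) and is non-negligible;
+// otherwise it is attenuated by the minimum gain (kBeta - 1) / kBeta.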
+void aom_noise_tx_filter(struct aom_noise_tx_t *noise_tx, const float *psd) {
+ const int block_size = noise_tx->block_size;
+ const float kBeta = 1.1f;
+ const float kEps = 1e-6f;
+ for (int y = 0; y < block_size; ++y) {
+ for (int x = 0; x < block_size; ++x) {
+ int i = y * block_size + x;
+ float *c = noise_tx->tx_block + 2 * i;
+ const float p = c[0] * c[0] + c[1] * c[1];
+      if (p > kBeta * psd[i] && p > kEps) {
+ noise_tx->tx_block[2 * i + 0] *= (p - psd[i]) / AOMMAX(p, kEps);
+ noise_tx->tx_block[2 * i + 1] *= (p - psd[i]) / AOMMAX(p, kEps);
+ } else {
+ noise_tx->tx_block[2 * i + 0] *= (kBeta - 1.0f) / kBeta;
+ noise_tx->tx_block[2 * i + 1] *= (kBeta - 1.0f) / kBeta;
+ }
+ }
+ }
+}
+
+void aom_noise_tx_inverse(struct aom_noise_tx_t *noise_tx, float *data) {
+ const int n = noise_tx->block_size * noise_tx->block_size;
+ noise_tx->ifft(noise_tx->tx_block, noise_tx->temp, data);
+ for (int i = 0; i < n; ++i) {
+ data[i] /= n;
+ }
+}
+
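+// Only the left half of each row (xb <= block_size / 2) is accumulated; for
+// real-valued input the remaining coefficients are conjugate-symmetric and
+// are fully determined by the ones visited here.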
+void aom_noise_tx_add_energy(const struct aom_noise_tx_t *noise_tx,
+ float *psd) {
+ const int block_size = noise_tx->block_size;
+ for (int yb = 0; yb < block_size; ++yb) {
+ for (int xb = 0; xb <= block_size / 2; ++xb) {
+ float *c = noise_tx->tx_block + 2 * (yb * block_size + xb);
+ psd[yb * block_size + xb] += c[0] * c[0] + c[1] * c[1];
+ }
+ }
+}
+
+void aom_noise_tx_free(struct aom_noise_tx_t *noise_tx) {
+ if (!noise_tx) return;
+ aom_free(noise_tx->tx_block);
+ aom_free(noise_tx->temp);
+ aom_free(noise_tx);
+}
double aom_normalized_cross_correlation(const double *a, const double *b,
int n) {
diff --git a/aom_dsp/noise_util.h b/aom_dsp/noise_util.h
index f3366d86b..ea4d9e3de 100644
--- a/aom_dsp/noise_util.h
+++ b/aom_dsp/noise_util.h
@@ -16,6 +16,44 @@
extern "C" {
#endif // __cplusplus
+// aom_noise_tx_t is an abstraction of a transform that is used for denoising.
+// It is meant to be lightweight and holds the transformed data internally,
+// as an opaque type (the user should not manipulate the transformed data
+// directly).
+struct aom_noise_tx_t;
+
+// Allocates and returns an aom_noise_tx_t suitable for denoising blocks of
+// the given block_size. The resulting aom_noise_tx_t should be freed with
+// aom_noise_tx_free.
+struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size);
+void aom_noise_tx_free(struct aom_noise_tx_t *aom_noise_tx);
+
+// Transforms the given data and holds the result in the aom_noise_tx's
+// internal buffer. For compatibility with existing SIMD implementations,
+// "data" must be 32-byte aligned.
+void aom_noise_tx_forward(struct aom_noise_tx_t *aom_noise_tx,
+ const float *data);
+
+// Filters aom_noise_tx's internal data using the provided noise power
+// spectral density. The PSD must contain at least block_size * block_size
+// values and should be populated with a constant or with estimates taken
+// from aom_noise_tx_add_energy.
+void aom_noise_tx_filter(struct aom_noise_tx_t *aom_noise_tx, const float *psd);
+
+// Performs an inverse transform using the internal transform data.
+// For compatibility with existing SIMD implementations, "data" must be 32-byte
+// aligned.
+void aom_noise_tx_inverse(struct aom_noise_tx_t *aom_noise_tx, float *data);
+
+// Aggregates the power of the buffered transform data into the psd buffer.
+void aom_noise_tx_add_energy(const struct aom_noise_tx_t *aom_noise_tx,
+ float *psd);
+
+// Returns a default value suitable for denoising a transform of the given
+// block_size. The noise "factor" determines the strength of the noise to
+// be removed. A value of about 2.5 can be used for moderate denoising,
+// while a value of 5.0 can be used for a high level of denoising.
+float aom_noise_psd_get_default_value(int block_size, float factor);
+
// Computes normalized cross correlation of two vectors a and b of length n.
double aom_normalized_cross_correlation(const double *a, const double *b,
int n);
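
Taken together, the declarations above imply the following call sequence (a
sketch; "block" is assumed to be a 32-byte aligned buffer of
block_size * block_size floats, matching the alignment notes above):

    struct aom_noise_tx_t *tx = aom_noise_tx_malloc(32);
    if (tx) {
      aom_noise_tx_forward(tx, block);     // pixels -> internal freq. data
      aom_noise_tx_filter(tx, noise_psd);  // Wiener shrinkage per coefficient
      aom_noise_tx_inverse(tx, block);     // back to pixels, scaled by 1/n
      aom_noise_tx_free(tx);
    }

This mirrors the inner loop of aom_wiener_denoise_2d in noise_model.c above.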
diff --git a/aom_dsp/x86/fft_avx2.c b/aom_dsp/x86/fft_avx2.c
new file mode 100644
index 000000000..54da02253
--- /dev/null
+++ b/aom_dsp/x86/fft_avx2.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+extern void aom_transpose_float_sse2(const float *A, float *B, int n);
+extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output,
+ int n);
+
+// Generate the 1d forward transforms for float using __m256
+GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps);
+GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps);
+GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps);
+
+void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+void aom_fft16x16_float_avx2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_avx2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_avx2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+// Generate the 1d inverse transforms for float using __m256
+GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps);
+GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps);
+GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps);
+
+void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2,
+ aom_ifft1d_8_avx2, aom_transpose_float_sse2, 8);
+}
+
+void aom_ifft16x16_float_avx2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+ aom_fft1d_16_avx2, aom_ifft1d_16_avx2,
+ aom_transpose_float_sse2, 8);
+}
+
+void aom_ifft32x32_float_avx2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+ aom_fft1d_32_avx2, aom_ifft1d_32_avx2,
+ aom_transpose_float_sse2, 8);
+}
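
A note on the generated symbols: the second macro argument becomes the name
suffix, so GEN_FFT_8(..., avx2, ...) above defines the aom_fft1d_8_avx2 kernel
handed to aom_fft_2d_gen, and the trailing 8 (4 in the SSE2 file below) matches
the number of floats per vector. Judging from the stores in the macro bodies,
each generated 1d kernel is expected to have roughly this shape (an inference,
not quoted from fft_common.h):

    static INLINE void aom_fft1d_8_avx2(const float *input, float *output,
                                        int stride);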
diff --git a/aom_dsp/x86/fft_sse2.c b/aom_dsp/x86/fft_sse2.c
new file mode 100644
index 000000000..12bdc3e18
--- /dev/null
+++ b/aom_dsp/x86/fft_sse2.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <xmmintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+static INLINE void transpose4x4(const float *A, float *B, const int lda,
+ const int ldb) {
+ __m128 row1 = _mm_load_ps(&A[0 * lda]);
+ __m128 row2 = _mm_load_ps(&A[1 * lda]);
+ __m128 row3 = _mm_load_ps(&A[2 * lda]);
+ __m128 row4 = _mm_load_ps(&A[3 * lda]);
+ _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
+ _mm_store_ps(&B[0 * ldb], row1);
+ _mm_store_ps(&B[1 * ldb], row2);
+ _mm_store_ps(&B[2 * ldb], row3);
+ _mm_store_ps(&B[3 * ldb], row4);
+}
+
+void aom_transpose_float_sse2(const float *A, float *B, int n) {
+ for (int y = 0; y < n; y += 4) {
+ for (int x = 0; x < n; x += 4) {
+ transpose4x4(A + y * n + x, B + x * n + y, n, n);
+ }
+ }
+}
+
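+// Expands the packed output of the 2d real-input FFT into interleaved
+// complex (re, im) pairs. Each packed 1d pass stores bin c's real part at
+// index c and its imaginary part at index c + n/2, with purely real DC and
+// Nyquist bins; rows n/2 + 1 .. n - 1 are then recovered from the top half
+// of the spectrum via conjugate symmetry (hence the negated imaginary
+// parts).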
+void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) {
+ const int n2 = n / 2;
+ output[0] = packed[0];
+ output[1] = 0;
+ output[2 * (n2 * n)] = packed[n2 * n];
+ output[2 * (n2 * n) + 1] = 0;
+
+ output[2 * n2] = packed[n2];
+ output[2 * n2 + 1] = 0;
+ output[2 * (n2 * n + n2)] = packed[n2 * n + n2];
+ output[2 * (n2 * n + n2) + 1] = 0;
+
+ for (int c = 1; c < n2; ++c) {
+ output[2 * (0 * n + c)] = packed[c];
+ output[2 * (0 * n + c) + 1] = packed[c + n2];
+ output[2 * (n2 * n + c) + 0] = packed[n2 * n + c];
+ output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2];
+ }
+ for (int r = 1; r < n2; ++r) {
+ output[2 * (r * n + 0)] = packed[r * n];
+ output[2 * (r * n + 0) + 1] = packed[(r + n2) * n];
+ output[2 * (r * n + n2) + 0] = packed[r * n + n2];
+ output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2];
+
+ for (int c = 1; c < AOMMIN(n2, 4); ++c) {
+ output[2 * (r * n + c)] =
+ packed[r * n + c] - packed[(r + n2) * n + c + n2];
+ output[2 * (r * n + c) + 1] =
+ packed[(r + n2) * n + c] + packed[r * n + c + n2];
+ }
+
+ for (int c = 4; c < n2; c += 4) {
+ __m128 real1 = _mm_load_ps(packed + r * n + c);
+ __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2);
+ __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c);
+ __m128 imag2 = _mm_load_ps(packed + r * n + c + n2);
+ real1 = _mm_sub_ps(real1, real2);
+ imag1 = _mm_add_ps(imag1, imag2);
+ _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1));
+ _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1));
+ }
+
+ int r2 = r + n2;
+ int r3 = n - r2;
+ output[2 * (r2 * n + 0)] = packed[r3 * n];
+ output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n];
+ output[2 * (r2 * n + n2)] = packed[r3 * n + n2];
+ output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2];
+ for (int c = 1; c < AOMMIN(4, n2); ++c) {
+ output[2 * (r2 * n + c)] =
+ packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2];
+ output[2 * (r2 * n + c) + 1] =
+ -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2];
+ }
+ for (int c = 4; c < n2; c += 4) {
+ __m128 real1 = _mm_load_ps(packed + r3 * n + c);
+ __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2);
+ __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c);
+ __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2);
+ real1 = _mm_add_ps(real1, real2);
+ imag1 = _mm_sub_ps(imag2, imag1);
+ _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1));
+ _mm_store_ps(output + 2 * (r2 * n + c + 2),
+ _mm_unpackhi_ps(real1, imag1));
+ }
+ }
+}
+
+// Generate definitions for 1d forward transforms using float and __m128
+GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
+GEN_FFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+
+void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+// Generate definitions for 1d inverse transforms using float and __m128
+GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
+GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+
+void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2,
+ aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2,
+ aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+ aom_fft1d_16_sse2, aom_ifft1d_16_sse2,
+ aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+ aom_fft1d_32_sse2, aom_ifft1d_32_sse2,
+ aom_transpose_float_sse2, 4);
+}