author     Neil Birkbeck <birkbeck@google.com>    2018-06-12 08:03:48 -0700
committer  Yaowu Xu <yaowu@google.com>            2018-06-15 17:01:39 +0000
commit     04721792760c4cdf52b39b579391441a0927e349 (patch)
tree       43b59ebde47ccbf66b4463709356cc8f3229805f
parent     58f109babf7af8bfe5f0f9b50ae78da4fe7f809a (diff)
Add 2d wiener denoiser for regrainer

This also adds support for 2d fft (float) to be used for both denoising
and noise power spectral density estimation.

Change-Id: Ie95b44280bb301dfd3f0cf06d139e307d2f4e11b
-rw-r--r--  aom_dsp/aom_dsp.cmake          4
-rwxr-xr-x  aom_dsp/aom_dsp_rtcd_defs.pl   28
-rw-r--r--  aom_dsp/fft.c                  219
-rw-r--r--  aom_dsp/fft_common.h           1050
-rw-r--r--  aom_dsp/noise_model.c          190
-rw-r--r--  aom_dsp/noise_model.h          19
-rw-r--r--  aom_dsp/noise_util.c           111
-rw-r--r--  aom_dsp/noise_util.h           38
-rw-r--r--  aom_dsp/x86/fft_avx2.c         73
-rw-r--r--  aom_dsp/x86/fft_sse2.c         166
-rw-r--r--  test/fft_test.cc               253
-rw-r--r--  test/noise_model_test.cc       230
-rw-r--r--  test/test.cmake                1
13 files changed, 2382 insertions, 0 deletions
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index a49177906..768875f7d 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -26,6 +26,8 @@ list(APPEND AOM_DSP_COMMON_SOURCES
"${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
"${AOM_ROOT}/aom_dsp/entcode.c"
"${AOM_ROOT}/aom_dsp/entcode.h"
+ "${AOM_ROOT}/aom_dsp/fft.c"
+ "${AOM_ROOT}/aom_dsp/fft_common.h"
"${AOM_ROOT}/aom_dsp/intrapred.c"
"${AOM_ROOT}/aom_dsp/intrapred_common.h"
"${AOM_ROOT}/aom_dsp/loopfilter.c"
@@ -54,6 +56,7 @@ list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
"${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
"${AOM_ROOT}/aom_dsp/x86/convolve.h"
"${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c"
@@ -81,6 +84,7 @@ list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
"${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
"${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h"
+ "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c")
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 0b23b954b..38084d4ae 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -484,6 +484,34 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes"){
add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/aom_highbd_fdct8x8 sse2/;
+ # FFT/IFFT (float) only used for denoising (and noise power spectral density estimation)
+ add_proto qw/void aom_fft2x2_float/, "const float *input, float *temp, float *output";
+
+ add_proto qw/void aom_fft4x4_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_fft4x4_float sse2/;
+
+ add_proto qw/void aom_fft8x8_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_fft8x8_float avx2 sse2/;
+
+ add_proto qw/void aom_fft16x16_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_fft16x16_float avx2 sse2/;
+
+ add_proto qw/void aom_fft32x32_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_fft32x32_float avx2 sse2/;
+
+ add_proto qw/void aom_ifft2x2_float/, "const float *input, float *temp, float *output";
+
+ add_proto qw/void aom_ifft4x4_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_ifft4x4_float sse2/;
+
+ add_proto qw/void aom_ifft8x8_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_ifft8x8_float avx2 sse2/;
+
+ add_proto qw/void aom_ifft16x16_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_ifft16x16_float avx2 sse2/;
+
+ add_proto qw/void aom_ifft32x32_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_ifft32x32_float avx2 sse2/;
} # CONFIG_AV1_ENCODER
#
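Each specialize line above makes the generated RTCD header expose the bare
symbol (e.g. aom_fft8x8_float) as an entry point that is bound at init time to
the best implementation the CPU supports, falling back to the _c version. A
minimal self-contained sketch of that mechanism follows; the stub bodies, the
have_sse2/have_avx2 flags, and the setup_rtcd name are illustrative stand-ins,
not the generated code:

#include <stdio.h>

typedef void (*fft_fn)(const float *input, float *temp, float *output);

// Stand-ins for aom_fft8x8_float_c and the SIMD versions added in
// aom_dsp/x86/fft_sse2.c and aom_dsp/x86/fft_avx2.c.
static void fft8x8_c(const float *in, float *tmp, float *out) {
  (void)in; (void)tmp; (void)out;
  puts("generic C path");
}
static void fft8x8_sse2(const float *in, float *tmp, float *out) {
  (void)in; (void)tmp; (void)out;
  puts("SSE2 path");
}
static void fft8x8_avx2(const float *in, float *tmp, float *out) {
  (void)in; (void)tmp; (void)out;
  puts("AVX2 path");
}

// One such pointer exists per specialized prototype; it starts at the C
// version and is upgraded during runtime CPU detection.
static fft_fn aom_fft8x8_float = fft8x8_c;

static void setup_rtcd(int have_sse2, int have_avx2) {
  if (have_sse2) aom_fft8x8_float = fft8x8_sse2;
  if (have_avx2) aom_fft8x8_float = fft8x8_avx2;  // later entries win
}

int main(void) {
  setup_rtcd(/*have_sse2=*/1, /*have_avx2=*/0);
  aom_fft8x8_float(NULL, NULL, NULL);  // dispatches to the SSE2 stub
  return 0;
}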
diff --git a/aom_dsp/fft.c b/aom_dsp/fft.c
new file mode 100644
index 000000000..0ba71cfb3
--- /dev/null
+++ b/aom_dsp/fft.c
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+static INLINE void simple_transpose(const float *A, float *B, int n) {
+ for (int y = 0; y < n; y++) {
+ for (int x = 0; x < n; x++) {
+ B[y * n + x] = A[x * n + y];
+ }
+ }
+}
+
+// The 1d transform is real to complex and packs the complex results in
+// a way to take advantage of conjugate symmetry (i.e., the n/2 + 1 real
+// components, followed by the n/2 - 1 imaginary components). After the
+// transform is done on the rows, the first n/2 + 1 columns are real, and
+// the remaining are the imaginary components. After the transform on the
+// columns, the region of [0, n/2]x[0, n/2] contains the real part of the
+// fft of the real columns. The real part of the 2d fft also includes the
+// imaginary part of the transformed imaginary columns. This function
+// assembles the correct outputs while putting the real and imaginary
+// components next to each other.
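+// Concretely, the unpacked output stores the full n x n complex spectrum as
+// interleaved (real, imaginary) pairs: output[2 * (y * n + x)] holds the real
+// part and output[2 * (y * n + x) + 1] the imaginary part of bin (y, x).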
+static INLINE void unpack_2d_output(const float *col_fft, float *output,
+ int n) {
+ for (int y = 0; y <= n / 2; ++y) {
+ const int y2 = y + n / 2;
+ const int y_extra = y2 > n / 2 && y2 < n;
+
+ for (int x = 0; x <= n / 2; ++x) {
+ const int x2 = x + n / 2;
+ const int x_extra = x2 > n / 2 && x2 < n;
+ output[2 * (y * n + x)] =
+ col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
+ output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) +
+ (x_extra ? col_fft[y * n + x2] : 0);
+ if (y_extra) {
+ output[2 * ((n - y) * n + x)] =
+ col_fft[y * n + x] +
+ (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
+ output[2 * ((n - y) * n + x) + 1] =
+ -(y_extra ? col_fft[y2 * n + x] : 0) +
+ (x_extra ? col_fft[y * n + x2] : 0);
+ }
+ }
+ }
+}
+
+void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
+ aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
+ aom_fft_unpack_func_t unpack, int vec_size) {
+ for (int x = 0; x < n; x += vec_size) {
+ tform(input + x, output + x, n);
+ }
+ transpose(output, temp, n);
+
+ for (int x = 0; x < n; x += vec_size) {
+ tform(temp + x, output + x, n);
+ }
+ transpose(output, temp, n);
+
+ unpack(temp, output, n);
+}
+
+static INLINE void store_float(float *output, float input) { *output = input; }
+static INLINE float add_float(float a, float b) { return a + b; }
+static INLINE float sub_float(float a, float b) { return a - b; }
+static INLINE float mul_float(float a, float b) { return a * b; }
+
+GEN_FFT_2(void, float, float, float, *, store_float);
+GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float,
+ sub_float);
+GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float);
+GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float);
+GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float);
+
+void aom_fft2x2_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+void aom_fft4x4_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+void aom_fft8x8_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+void aom_fft16x16_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+void aom_fft32x32_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
+ aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
+ aom_fft_1d_func_t ifft_multi,
+ aom_fft_transpose_func_t transpose, int vec_size) {
+ // Columns 0 and n/2 have conjugate symmetry, so we can directly do the ifft
+ // and get real outputs.
+ for (int y = 0; y <= n / 2; ++y) {
+ output[y * n] = input[2 * y * n];
+ output[y * n + 1] = input[2 * (y * n + n / 2)];
+ }
+ for (int y = n / 2 + 1; y < n; ++y) {
+ output[y * n] = input[2 * (y - n / 2) * n + 1];
+ output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1];
+ }
+
+ for (int i = 0; i < 2; i += vec_size) {
+ ifft_multi(output + i, temp + i, n);
+ }
+
+ // For the other columns, since we don't have a full ifft for complex
+ // inputs, we have to split them into their real and imaginary counterparts.
+ // Pack the real components, then the imaginary components.
+ for (int y = 0; y < n; ++y) {
+ for (int x = 1; x < n / 2; ++x) {
+ output[y * n + (x + 1)] = input[2 * (y * n + x)];
+ }
+ for (int x = 1; x < n / 2; ++x) {
+ output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1];
+ }
+ }
+ for (int y = 2; y < vec_size; y++) {
+ fft_single(output + y, temp + y, n);
+ }
+ // This is the part that can be sped up with SIMD
+ for (int y = AOMMAX(2, vec_size); y < n; y += vec_size) {
+ fft_multi(output + y, temp + y, n);
+ }
+
+ // Put the 0th and (n/2)th results in the correct place.
+ for (int x = 0; x < n; ++x) {
+ output[x] = temp[x * n];
+ output[(n / 2) * n + x] = temp[x * n + 1];
+ }
+ // This rearranges and transposes.
+ for (int y = 1; y < n / 2; ++y) {
+ // Fill in the real columns
+ for (int x = 0; x <= n / 2; ++x) {
+ output[x + y * n] =
+ temp[(y + 1) + x * n] +
+ ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0);
+ }
+ for (int x = n / 2 + 1; x < n; ++x) {
+ output[x + y * n] = temp[(y + 1) + (n - x) * n] -
+ temp[(y + n / 2) + ((n - x) + n / 2) * n];
+ }
+ // Fill in the imag columns
+ for (int x = 0; x <= n / 2; ++x) {
+ output[x + (y + n / 2) * n] =
+ temp[(y + n / 2) + x * n] -
+ ((x > 0 && x < n / 2) ? temp[(y + 1) + (x + n / 2) * n] : 0);
+ }
+ for (int x = n / 2 + 1; x < n; ++x) {
+ output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] +
+ temp[(y + n / 2) + (n - x) * n];
+ }
+ }
+ for (int y = 0; y < n; y += vec_size) {
+ ifft_multi(output + y, temp + y, n);
+ }
+ transpose(temp, output, n);
+}
+
+GEN_IFFT_2(void, float, float, float, *, store_float);
+GEN_IFFT_4(void, float, float, float, *, store_float, (float), add_float,
+ sub_float);
+GEN_IFFT_8(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float);
+GEN_IFFT_16(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float);
+GEN_IFFT_32(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float);
+
+void aom_ifft2x2_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, aom_fft1d_2_float,
+ aom_ifft1d_2_float, simple_transpose, 1);
+}
+
+void aom_ifft4x4_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_float,
+ aom_ifft1d_4_float, simple_transpose, 1);
+}
+
+void aom_ifft8x8_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_float,
+ aom_ifft1d_8_float, simple_transpose, 1);
+}
+
+void aom_ifft16x16_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+ aom_fft1d_16_float, aom_ifft1d_16_float, simple_transpose, 1);
+}
+
+void aom_ifft32x32_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+ aom_fft1d_32_float, aom_ifft1d_32_float, simple_transpose, 1);
+}
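For reference, a minimal standalone driver (not part of the patch) for the C
entry points above. The buffer sizes and the interleaved output layout follow
from the definitions in this file; main(), the impulse input, and the n * n
round-trip scale noted in the comments are illustrative assumptions:

#include <stdio.h>

// Prototypes as defined in aom_dsp/fft.c above (normally pulled in via the
// generated rtcd header).
void aom_fft4x4_float_c(const float *input, float *temp, float *output);
void aom_ifft4x4_float_c(const float *input, float *temp, float *output);

int main(void) {
  float input[4 * 4] = { 0 };
  input[0] = 1.0f;  // unit impulse at (0, 0)
  float temp[2 * 4 * 4];  // the inverse needs a 2 x n x n scratch buffer
  float fft[2 * 4 * 4];   // interleaved (real, imag) pairs
  float out[4 * 4];

  aom_fft4x4_float_c(input, temp, fft);
  // An impulse has a flat (unnormalized) spectrum: every pair is (1, 0).
  for (int i = 0; i < 4 * 4; ++i)
    printf("bin %2d: %+.1f %+.1fi\n", i, fft[2 * i], fft[2 * i + 1]);

  // The inverse is unnormalized, so a round trip scales by n * n = 16.
  aom_ifft4x4_float_c(fft, temp, out);
  printf("round trip at (0, 0): %.1f (= 16 * 1.0)\n", out[0]);
  return 0;
}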
diff --git a/aom_dsp/fft_common.h b/aom_dsp/fft_common.h
new file mode 100644
index 000000000..2f3cd5fdc
--- /dev/null
+++ b/aom_dsp/fft_common.h
@@ -0,0 +1,1050 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_FFT_COMMON_H_
+#define AOM_DSP_FFT_COMMON_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief A function pointer for computing 1d fft and ifft.
+ *
+ * The function will point to an implementation for a specific transform size,
+ * and may perform the transforms using vectorized instructions.
+ *
+ * For a non-vectorized forward transform of size n, the input and output
+ * buffers will be size n. The output takes advantage of conjugate symmetry and
+ * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where
+ * (r_{j}, i_{j}) is the complex output for index j.
+ *
+ * An inverse transform will assume that the complex "input" is packed
+ * similarly. Its output will be real.
+ *
+ * Non-vectorized transforms (e.g., on a single row) would use a stride of 1.
+ *
+ * Vectorized implementations are parallelized along the columns so that the fft
+ * can be performed on multiple columns at a time. In such cases the data block
+ * for input and output is typically square (n x n) and the stride will
+ * correspond to the spacing between rows. At minimum, the input size must be
+ * n x simd_vector_length.
+ *
+ * \param[in] input Input buffer. See above for size restrictions.
+ * \param[out] output Output buffer. See above for size restrictions.
+ * \param[in] stride The spacing in number of elements between rows
+ * (or between consecutive elements, for a non-vectorized transform)
+ */
+typedef void (*aom_fft_1d_func_t)(const float *input, float *output,
+ int stride);
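+
+// For example, for n = 4 a forward transform of a real row packs the complex
+// result as [r_0, r_1, r_2, i_1]; i_0 and i_2 are zero for real input by
+// conjugate symmetry, so they are not stored.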
+
+// Declare some of the forward non-vectorized transforms which are used in some
+// of the vectorized implementations
+void aom_fft1d_4_float(const float *input, float *output, int stride);
+void aom_fft1d_8_float(const float *input, float *output, int stride);
+void aom_fft1d_16_float(const float *input, float *output, int stride);
+void aom_fft1d_32_float(const float *input, float *output, int stride);
+
+/*!\brief Function pointer for transposing a matrix of floats.
+ *
+ * \param[in] input Input buffer (size n x n)
+ * \param[out] output Output buffer (size n x n)
+ * \param[in] n Extent of one dimension of the square matrix.
+ */
+typedef void (*aom_fft_transpose_func_t)(const float *input, float *output,
+ int n);
+
+/*!\brief Function pointer for re-arranging intermediate 2d transform results.
+ *
+ * After re-arrangement, the real and imaginary components will be packed
+ * tightly next to each other.
+ *
+ * \param[in] input Input buffer (size n x n)
+ * \param[out] output Output buffer (size 2 x n x n)
+ * \param[in] n Extent of one dimension of the square matrix.
+ */
+typedef void (*aom_fft_unpack_func_t)(const float *input, float *output, int n);
+
+/*!\brief Performs a 2d fft with the given functions.
+ *
+ * This generator function allows for multiple different implementations of 2d
+ * fft with different vector operations, without having to redefine the main
+ * body multiple times.
+ *
+ * \param[in] input Input buffer to run the transform on (size n x n)
+ * \param[out] temp Working buffer for computing the transform (size n x n)
+ * \param[out] output Output buffer (size 2 x n x n)
+ * \param[in] n Extent of one dimension of the square input block
+ * \param[in] tform Forward transform function
+ * \param[in] transpose Transpose function (for n x n matrix)
+ * \param[in] unpack Unpack function used to massage outputs to correct form
+ * \param[in] vec_size Vector size (the transform is done vec_size units at
+ * a time)
+ */
+void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
+ aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
+ aom_fft_unpack_func_t unpack, int vec_size);
+
+/*!\brief Perform a 2d inverse fft with the given helper functions
+ *
+ * \param[in] input Input buffer to run the transform on (size 2 x n x n)
+ * \param[out] temp Working buffer for computations (size 2 x n x n)
+ * \param[out] output Output buffer (size n x n)
+ * \param[in] n Extent of one dimension of the square input block
+ * \param[in] fft_single Forward transform function (non-vectorized)
+ * \param[in] fft_multi Forward transform function (vectorized)
+ * \param[in] ifft_multi Inverse transform function (vectorized)
+ * \param[in] transpose Transpose function (for n x n matrix)
+ * \param[in] vec_size Vector size (the transform is done vec_size
+ * units at a time)
+ */
+void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
+ aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
+ aom_fft_1d_func_t ifft_multi,
+ aom_fft_transpose_func_t transpose, int vec_size);
+#ifdef __cplusplus
+}
+#endif
+
+// The macros below define 1D fft/ifft for different data types and for
+// different simd vector intrinsic types.
+
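+// For example, aom_dsp/fft.c above instantiates the scalar float variants
+// with plain C helpers:
+//   GEN_FFT_4(void, float, float, float, *, store_float, (float),
+//             add_float, sub_float);
+// which defines aom_fft1d_4_float(const float *input, float *output,
+// int stride).
+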
+#define GEN_FFT_2(ret, suffix, T, T_VEC, load, store) \
+ ret aom_fft1d_2_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ store(output + 0 * stride, i0 + i1); \
+ store(output + 1 * stride, i0 - i1); \
+ }
+
+#define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
+ ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC w0 = add(i0, i2); \
+ const T_VEC w1 = sub(i0, i2); \
+ const T_VEC w2 = add(i1, i3); \
+ const T_VEC w3 = sub(i1, i3); \
+ store(output + 0 * stride, add(w0, w2)); \
+ store(output + 1 * stride, w1); \
+ store(output + 2 * stride, sub(w0, w2)); \
+ store(output + 3 * stride, sub(kWeight0, w3)); \
+ }
+
+#define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \
+ ret aom_fft1d_8_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC w0 = add(i0, i4); \
+ const T_VEC w1 = sub(i0, i4); \
+ const T_VEC w2 = add(i2, i6); \
+ const T_VEC w3 = sub(i2, i6); \
+ const T_VEC w4 = add(w0, w2); \
+ const T_VEC w5 = sub(w0, w2); \
+ const T_VEC w7 = add(i1, i5); \
+ const T_VEC w8 = sub(i1, i5); \
+ const T_VEC w9 = add(i3, i7); \
+ const T_VEC w10 = sub(i3, i7); \
+ const T_VEC w11 = add(w7, w9); \
+ const T_VEC w12 = sub(w7, w9); \
+ store(output + 0 * stride, add(w4, w11)); \
+ store(output + 1 * stride, add(w1, mul(kWeight2, sub(w8, w10)))); \
+ store(output + 2 * stride, w5); \
+ store(output + 3 * stride, sub(w1, mul(kWeight2, sub(w8, w10)))); \
+ store(output + 4 * stride, sub(w4, w11)); \
+ store(output + 5 * stride, \
+ sub(sub(kWeight0, w3), mul(kWeight2, add(w10, w8)))); \
+ store(output + 6 * stride, sub(kWeight0, w12)); \
+ store(output + 7 * stride, sub(w3, mul(kWeight2, add(w10, w8)))); \
+ }
+
+#define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) \
+ ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC kWeight3 = constant(0.92388f); \
+ const T_VEC kWeight4 = constant(0.382683f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC i8 = load(input + 8 * stride); \
+ const T_VEC i9 = load(input + 9 * stride); \
+ const T_VEC i10 = load(input + 10 * stride); \
+ const T_VEC i11 = load(input + 11 * stride); \
+ const T_VEC i12 = load(input + 12 * stride); \
+ const T_VEC i13 = load(input + 13 * stride); \
+ const T_VEC i14 = load(input + 14 * stride); \
+ const T_VEC i15 = load(input + 15 * stride); \
+ const T_VEC w0 = add(i0, i8); \
+ const T_VEC w1 = sub(i0, i8); \
+ const T_VEC w2 = add(i4, i12); \
+ const T_VEC w3 = sub(i4, i12); \
+ const T_VEC w4 = add(w0, w2); \
+ const T_VEC w5 = sub(w0, w2); \
+ const T_VEC w7 = add(i2, i10); \
+ const T_VEC w8 = sub(i2, i10); \
+ const T_VEC w9 = add(i6, i14); \
+ const T_VEC w10 = sub(i6, i14); \
+ const T_VEC w11 = add(w7, w9); \
+ const T_VEC w12 = sub(w7, w9); \
+ const T_VEC w14 = add(w4, w11); \
+ const T_VEC w15 = sub(w4, w11); \
+ const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \
+ sub(sub(kWeight0, w3), \
+ mul(kWeight2, add(w10, w8))) }; \
+ const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \
+ sub(w3, mul(kWeight2, add(w10, w8))) }; \
+ const T_VEC w19 = add(i1, i9); \
+ const T_VEC w20 = sub(i1, i9); \
+ const T_VEC w21 = add(i5, i13); \
+ const T_VEC w22 = sub(i5, i13); \
+ const T_VEC w23 = add(w19, w21); \
+ const T_VEC w24 = sub(w19, w21); \
+ const T_VEC w26 = add(i3, i11); \
+ const T_VEC w27 = sub(i3, i11); \
+ const T_VEC w28 = add(i7, i15); \
+ const T_VEC w29 = sub(i7, i15); \
+ const T_VEC w30 = add(w26, w28); \
+ const T_VEC w31 = sub(w26, w28); \
+ const T_VEC w33 = add(w23, w30); \
+ const T_VEC w34 = sub(w23, w30); \
+ const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \
+ sub(sub(kWeight0, w22), \
+ mul(kWeight2, add(w29, w27))) }; \
+ const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \
+ sub(w22, mul(kWeight2, add(w29, w27))) }; \
+ store(output + 0 * stride, add(w14, w33)); \
+ store(output + 1 * stride, \
+ add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1])))); \
+ store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31)))); \
+ store(output + 3 * stride, \
+ add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1])))); \
+ store(output + 4 * stride, w15); \
+ store(output + 5 * stride, \
+ add(w18[0], sub(sub(kWeight0, mul(kWeight4, w37[0])), \
+ mul(kWeight3, w37[1])))); \
+ store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31)))); \
+ store(output + 7 * stride, \
+ add(w16[0], sub(sub(kWeight0, mul(kWeight3, w35[0])), \
+ mul(kWeight4, w35[1])))); \
+ store(output + 8 * stride, sub(w14, w33)); \
+ store(output + 9 * stride, \
+ add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))); \
+ store(output + 10 * stride, \
+ sub(sub(kWeight0, w12), mul(kWeight2, add(w31, w24)))); \
+ store(output + 11 * stride, \
+ add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))); \
+ store(output + 12 * stride, sub(kWeight0, w34)); \
+ store(output + 13 * stride, \
+ sub(sub(kWeight0, w18[1]), \
+ sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1])))); \
+ store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24)))); \
+ store(output + 15 * stride, \
+ sub(sub(kWeight0, w16[1]), \
+ sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1])))); \
+ }
+
+#define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) \
+ ret aom_fft1d_32_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC kWeight3 = constant(0.92388f); \
+ const T_VEC kWeight4 = constant(0.382683f); \
+ const T_VEC kWeight5 = constant(0.980785f); \
+ const T_VEC kWeight6 = constant(0.19509f); \
+ const T_VEC kWeight7 = constant(0.83147f); \
+ const T_VEC kWeight8 = constant(0.55557f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC i8 = load(input + 8 * stride); \
+ const T_VEC i9 = load(input + 9 * stride); \
+ const T_VEC i10 = load(input + 10 * stride); \
+ const T_VEC i11 = load(input + 11 * stride); \
+ const T_VEC i12 = load(input + 12 * stride); \
+ const T_VEC i13 = load(input + 13 * stride); \
+ const T_VEC i14 = load(input + 14 * stride); \
+ const T_VEC i15 = load(input + 15 * stride); \
+ const T_VEC i16 = load(input + 16 * stride); \
+ const T_VEC i17 = load(input + 17 * stride); \
+ const T_VEC i18 = load(input + 18 * stride); \
+ const T_VEC i19 = load(input + 19 * stride); \
+ const T_VEC i20 = load(input + 20 * stride); \
+ const T_VEC i21 = load(input + 21 * stride); \
+ const T_VEC i22 = load(input + 22 * stride); \
+ const T_VEC i23 = load(input + 23 * stride); \
+ const T_VEC i24 = load(input + 24 * stride); \
+ const T_VEC i25 = load(input + 25 * stride); \
+ const T_VEC i26 = load(input + 26 * stride); \
+ const T_VEC i27 = load(input + 27 * stride); \
+ const T_VEC i28 = load(input + 28 * stride); \
+ const T_VEC i29 = load(input + 29 * stride); \
+ const T_VEC i30 = load(input + 30 * stride); \
+ const T_VEC i31 = load(input + 31 * stride); \
+ const T_VEC w0 = add(i0, i16); \
+ const T_VEC w1 = sub(i0, i16); \
+ const T_VEC w2 = add(i8, i24); \
+ const T_VEC w3 = sub(i8, i24); \
+ const T_VEC w4 = add(w0, w2); \
+ const T_VEC w5 = sub(w0, w2); \
+ const T_VEC w7 = add(i4, i20); \
+ const T_VEC w8 = sub(i4, i20); \
+ const T_VEC w9 = add(i12, i28); \
+ const T_VEC w10 = sub(i12, i28); \
+ const T_VEC w11 = add(w7, w9); \
+ const T_VEC w12 = sub(w7, w9); \
+ const T_VEC w14 = add(w4, w11); \
+ const T_VEC w15 = sub(w4, w11); \
+ const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \
+ sub(sub(kWeight0, w3), \
+ mul(kWeight2, add(w10, w8))) }; \
+ const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \
+ sub(w3, mul(kWeight2, add(w10, w8))) }; \
+ const T_VEC w19 = add(i2, i18); \
+ const T_VEC w20 = sub(i2, i18); \
+ const T_VEC w21 = add(i10, i26); \
+ const T_VEC w22 = sub(i10, i26); \
+ const T_VEC w23 = add(w19, w21); \
+ const T_VEC w24 = sub(w19, w21); \
+ const T_VEC w26 = add(i6, i22); \
+ const T_VEC w27 = sub(i6, i22); \
+ const T_VEC w28 = add(i14, i30); \
+ const T_VEC w29 = sub(i14, i30); \
+ const T_VEC w30 = add(w26, w28); \
+ const T_VEC w31 = sub(w26, w28); \
+ const T_VEC w33 = add(w23, w30); \
+ const T_VEC w34 = sub(w23, w30); \
+ const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \
+ sub(sub(kWeight0, w22), \
+ mul(kWeight2, add(w29, w27))) }; \
+ const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \
+ sub(w22, mul(kWeight2, add(w29, w27))) }; \
+ const T_VEC w38 = add(w14, w33); \
+ const T_VEC w39 = sub(w14, w33); \
+ const T_VEC w40[2] = { \
+ add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1]))), \
+ add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0]))) \
+ }; \
+ const T_VEC w41[2] = { add(w5, mul(kWeight2, sub(w24, w31))), \
+ sub(sub(kWeight0, w12), \
+ mul(kWeight2, add(w31, w24))) }; \
+ const T_VEC w42[2] = { \
+ add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1]))), \
+ add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0]))) \
+ }; \
+ const T_VEC w44[2] = { \
+ add(w18[0], \
+ sub(sub(kWeight0, mul(kWeight4, w37[0])), mul(kWeight3, w37[1]))), \
+ sub(sub(kWeight0, w18[1]), \
+ sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))) \
+ }; \
+ const T_VEC w45[2] = { sub(w5, mul(kWeight2, sub(w24, w31))), \
+ sub(w12, mul(kWeight2, add(w31, w24))) }; \
+ const T_VEC w46[2] = { \
+ add(w16[0], \
+ sub(sub(kWeight0, mul(kWeight3, w35[0])), mul(kWeight4, w35[1]))), \
+ sub(sub(kWeight0, w16[1]), \
+ sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))) \
+ }; \
+ const T_VEC w47 = add(i1, i17); \
+ const T_VEC w48 = sub(i1, i17); \
+ const T_VEC w49 = add(i9, i25); \
+ const T_VEC w50 = sub(i9, i25); \
+ const T_VEC w51 = add(w47, w49); \
+ const T_VEC w52 = sub(w47, w49); \
+ const T_VEC w54 = add(i5, i21); \
+ const T_VEC w55 = sub(i5, i21); \
+ const T_VEC w56 = add(i13, i29); \
+ const T_VEC w57 = sub(i13, i29); \
+ const T_VEC w58 = add(w54, w56); \
+ const T_VEC w59 = sub(w54, w56); \
+ const T_VEC w61 = add(w51, w58); \
+ const T_VEC w62 = sub(w51, w58); \
+ const T_VEC w63[2] = { add(w48, mul(kWeight2, sub(w55, w57))), \
+ sub(sub(kWeight0, w50), \
+ mul(kWeight2, add(w57, w55))) }; \
+ const T_VEC w65[2] = { sub(w48, mul(kWeight2, sub(w55, w57))), \
+ sub(w50, mul(kWeight2, add(w57, w55))) }; \
+ const T_VEC w66 = add(i3, i19); \
+ const T_VEC w67 = sub(i3, i19); \
+ const T_VEC w68 = add(i11, i27); \
+ const T_VEC w69 = sub(i11, i27); \
+ const T_VEC w70 = add(w66, w68); \
+ const T_VEC w71 = sub(w66, w68); \
+ const T_VEC w73 = add(i7, i23); \
+ const T_VEC w74 = sub(i7, i23); \
+ const T_VEC w75 = add(i15, i31); \
+ const T_VEC w76 = sub(i15, i31); \
+ const T_VEC w77 = add(w73, w75); \
+ const T_VEC w78 = sub(w73, w75); \
+ const T_VEC w80 = add(w70, w77); \
+ const T_VEC w81 = sub(w70, w77); \
+ const T_VEC w82[2] = { add(w67, mul(kWeight2, sub(w74, w76))), \
+ sub(sub(kWeight0, w69), \
+ mul(kWeight2, add(w76, w74))) }; \
+ const T_VEC w84[2] = { sub(w67, mul(kWeight2, sub(w74, w76))), \
+ sub(w69, mul(kWeight2, add(w76, w74))) }; \
+ const T_VEC w85 = add(w61, w80); \
+ const T_VEC w86 = sub(w61, w80); \
+ const T_VEC w87[2] = { \
+ add(w63[0], add(mul(kWeight3, w82[0]), mul(kWeight4, w82[1]))), \
+ add(w63[1], sub(mul(kWeight3, w82[1]), mul(kWeight4, w82[0]))) \
+ }; \
+ const T_VEC w88[2] = { add(w52, mul(kWeight2, sub(w71, w78))), \
+ sub(sub(kWeight0, w59), \
+ mul(kWeight2, add(w78, w71))) }; \
+ const T_VEC w89[2] = { \
+ add(w65[0], add(mul(kWeight4, w84[0]), mul(kWeight3, w84[1]))), \
+ add(w65[1], sub(mul(kWeight4, w84[1]), mul(kWeight3, w84[0]))) \
+ }; \
+ const T_VEC w91[2] = { \
+ add(w65[0], \
+ sub(sub(kWeight0, mul(kWeight4, w84[0])), mul(kWeight3, w84[1]))), \
+ sub(sub(kWeight0, w65[1]), \
+ sub(mul(kWeight3, w84[0]), mul(kWeight4, w84[1]))) \
+ }; \
+ const T_VEC w92[2] = { sub(w52, mul(kWeight2, sub(w71, w78))), \
+ sub(w59, mul(kWeight2, add(w78, w71))) }; \
+ const T_VEC w93[2] = { \
+ add(w63[0], \
+ sub(sub(kWeight0, mul(kWeight3, w82[0])), mul(kWeight4, w82[1]))), \
+ sub(sub(kWeight0, w63[1]), \
+ sub(mul(kWeight4, w82[0]), mul(kWeight3, w82[1]))) \
+ }; \
+ store(output + 0 * stride, add(w38, w85)); \
+ store(output + 1 * stride, \
+ add(w40[0], add(mul(kWeight5, w87[0]), mul(kWeight6, w87[1])))); \
+ store(output + 2 * stride, \
+ add(w41[0], add(mul(kWeight3, w88[0]), mul(kWeight4, w88[1])))); \
+ store(output + 3 * stride, \
+ add(w42[0], add(mul(kWeight7, w89[0]), mul(kWeight8, w89[1])))); \
+ store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81)))); \
+ store(output + 5 * stride, \
+ add(w44[0], add(mul(kWeight8, w91[0]), mul(kWeight7, w91[1])))); \
+ store(output + 6 * stride, \
+ add(w45[0], add(mul(kWeight4, w92[0]), mul(kWeight3, w92[1])))); \
+ store(output + 7 * stride, \
+ add(w46[0], add(mul(kWeight6, w93[0]), mul(kWeight5, w93[1])))); \
+ store(output + 8 * stride, w39); \
+ store(output + 9 * stride, \
+ add(w46[0], sub(sub(kWeight0, mul(kWeight6, w93[0])), \
+ mul(kWeight5, w93[1])))); \
+ store(output + 10 * stride, \
+ add(w45[0], sub(sub(kWeight0, mul(kWeight4, w92[0])), \
+ mul(kWeight3, w92[1])))); \
+ store(output + 11 * stride, \
+ add(w44[0], sub(sub(kWeight0, mul(kWeight8, w91[0])), \
+ mul(kWeight7, w91[1])))); \
+ store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81)))); \
+ store(output + 13 * stride, \
+ add(w42[0], sub(sub(kWeight0, mul(kWeight7, w89[0])), \
+ mul(kWeight8, w89[1])))); \
+ store(output + 14 * stride, \
+ add(w41[0], sub(sub(kWeight0, mul(kWeight3, w88[0])), \
+ mul(kWeight4, w88[1])))); \
+ store(output + 15 * stride, \
+ add(w40[0], sub(sub(kWeight0, mul(kWeight5, w87[0])), \
+ mul(kWeight6, w87[1])))); \
+ store(output + 16 * stride, sub(w38, w85)); \
+ store(output + 17 * stride, \
+ add(w40[1], sub(mul(kWeight5, w87[1]), mul(kWeight6, w87[0])))); \
+ store(output + 18 * stride, \
+ add(w41[1], sub(mul(kWeight3, w88[1]), mul(kWeight4, w88[0])))); \
+ store(output + 19 * stride, \
+ add(w42[1], sub(mul(kWeight7, w89[1]), mul(kWeight8, w89[0])))); \
+ store(output + 20 * stride, \
+ sub(sub(kWeight0, w34), mul(kWeight2, add(w81, w62)))); \
+ store(output + 21 * stride, \
+ add(w44[1], sub(mul(kWeight8, w91[1]), mul(kWeight7, w91[0])))); \
+ store(output + 22 * stride, \
+ add(w45[1], sub(mul(kWeight4, w92[1]), mul(kWeight3, w92[0])))); \
+ store(output + 23 * stride, \
+ add(w46[1], sub(mul(kWeight6, w93[1]), mul(kWeight5, w93[0])))); \
+ store(output + 24 * stride, sub(kWeight0, w86)); \
+ store(output + 25 * stride, \
+ sub(sub(kWeight0, w46[1]), \
+ sub(mul(kWeight5, w93[0]), mul(kWeight6, w93[1])))); \
+ store(output + 26 * stride, \
+ sub(sub(kWeight0, w45[1]), \
+ sub(mul(kWeight3, w92[0]), mul(kWeight4, w92[1])))); \
+ store(output + 27 * stride, \
+ sub(sub(kWeight0, w44[1]), \
+ sub(mul(kWeight7, w91[0]), mul(kWeight8, w91[1])))); \
+ store(output + 28 * stride, sub(w34, mul(kWeight2, add(w81, w62)))); \
+ store(output + 29 * stride, \
+ sub(sub(kWeight0, w42[1]), \
+ sub(mul(kWeight8, w89[0]), mul(kWeight7, w89[1])))); \
+ store(output + 30 * stride, \
+ sub(sub(kWeight0, w41[1]), \
+ sub(mul(kWeight4, w88[0]), mul(kWeight3, w88[1])))); \
+ store(output + 31 * stride, \
+ sub(sub(kWeight0, w40[1]), \
+ sub(mul(kWeight6, w87[0]), mul(kWeight5, w87[1])))); \
+ }
+
+#define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store) \
+ ret aom_ifft1d_2_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ store(output + 0 * stride, i0 + i1); \
+ store(output + 1 * stride, i0 - i1); \
+ }
+
+#define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
+ ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC w2 = add(i0, i2); \
+ const T_VEC w3 = sub(i0, i2); \
+ const T_VEC w4[2] = { add(i1, i1), sub(i3, i3) }; \
+ const T_VEC w5[2] = { sub(i1, i1), sub(sub(kWeight0, i3), i3) }; \
+ store(output + 0 * stride, add(w2, w4[0])); \
+ store(output + 1 * stride, add(w3, w5[1])); \
+ store(output + 2 * stride, sub(w2, w4[0])); \
+ store(output + 3 * stride, sub(w3, w5[1])); \
+ }
+
+#define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) \
+ ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC w6 = add(i0, i4); \
+ const T_VEC w7 = sub(i0, i4); \
+ const T_VEC w8[2] = { add(i2, i2), sub(i6, i6) }; \
+ const T_VEC w9[2] = { sub(i2, i2), sub(sub(kWeight0, i6), i6) }; \
+ const T_VEC w10[2] = { add(w6, w8[0]), w8[1] }; \
+ const T_VEC w11[2] = { sub(w6, w8[0]), sub(kWeight0, w8[1]) }; \
+ const T_VEC w12[2] = { add(w7, w9[1]), sub(kWeight0, w9[0]) }; \
+ const T_VEC w13[2] = { sub(w7, w9[1]), w9[0] }; \
+ const T_VEC w14[2] = { add(i1, i3), sub(i7, i5) }; \
+ const T_VEC w15[2] = { sub(i1, i3), sub(sub(kWeight0, i5), i7) }; \
+ const T_VEC w16[2] = { add(i3, i1), sub(i5, i7) }; \
+ const T_VEC w17[2] = { sub(i3, i1), sub(sub(kWeight0, i7), i5) }; \
+ const T_VEC w18[2] = { add(w14[0], w16[0]), add(w14[1], w16[1]) }; \
+ const T_VEC w19[2] = { sub(w14[0], w16[0]), sub(w14[1], w16[1]) }; \
+ const T_VEC w20[2] = { add(w15[0], w17[1]), sub(w15[1], w17[0]) }; \
+ const T_VEC w21[2] = { sub(w15[0], w17[1]), add(w15[1], w17[0]) }; \
+ store(output + 0 * stride, add(w10[0], w18[0])); \
+ store(output + 1 * stride, \
+ add(w12[0], mul(kWeight2, add(w20[0], w20[1])))); \
+ store(output + 2 * stride, add(w11[0], w19[1])); \
+ store(output + 3 * stride, \
+ sub(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \
+ store(output + 4 * stride, sub(w10[0], w18[0])); \
+ store(output + 5 * stride, \
+ add(w12[0], sub(sub(kWeight0, mul(kWeight2, w20[0])), \
+ mul(kWeight2, w20[1])))); \
+ store(output + 6 * stride, sub(w11[0], w19[1])); \
+ store(output + 7 * stride, \
+ add(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \
+ }
+
+#define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) \
+ ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC kWeight3 = constant(0.92388f); \
+ const T_VEC kWeight4 = constant(0.382683f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC i8 = load(input + 8 * stride); \
+ const T_VEC i9 = load(input + 9 * stride); \
+ const T_VEC i10 = load(input + 10 * stride); \
+ const T_VEC i11 = load(input + 11 * stride); \
+ const T_VEC i12 = load(input + 12 * stride); \
+ const T_VEC i13 = load(input + 13 * stride); \
+ const T_VEC i14 = load(input + 14 * stride); \
+ const T_VEC i15 = load(input + 15 * stride); \
+ const T_VEC w14 = add(i0, i8); \
+ const T_VEC w15 = sub(i0, i8); \
+ const T_VEC w16[2] = { add(i4, i4), sub(i12, i12) }; \
+ const T_VEC w17[2] = { sub(i4, i4), sub(sub(kWeight0, i12), i12) }; \
+ const T_VEC w18[2] = { add(w14, w16[0]), w16[1] }; \
+ const T_VEC w19[2] = { sub(w14, w16[0]), sub(kWeight0, w16[1]) }; \
+ const T_VEC w20[2] = { add(w15, w17[1]), sub(kWeight0, w17[0]) }; \
+ const T_VEC w21[2] = { sub(w15, w17[1]), w17[0] }; \
+ const T_VEC w22[2] = { add(i2, i6), sub(i14, i10) }; \
+ const T_VEC w23[2] = { sub(i2, i6), sub(sub(kWeight0, i10), i14) }; \
+ const T_VEC w24[2] = { add(i6, i2), sub(i10, i14) }; \
+ const T_VEC w25[2] = { sub(i6, i2), sub(sub(kWeight0, i14), i10) }; \
+ const T_VEC w26[2] = { add(w22[0], w24[0]), add(w22[1], w24[1]) }; \
+ const T_VEC w27[2] = { sub(w22[0], w24[0]), sub(w22[1], w24[1]) }; \
+ const T_VEC w28[2] = { add(w23[0], w25[1]), sub(w23[1], w25[0]) }; \
+ const T_VEC w29[2] = { sub(w23[0], w25[1]), add(w23[1], w25[0]) }; \
+ const T_VEC w30[2] = { add(w18[0], w26[0]), add(w18[1], w26[1]) }; \
+ const T_VEC w31[2] = { sub(w18[0], w26[0]), sub(w18[1], w26[1]) }; \
+ const T_VEC w32[2] = { add(w20[0], mul(kWeight2, add(w28[0], w28[1]))), \
+ add(w20[1], mul(kWeight2, sub(w28[1], w28[0]))) }; \
+ const T_VEC w33[2] = { add(w20[0], \
+ sub(sub(kWeight0, mul(kWeight2, w28[0])), \
+ mul(kWeight2, w28[1]))), \
+ add(w20[1], mul(kWeight2, sub(w28[0], w28[1]))) }; \
+ const T_VEC w34[2] = { add(w19[0], w27[1]), sub(w19[1], w27[0]) }; \
+ const T_VEC w35[2] = { sub(w19[0], w27[1]), add(w19[1], w27[0]) }; \
+ const T_VEC w36[2] = { sub(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \
+ sub(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
+ const T_VEC w37[2] = { add(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \
+ add(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
+ const T_VEC w38[2] = { add(i1, i7), sub(i15, i9) }; \
+ const T_VEC w39[2] = { sub(i1, i7), sub(sub(kWeight0, i9), i15) }; \
+ const T_VEC w40[2] = { add(i5, i3), sub(i11, i13) }; \
+ const T_VEC w41[2] = { sub(i5, i3), sub(sub(kWeight0, i13), i11) }; \
+ const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \
+ const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \
+ const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \
+ const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \
+ const T_VEC w46[2] = { add(i3, i5), sub(i13, i11) }; \
+ const T_VEC w47[2] = { sub(i3, i5), sub(sub(kWeight0, i11), i13) }; \
+ const T_VEC w48[2] = { add(i7, i1), sub(i9, i15) }; \
+ const T_VEC w49[2] = { sub(i7, i1), sub(sub(kWeight0, i15), i9) }; \
+ const T_VEC w50[2] = { add(w46[0], w48[0]), add(w46[1], w48[1]) }; \
+ const T_VEC w51[2] = { sub(w46[0], w48[0]), sub(w46[1], w48[1]) }; \
+ const T_VEC w52[2] = { add(w47[0], w49[1]), sub(w47[1], w49[0]) }; \
+ const T_VEC w53[2] = { sub(w47[0], w49[1]), add(w47[1], w49[0]) }; \
+ const T_VEC w54[2] = { add(w42[0], w50[0]), add(w42[1], w50[1]) }; \
+ const T_VEC w55[2] = { sub(w42[0], w50[0]), sub(w42[1], w50[1]) }; \
+ const T_VEC w56[2] = { add(w44[0], mul(kWeight2, add(w52[0], w52[1]))), \
+ add(w44[1], mul(kWeight2, sub(w52[1], w52[0]))) }; \
+ const T_VEC w57[2] = { add(w44[0], \
+ sub(sub(kWeight0, mul(kWeight2, w52[0])), \
+ mul(kWeight2, w52[1]))), \
+ add(w44[1], mul(kWeight2, sub(w52[0], w52[1]))) }; \
+ const T_VEC w58[2] = { add(w43[0], w51[1]), sub(w43[1], w51[0]) }; \
+ const T_VEC w59[2] = { sub(w43[0], w51[1]), add(w43[1], w51[0]) }; \
+ const T_VEC w60[2] = { sub(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \
+ sub(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
+ const T_VEC w61[2] = { add(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \
+ add(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
+ store(output + 0 * stride, add(w30[0], w54[0])); \
+ store(output + 1 * stride, \
+ add(w32[0], add(mul(kWeight3, w56[0]), mul(kWeight4, w56[1])))); \
+ store(output + 2 * stride, \
+ add(w34[0], mul(kWeight2, add(w58[0], w58[1])))); \
+ store(output + 3 * stride, \
+ add(w36[0], add(mul(kWeight4, w60[0]), mul(kWeight3, w60[1])))); \
+ store(output + 4 * stride, add(w31[0], w55[1])); \
+ store(output + 5 * stride, \
+ sub(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \
+ store(output + 6 * stride, \
+ sub(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \
+ store(output + 7 * stride, \
+ sub(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \
+ store(output + 8 * stride, sub(w30[0], w54[0])); \
+ store(output + 9 * stride, \
+ add(w32[0], sub(sub(kWeight0, mul(kWeight3, w56[0])), \
+ mul(kWeight4, w56[1])))); \
+ store(output + 10 * stride, \
+ add(w34[0], sub(sub(kWeight0, mul(kWeight2, w58[0])), \
+ mul(kWeight2, w58[1])))); \
+ store(output + 11 * stride, \
+ add(w36[0], sub(sub(kWeight0, mul(kWeight4, w60[0])), \
+ mul(kWeight3, w60[1])))); \
+ store(output + 12 * stride, sub(w31[0], w55[1])); \
+ store(output + 13 * stride, \
+ add(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \
+ store(output + 14 * stride, \
+ add(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \
+ store(output + 15 * stride, \
+ add(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \
+ }
+
+#define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) \
+ ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC kWeight3 = constant(0.92388f); \
+ const T_VEC kWeight4 = constant(0.382683f); \
+ const T_VEC kWeight5 = constant(0.980785f); \
+ const T_VEC kWeight6 = constant(0.19509f); \
+ const T_VEC kWeight7 = constant(0.83147f); \
+ const T_VEC kWeight8 = constant(0.55557f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC i8 = load(input + 8 * stride); \
+ const T_VEC i9 = load(input + 9 * stride); \
+ const T_VEC i10 = load(input + 10 * stride); \
+ const T_VEC i11 = load(input + 11 * stride); \
+ const T_VEC i12 = load(input + 12 * stride); \
+ const T_VEC i13 = load(input + 13 * stride); \
+ const T_VEC i14 = load(input + 14 * stride); \
+ const T_VEC i15 = load(input + 15 * stride); \
+ const T_VEC i16 = load(input + 16 * stride); \
+ const T_VEC i17 = load(input + 17 * stride); \
+ const T_VEC i18 = load(input + 18 * stride); \
+ const T_VEC i19 = load(input + 19 * stride); \
+ const T_VEC i20 = load(input + 20 * stride); \
+ const T_VEC i21 = load(input + 21 * stride); \
+ const T_VEC i22 = load(input + 22 * stride); \
+ const T_VEC i23 = load(input + 23 * stride); \
+ const T_VEC i24 = load(input + 24 * stride); \
+ const T_VEC i25 = load(input + 25 * stride); \
+ const T_VEC i26 = load(input + 26 * stride); \
+ const T_VEC i27 = load(input + 27 * stride); \
+ const T_VEC i28 = load(input + 28 * stride); \
+ const T_VEC i29 = load(input + 29 * stride); \
+ const T_VEC i30 = load(input + 30 * stride); \
+ const T_VEC i31 = load(input + 31 * stride); \
+ const T_VEC w30 = add(i0, i16); \
+ const T_VEC w31 = sub(i0, i16); \
+ const T_VEC w32[2] = { add(i8, i8), sub(i24, i24) }; \
+ const T_VEC w33[2] = { sub(i8, i8), sub(sub(kWeight0, i24), i24) }; \
+ const T_VEC w34[2] = { add(w30, w32[0]), w32[1] }; \
+ const T_VEC w35[2] = { sub(w30, w32[0]), sub(kWeight0, w32[1]) }; \
+ const T_VEC w36[2] = { add(w31, w33[1]), sub(kWeight0, w33[0]) }; \
+ const T_VEC w37[2] = { sub(w31, w33[1]), w33[0] }; \
+ const T_VEC w38[2] = { add(i4, i12), sub(i28, i20) }; \
+ const T_VEC w39[2] = { sub(i4, i12), sub(sub(kWeight0, i20), i28) }; \
+ const T_VEC w40[2] = { add(i12, i4), sub(i20, i28) }; \
+ const T_VEC w41[2] = { sub(i12, i4), sub(sub(kWeight0, i28), i20) }; \
+ const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \
+ const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \
+ const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \
+ const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \
+ const T_VEC w46[2] = { add(w34[0], w42[0]), add(w34[1], w42[1]) }; \
+ const T_VEC w47[2] = { sub(w34[0], w42[0]), sub(w34[1], w42[1]) }; \
+ const T_VEC w48[2] = { add(w36[0], mul(kWeight2, add(w44[0], w44[1]))), \
+ add(w36[1], mul(kWeight2, sub(w44[1], w44[0]))) }; \
+ const T_VEC w49[2] = { add(w36[0], \
+ sub(sub(kWeight0, mul(kWeight2, w44[0])), \
+ mul(kWeight2, w44[1]))), \
+ add(w36[1], mul(kWeight2, sub(w44[0], w44[1]))) }; \
+ const T_VEC w50[2] = { add(w35[0], w43[1]), sub(w35[1], w43[0]) }; \
+ const T_VEC w51[2] = { sub(w35[0], w43[1]), add(w35[1], w43[0]) }; \
+ const T_VEC w52[2] = { sub(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \
+ sub(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \
+ const T_VEC w53[2] = { add(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \
+ add(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \
+ const T_VEC w54[2] = { add(i2, i14), sub(i30, i18) }; \
+ const T_VEC w55[2] = { sub(i2, i14), sub(sub(kWeight0, i18), i30) }; \
+ const T_VEC w56[2] = { add(i10, i6), sub(i22, i26) }; \
+ const T_VEC w57[2] = { sub(i10, i6), sub(sub(kWeight0, i26), i22) }; \
+ const T_VEC w58[2] = { add(w54[0], w56[0]), add(w54[1], w56[1]) }; \
+ const T_VEC w59[2] = { sub(w54[0], w56[0]), sub(w54[1], w56[1]) }; \
+ const T_VEC w60[2] = { add(w55[0], w57[1]), sub(w55[1], w57[0]) }; \
+ const T_VEC w61[2] = { sub(w55[0], w57[1]), add(w55[1], w57[0]) }; \
+ const T_VEC w62[2] = { add(i6, i10), sub(i26, i22) }; \
+ const T_VEC w63[2] = { sub(i6, i10), sub(sub(kWeight0, i22), i26) }; \
+ const T_VEC w64[2] = { add(i14, i2), sub(i18, i30) }; \
+ const T_VEC w65[2] = { sub(i14, i2), sub(sub(kWeight0, i30), i18) }; \
+ const T_VEC w66[2] = { add(w62[0], w64[0]), add(w62[1], w64[1]) }; \
+ const T_VEC w67[2] = { sub(w62[0], w64[0]), sub(w62[1], w64[1]) }; \
+ const T_VEC w68[2] = { add(w63[0], w65[1]), sub(w63[1], w65[0]) }; \
+ const T_VEC w69[2] = { sub(w63[0], w65[1]), add(w63[1], w65[0]) }; \
+ const T_VEC w70[2] = { add(w58[0], w66[0]), add(w58[1], w66[1]) }; \
+ const T_VEC w71[2] = { sub(w58[0], w66[0]), sub(w58[1], w66[1]) }; \
+ const T_VEC w72[2] = { add(w60[0], mul(kWeight2, add(w68[0], w68[1]))), \
+ add(w60[1], mul(kWeight2, sub(w68[1], w68[0]))) }; \
+ const T_VEC w73[2] = { add(w60[0], \
+ sub(sub(kWeight0, mul(kWeight2, w68[0])), \
+ mul(kWeight2, w68[1]))), \
+ add(w60[1], mul(kWeight2, sub(w68[0], w68[1]))) }; \
+ const T_VEC w74[2] = { add(w59[0], w67[1]), sub(w59[1], w67[0]) }; \
+ const T_VEC w75[2] = { sub(w59[0], w67[1]), add(w59[1], w67[0]) }; \
+ const T_VEC w76[2] = { sub(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \
+ sub(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \
+ const T_VEC w77[2] = { add(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \
+ add(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \
+ const T_VEC w78[2] = { add(w46[0], w70[0]), add(w46[1], w70[1]) }; \
+ const T_VEC w79[2] = { sub(w46[0], w70[0]), sub(w46[1], w70[1]) }; \
+ const T_VEC w80[2] = { \
+ add(w48[0], add(mul(kWeight3, w72[0]), mul(kWeight4, w72[1]))), \
+ add(w48[1], sub(mul(kWeight3, w72[1]), mul(kWeight4, w72[0]))) \
+ }; \
+ const T_VEC w81[2] = { \
+ add(w48[0], \
+ sub(sub(kWeight0, mul(kWeight3, w72[0])), mul(kWeight4, w72[1]))), \
+ add(w48[1], sub(mul(kWeight4, w72[0]), mul(kWeight3, w72[1]))) \
+ }; \
+ const T_VEC w82[2] = { add(w50[0], mul(kWeight2, add(w74[0], w74[1]))), \
+ add(w50[1], mul(kWeight2, sub(w74[1], w74[0]))) }; \
+ const T_VEC w83[2] = { add(w50[0], \
+ sub(sub(kWeight0, mul(kWeight2, w74[0])), \
+ mul(kWeight2, w74[1]))), \
+ add(w50[1], mul(kWeight2, sub(w74[0], w74[1]))) }; \
+ const T_VEC w84[2] = { \
+ add(w52[0], add(mul(kWeight4, w76[0]), mul(kWeight3, w76[1]))), \
+ add(w52[1], sub(mul(kWeight4, w76[1]), mul(kWeight3, w76[0]))) \
+ }; \
+ const T_VEC w85[2] = { \
+ add(w52[0], \
+ sub(sub(kWeight0, mul(kWeight4, w76[0])), mul(kWeight3, w76[1]))), \
+ add(w52[1], sub(mul(kWeight3, w76[0]), mul(kWeight4, w76[1]))) \
+ }; \
+ const T_VEC w86[2] = { add(w47[0], w71[1]), sub(w47[1], w71[0]) }; \
+ const T_VEC w87[2] = { sub(w47[0], w71[1]), add(w47[1], w71[0]) }; \
+ const T_VEC w88[2] = { \
+ sub(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \
+ add(w49[1], \
+ sub(sub(kWeight0, mul(kWeight4, w73[1])), mul(kWeight3, w73[0]))) \
+ }; \
+ const T_VEC w89[2] = { \
+ add(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \
+ add(w49[1], add(mul(kWeight4, w73[1]), mul(kWeight3, w73[0]))) \
+ }; \
+ const T_VEC w90[2] = { sub(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \
+ sub(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \
+ const T_VEC w91[2] = { add(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \
+ add(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \
+ const T_VEC w92[2] = { \
+ sub(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \
+ add(w53[1], \
+ sub(sub(kWeight0, mul(kWeight3, w77[1])), mul(kWeight4, w77[0]))) \
+ }; \
+ const T_VEC w93[2] = { \
+ add(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \
+ add(w53[1], add(mul(kWeight3, w77[1]), mul(kWeight4, w77[0]))) \
+ }; \
+ const T_VEC w94[2] = { add(i1, i15), sub(i31, i17) }; \
+ const T_VEC w95[2] = { sub(i1, i15), sub(sub(kWeight0, i17), i31) }; \
+ const T_VEC w96[2] = { add(i9, i7), sub(i23, i25) }; \
+ const T_VEC w97[2] = { sub(i9, i7), sub(sub(kWeight0, i25), i23) }; \
+ const T_VEC w98[2] = { add(w94[0], w96[0]), add(w94[1], w96[1]) }; \
+ const T_VEC w99[2] = { sub(w94[0], w96[0]), sub(w94[1], w96[1]) }; \
+ const T_VEC w100[2] = { add(w95[0], w97[1]), sub(w95[1], w97[0]) }; \
+ const T_VEC w101[2] = { sub(w95[0], w97[1]), add(w95[1], w97[0]) }; \
+ const T_VEC w102[2] = { add(i5, i11), sub(i27, i21) }; \
+ const T_VEC w103[2] = { sub(i5, i11), sub(sub(kWeight0, i21), i27) }; \
+ const T_VEC w104[2] = { add(i13, i3), sub(i19, i29) }; \
+ const T_VEC w105[2] = { sub(i13, i3), sub(sub(kWeight0, i29), i19) }; \
+ const T_VEC w106[2] = { add(w102[0], w104[0]), add(w102[1], w104[1]) }; \
+ const T_VEC w107[2] = { sub(w102[0], w104[0]), sub(w102[1], w104[1]) }; \
+ const T_VEC w108[2] = { add(w103[0], w105[1]), sub(w103[1], w105[0]) }; \
+ const T_VEC w109[2] = { sub(w103[0], w105[1]), add(w103[1], w105[0]) }; \
+ const T_VEC w110[2] = { add(w98[0], w106[0]), add(w98[1], w106[1]) }; \
+ const T_VEC w111[2] = { sub(w98[0], w106[0]), sub(w98[1], w106[1]) }; \
+ const T_VEC w112[2] = { \
+ add(w100[0], mul(kWeight2, add(w108[0], w108[1]))), \
+ add(w100[1], mul(kWeight2, sub(w108[1], w108[0]))) \
+ }; \
+ const T_VEC w113[2] = { \
+ add(w100[0], \
+ sub(sub(kWeight0, mul(kWeight2, w108[0])), mul(kWeight2, w108[1]))), \
+ add(w100[1], mul(kWeight2, sub(w108[0], w108[1]))) \
+ }; \
+ const T_VEC w114[2] = { add(w99[0], w107[1]), sub(w99[1], w107[0]) }; \
+ const T_VEC w115[2] = { sub(w99[0], w107[1]), add(w99[1], w107[0]) }; \
+ const T_VEC w116[2] = { \
+ sub(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \
+ sub(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \
+ }; \
+ const T_VEC w117[2] = { \
+ add(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \
+ add(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \
+ }; \
+ const T_VEC w118[2] = { add(i3, i13), sub(i29, i19) }; \
+ const T_VEC w119[2] = { sub(i3, i13), sub(sub(kWeight0, i19), i29) }; \
+ const T_VEC w120[2] = { add(i11, i5), sub(i21, i27) }; \
+ const T_VEC w121[2] = { sub(i11, i5), sub(sub(kWeight0, i27), i21) }; \
+ const T_VEC w122[2] = { add(w118[0], w120[0]), add(w118[1], w120[1]) }; \
+ const T_VEC w123[2] = { sub(w118[0], w120[0]), sub(w118[1], w120[1]) }; \
+ const T_VEC w124[2] = { add(w119[0], w121[1]), sub(w119[1], w121[0]) }; \
+ const T_VEC w125[2] = { sub(w119[0], w121[1]), add(w119[1], w121[0]) }; \
+ const T_VEC w126[2] = { add(i7, i9), sub(i25, i23) }; \
+ const T_VEC w127[2] = { sub(i7, i9), sub(sub(kWeight0, i23), i25) }; \
+ const T_VEC w128[2] = { add(i15, i1), sub(i17, i31) }; \
+ const T_VEC w129[2] = { sub(i15, i1), sub(sub(kWeight0, i31), i17) }; \
+ const T_VEC w130[2] = { add(w126[0], w128[0]), add(w126[1], w128[1]) }; \
+ const T_VEC w131[2] = { sub(w126[0], w128[0]), sub(w126[1], w128[1]) }; \
+ const T_VEC w132[2] = { add(w127[0], w129[1]), sub(w127[1], w129[0]) }; \
+ const T_VEC w133[2] = { sub(w127[0], w129[1]), add(w127[1], w129[0]) }; \
+ const T_VEC w134[2] = { add(w122[0], w130[0]), add(w122[1], w130[1]) }; \
+ const T_VEC w135[2] = { sub(w122[0], w130[0]), sub(w122[1], w130[1]) }; \
+ const T_VEC w136[2] = { \
+ add(w124[0], mul(kWeight2, add(w132[0], w132[1]))), \
+ add(w124[1], mul(kWeight2, sub(w132[1], w132[0]))) \
+ }; \
+ const T_VEC w137[2] = { \
+ add(w124[0], \
+ sub(sub(kWeight0, mul(kWeight2, w132[0])), mul(kWeight2, w132[1]))), \
+ add(w124[1], mul(kWeight2, sub(w132[0], w132[1]))) \
+ }; \
+ const T_VEC w138[2] = { add(w123[0], w131[1]), sub(w123[1], w131[0]) }; \
+ const T_VEC w139[2] = { sub(w123[0], w131[1]), add(w123[1], w131[0]) }; \
+ const T_VEC w140[2] = { \
+ sub(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \
+ sub(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \
+ }; \
+ const T_VEC w141[2] = { \
+ add(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \
+ add(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \
+ }; \
+ const T_VEC w142[2] = { add(w110[0], w134[0]), add(w110[1], w134[1]) }; \
+ const T_VEC w143[2] = { sub(w110[0], w134[0]), sub(w110[1], w134[1]) }; \
+ const T_VEC w144[2] = { \
+ add(w112[0], add(mul(kWeight3, w136[0]), mul(kWeight4, w136[1]))), \
+ add(w112[1], sub(mul(kWeight3, w136[1]), mul(kWeight4, w136[0]))) \
+ }; \
+ const T_VEC w145[2] = { \
+ add(w112[0], \
+ sub(sub(kWeight0, mul(kWeight3, w136[0])), mul(kWeight4, w136[1]))), \
+ add(w112[1], sub(mul(kWeight4, w136[0]), mul(kWeight3, w136[1]))) \
+ }; \
+ const T_VEC w146[2] = { \
+ add(w114[0], mul(kWeight2, add(w138[0], w138[1]))), \
+ add(w114[1], mul(kWeight2, sub(w138[1], w138[0]))) \
+ }; \
+ const T_VEC w147[2] = { \
+ add(w114[0], \
+ sub(sub(kWeight0, mul(kWeight2, w138[0])), mul(kWeight2, w138[1]))), \
+ add(w114[1], mul(kWeight2, sub(w138[0], w138[1]))) \
+ }; \
+ const T_VEC w148[2] = { \
+ add(w116[0], add(mul(kWeight4, w140[0]), mul(kWeight3, w140[1]))), \
+ add(w116[1], sub(mul(kWeight4, w140[1]), mul(kWeight3, w140[0]))) \
+ }; \
+ const T_VEC w149[2] = { \
+ add(w116[0], \
+ sub(sub(kWeight0, mul(kWeight4, w140[0])), mul(kWeight3, w140[1]))), \
+ add(w116[1], sub(mul(kWeight3, w140[0]), mul(kWeight4, w140[1]))) \
+ }; \
+ const T_VEC w150[2] = { add(w111[0], w135[1]), sub(w111[1], w135[0]) }; \
+ const T_VEC w151[2] = { sub(w111[0], w135[1]), add(w111[1], w135[0]) }; \
+ const T_VEC w152[2] = { \
+ sub(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \
+ add(w113[1], \
+ sub(sub(kWeight0, mul(kWeight4, w137[1])), mul(kWeight3, w137[0]))) \
+ }; \
+ const T_VEC w153[2] = { \
+ add(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \
+ add(w113[1], add(mul(kWeight4, w137[1]), mul(kWeight3, w137[0]))) \
+ }; \
+ const T_VEC w154[2] = { \
+ sub(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \
+ sub(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \
+ }; \
+ const T_VEC w155[2] = { \
+ add(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \
+ add(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \
+ }; \
+ const T_VEC w156[2] = { \
+ sub(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \
+ add(w117[1], \
+ sub(sub(kWeight0, mul(kWeight3, w141[1])), mul(kWeight4, w141[0]))) \
+ }; \
+ const T_VEC w157[2] = { \
+ add(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \
+ add(w117[1], add(mul(kWeight3, w141[1]), mul(kWeight4, w141[0]))) \
+ }; \
+ store(output + 0 * stride, add(w78[0], w142[0])); \
+ store(output + 1 * stride, \
+ add(w80[0], add(mul(kWeight5, w144[0]), mul(kWeight6, w144[1])))); \
+ store(output + 2 * stride, \
+ add(w82[0], add(mul(kWeight3, w146[0]), mul(kWeight4, w146[1])))); \
+ store(output + 3 * stride, \
+ add(w84[0], add(mul(kWeight7, w148[0]), mul(kWeight8, w148[1])))); \
+ store(output + 4 * stride, \
+ add(w86[0], mul(kWeight2, add(w150[0], w150[1])))); \
+ store(output + 5 * stride, \
+ add(w88[0], add(mul(kWeight8, w152[0]), mul(kWeight7, w152[1])))); \
+ store(output + 6 * stride, \
+ add(w90[0], add(mul(kWeight4, w154[0]), mul(kWeight3, w154[1])))); \
+ store(output + 7 * stride, \
+ add(w92[0], add(mul(kWeight6, w156[0]), mul(kWeight5, w156[1])))); \
+ store(output + 8 * stride, add(w79[0], w143[1])); \
+ store(output + 9 * stride, \
+ sub(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \
+ store(output + 10 * stride, \
+ sub(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \
+ store(output + 11 * stride, \
+ sub(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \
+ store(output + 12 * stride, \
+ sub(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \
+ store(output + 13 * stride, \
+ sub(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \
+ store(output + 14 * stride, \
+ sub(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \
+ store(output + 15 * stride, \
+ sub(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \
+ store(output + 16 * stride, sub(w78[0], w142[0])); \
+ store(output + 17 * stride, \
+ add(w80[0], sub(sub(kWeight0, mul(kWeight5, w144[0])), \
+ mul(kWeight6, w144[1])))); \
+ store(output + 18 * stride, \
+ add(w82[0], sub(sub(kWeight0, mul(kWeight3, w146[0])), \
+ mul(kWeight4, w146[1])))); \
+ store(output + 19 * stride, \
+ add(w84[0], sub(sub(kWeight0, mul(kWeight7, w148[0])), \
+ mul(kWeight8, w148[1])))); \
+ store(output + 20 * stride, \
+ add(w86[0], sub(sub(kWeight0, mul(kWeight2, w150[0])), \
+ mul(kWeight2, w150[1])))); \
+ store(output + 21 * stride, \
+ add(w88[0], sub(sub(kWeight0, mul(kWeight8, w152[0])), \
+ mul(kWeight7, w152[1])))); \
+ store(output + 22 * stride, \
+ add(w90[0], sub(sub(kWeight0, mul(kWeight4, w154[0])), \
+ mul(kWeight3, w154[1])))); \
+ store(output + 23 * stride, \
+ add(w92[0], sub(sub(kWeight0, mul(kWeight6, w156[0])), \
+ mul(kWeight5, w156[1])))); \
+ store(output + 24 * stride, sub(w79[0], w143[1])); \
+ store(output + 25 * stride, \
+ add(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \
+ store(output + 26 * stride, \
+ add(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \
+ store(output + 27 * stride, \
+ add(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \
+ store(output + 28 * stride, \
+ add(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \
+ store(output + 29 * stride, \
+ add(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \
+ store(output + 30 * stride, \
+ add(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \
+ store(output + 31 * stride, \
+ add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \
+ }
+
+#endif // AOM_DSP_FFT_COMMON_H_
diff --git a/aom_dsp/noise_model.c b/aom_dsp/noise_model.c
index 117185caf..6c0cf62df 100644
--- a/aom_dsp/noise_model.c
+++ b/aom_dsp/noise_model.c
@@ -18,6 +18,7 @@
#include "aom_dsp/noise_model.h"
#include "aom_dsp/noise_util.h"
#include "aom_mem/aom_mem.h"
+#include "av1/common/common.h"
#include "av1/encoder/mathutils.h"
#define kLowPolyNumParams 3
@@ -1268,3 +1269,192 @@ int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
film_grain->overlap_flag = 1;
return 1;
}
+
+static void pointwise_multiply(const float *a, float *b, int n) {
+ for (int i = 0; i < n; ++i) {
+ b[i] *= a[i];
+ }
+}
+
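+// Returns a block_size x block_size separable window equal to
+// sin((x + .5) * PI / block_size) * sin((y + .5) * PI / block_size) (a
+// half-cosine shifted by PI / 2), tapering toward zero at the block edges
+// for overlapped processing.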
+static float *get_half_cos_window(int block_size) {
+ float *window_function =
+ (float *)aom_malloc(block_size * block_size * sizeof(*window_function));
+ for (int y = 0; y < block_size; ++y) {
+ const double cos_yd = cos((.5 + y) * PI / block_size - PI / 2);
+ for (int x = 0; x < block_size; ++x) {
+ const double cos_xd = cos((.5 + x) * PI / block_size - PI / 2);
+ window_function[y * block_size + x] = (float)(cos_yd * cos_xd);
+ }
+ }
+ return window_function;
+}
+
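+// Quantizes the floating-point result to the integer pixel type, diffusing
+// the rounding error to not-yet-visited neighbors with the classic
+// Floyd-Steinberg weights (7/16, 3/16, 5/16, 1/16).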
+#define DITHER_AND_QUANTIZE(INT_TYPE, suffix) \
+ static void dither_and_quantize_##suffix( \
+ float *result, int result_stride, INT_TYPE *denoised, int w, int h, \
+ int stride, int chroma_sub_w, int chroma_sub_h, int block_size, \
+ float block_normalization) { \
+ for (int y = 0; y < (h >> chroma_sub_h); ++y) { \
+ for (int x = 0; x < (w >> chroma_sub_w); ++x) { \
+ const int result_idx = \
+ (y + (block_size >> chroma_sub_h)) * result_stride + x + \
+ (block_size >> chroma_sub_w); \
+ INT_TYPE new_val = (INT_TYPE)AOMMIN( \
+ AOMMAX(result[result_idx] * block_normalization + 0.5f, 0), \
+ block_normalization); \
+ const float err = \
+ -(((float)new_val) / block_normalization - result[result_idx]); \
+ denoised[y * stride + x] = new_val; \
+ if (x + 1 < (w >> chroma_sub_w)) { \
+ result[result_idx + 1] += err * 7.0f / 16.0f; \
+ } \
+ if (y + 1 < (h >> chroma_sub_h)) { \
+ if (x > 0) { \
+ result[result_idx + result_stride - 1] += err * 3.0f / 16.0f; \
+ } \
+ result[result_idx + result_stride] += err * 5.0f / 16.0f; \
+ if (x + 1 < (w >> chroma_sub_w)) { \
+ result[result_idx + result_stride + 1] += err * 1.0f / 16.0f; \
+ } \
+ } \
+ } \
+ } \
+ }
+
+DITHER_AND_QUANTIZE(uint8_t, lowbd);
+DITHER_AND_QUANTIZE(uint16_t, highbd);
+
+int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
+ int w, int h, int stride[3], int chroma_sub[2],
+ float *noise_psd[3], int block_size, int bit_depth,
+ int use_highbd) {
+ float *plane = NULL, *block = NULL, *window_full = NULL,
+ *window_chroma = NULL;
+ double *block_d = NULL, *plane_d = NULL;
+ struct aom_noise_tx_t *tx_full = NULL;
+ struct aom_noise_tx_t *tx_chroma = NULL;
+ const int num_blocks_w = (w + block_size - 1) / block_size;
+ const int num_blocks_h = (h + block_size - 1) / block_size;
+ const int result_stride = (num_blocks_w + 2) * block_size;
+ const int result_height = (num_blocks_h + 2) * block_size;
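+  // The result buffer is padded by one block on each side so that blocks at
+  // block offsets of -1 (used to handle the frame boundary) can be
+  // accumulated without bounds checks.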
+ float *result = NULL;
+ int init_success = 1;
+ aom_flat_block_finder_t block_finder_full;
+ aom_flat_block_finder_t block_finder_chroma;
+ const float kBlockNormalization = (1 << bit_depth) - 1;
+ if (chroma_sub[0] != chroma_sub[1]) {
+    fprintf(stderr,
+            "aom_wiener_denoise_2d doesn't handle different chroma "
+            "subsampling\n");
+ return 0;
+ }
+ init_success &= aom_flat_block_finder_init(&block_finder_full, block_size,
+ bit_depth, use_highbd);
+  result =
+      (float *)aom_malloc(result_height * result_stride * sizeof(*result));
+ plane = (float *)aom_malloc(block_size * block_size * sizeof(*plane));
+ block =
+ (float *)aom_memalign(32, 2 * block_size * block_size * sizeof(*block));
+ block_d = (double *)aom_malloc(block_size * block_size * sizeof(*block_d));
+ plane_d = (double *)aom_malloc(block_size * block_size * sizeof(*plane_d));
+ window_full = get_half_cos_window(block_size);
+ tx_full = aom_noise_tx_malloc(block_size);
+
+ if (chroma_sub[0] != 0) {
+ init_success &= aom_flat_block_finder_init(&block_finder_chroma,
+ block_size >> chroma_sub[0],
+ bit_depth, use_highbd);
+ window_chroma = get_half_cos_window(block_size >> chroma_sub[0]);
+ tx_chroma = aom_noise_tx_malloc(block_size >> chroma_sub[0]);
+ } else {
+ window_chroma = window_full;
+ tx_chroma = tx_full;
+ }
+
+ init_success &= (tx_full != NULL) && (tx_chroma != NULL) && (plane != NULL) &&
+ (plane_d != NULL) && (block != NULL) && (block_d != NULL) &&
+ (window_full != NULL) && (window_chroma != NULL) &&
+ (result != NULL);
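+  // If any allocation failed, skip the per-plane loop entirely and fall
+  // through to the cleanup below.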
+ for (int c = init_success ? 0 : 3; c < 3; ++c) {
+ float *window_function = c == 0 ? window_full : window_chroma;
+ aom_flat_block_finder_t *block_finder = &block_finder_full;
+ const int chroma_sub_h = c > 0 ? chroma_sub[1] : 0;
+ const int chroma_sub_w = c > 0 ? chroma_sub[0] : 0;
+ struct aom_noise_tx_t *tx =
+ (c > 0 && chroma_sub[0] > 0) ? tx_chroma : tx_full;
+ if (!data[c] || !denoised[c]) continue;
+ if (c > 0 && chroma_sub[0] != 0) {
+ block_finder = &block_finder_chroma;
+ }
+ memset(result, 0, sizeof(*result) * result_stride * result_height);
+    // Do overlapped block processing (half overlapped). The block rows can
+    // easily be done in parallel.
+ for (int offsy = 0; offsy < (block_size >> chroma_sub_h);
+ offsy += (block_size >> chroma_sub_h) / 2) {
+ for (int offsx = 0; offsx < (block_size >> chroma_sub_w);
+ offsx += (block_size >> chroma_sub_w) / 2) {
+ // Pad the boundary when processing each block-set.
+ for (int by = -1; by < num_blocks_h; ++by) {
+ for (int bx = -1; bx < num_blocks_w; ++bx) {
+ const int pixels_per_block =
+ (block_size >> chroma_sub_w) * (block_size >> chroma_sub_h);
+ aom_flat_block_finder_extract_block(
+ block_finder, data[c], w >> chroma_sub_w, h >> chroma_sub_h,
+ stride[c], bx * (block_size >> chroma_sub_w) + offsx,
+ by * (block_size >> chroma_sub_h) + offsy, plane_d, block_d);
+ for (int j = 0; j < pixels_per_block; ++j) {
+ block[j] = (float)block_d[j];
+ plane[j] = (float)plane_d[j];
+ }
+ pointwise_multiply(window_function, block, pixels_per_block);
+ aom_noise_tx_forward(tx, block);
+ aom_noise_tx_filter(tx, noise_psd[c]);
+ aom_noise_tx_inverse(tx, block);
+
+ // Apply window function to the plane approximation (we will apply
+ // it to the sum of plane + block when composing the results).
+ pointwise_multiply(window_function, plane, pixels_per_block);
+
+ for (int y = 0; y < (block_size >> chroma_sub_h); ++y) {
+ const int y_result =
+ y + (by + 1) * (block_size >> chroma_sub_h) + offsy;
+ for (int x = 0; x < (block_size >> chroma_sub_w); ++x) {
+ const int x_result =
+ x + (bx + 1) * (block_size >> chroma_sub_w) + offsx;
+ result[y_result * result_stride + x_result] +=
+ (block[y * (block_size >> chroma_sub_w) + x] +
+ plane[y * (block_size >> chroma_sub_w) + x]) *
+ window_function[y * (block_size >> chroma_sub_w) + x];
+ }
+ }
+ }
+ }
+ }
+ }
+ if (use_highbd) {
+ dither_and_quantize_highbd(result, result_stride, (uint16_t *)denoised[c],
+ w, h, stride[c], chroma_sub_w, chroma_sub_h,
+ block_size, kBlockNormalization);
+ } else {
+ dither_and_quantize_lowbd(result, result_stride, denoised[c], w, h,
+ stride[c], chroma_sub_w, chroma_sub_h,
+ block_size, kBlockNormalization);
+ }
+ }
+ aom_free(result);
+ aom_free(plane);
+ aom_free(block);
+ aom_free(plane_d);
+ aom_free(block_d);
+ aom_free(window_full);
+
+ aom_noise_tx_free(tx_full);
+
+ aom_flat_block_finder_free(&block_finder_full);
+ if (chroma_sub[0] != 0) {
+ aom_flat_block_finder_free(&block_finder_chroma);
+ aom_free(window_chroma);
+ aom_noise_tx_free(tx_chroma);
+ }
+ return init_success;
+}
diff --git a/aom_dsp/noise_model.h b/aom_dsp/noise_model.h
index 34a364de0..dabeacc14 100644
--- a/aom_dsp/noise_model.h
+++ b/aom_dsp/noise_model.h
@@ -261,6 +261,25 @@ void aom_noise_model_save_latest(aom_noise_model_t *noise_model);
int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
aom_film_grain_t *film_grain);
+/*!\brief Performs 2D Wiener-filter denoising using the provided noise PSD.
+ *
+ * \param[in] data Raw frame data
+ * \param[out] denoised Denoised frame data
+ * \param[in] w Frame width
+ * \param[in] h Frame height
+ * \param[in] stride Stride of the planes
+ * \param[in] chroma_sub_log2 Log2 of the chroma subsampling for planes != 0.
+ * \param[in] noise_psd The power spectral density of the noise
+ * \param[in] block_size The size of blocks
+ * \param[in] bit_depth Bit depth of the image
+ * \param[in] use_highbd If true, uint8_t pointers are interpreted as
+ *                          uint16_t pointers and the stride is measured in
+ *                          uint16_t units. This must be true when
+ *                          bit_depth >= 10.
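+ *
+ * Illustrative usage sketch (names such as y_plane and uv_psd are
+ * placeholders, not part of the API; assumes 8-bit 4:2:0 planes that have
+ * already been allocated):
+ * \code
+ *   const uint8_t *const planes[3] = { y_plane, u_plane, v_plane };
+ *   uint8_t *denoised[3] = { y_out, u_out, v_out };
+ *   int strides[3] = { y_stride, uv_stride, uv_stride };
+ *   int chroma_sub[2] = { 1, 1 };
+ *   float *psd[3] = { y_psd, uv_psd, uv_psd };
+ *   if (!aom_wiener_denoise_2d(planes, denoised, w, h, strides, chroma_sub,
+ *                              psd, 32, 8, 0)) {
+ *     // Denoising failed (invalid parameters or allocation failure).
+ *   }
+ * \endcode
+ *
+ * \return 1 on success, 0 on failure (e.g., mismatched chroma subsampling,
+ *         unsupported block_size, or allocation failure).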
+ */
+int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
+ int w, int h, int stride[3], int chroma_sub_log2[2],
+ float *noise_psd[3], int block_size, int bit_depth,
+ int use_highbd);
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
diff --git a/aom_dsp/noise_util.c b/aom_dsp/noise_util.c
index c81874c6e..db4ca07c7 100644
--- a/aom_dsp/noise_util.c
+++ b/aom_dsp/noise_util.c
@@ -13,9 +13,120 @@
#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
#include "aom_dsp/noise_util.h"
+#include "aom_dsp/fft_common.h"
#include "aom_mem/aom_mem.h"
+#include "config/aom_dsp_rtcd.h"
+
+float aom_noise_psd_get_default_value(int block_size, float factor) {
+ return (factor * factor / 10000) * block_size * block_size / 8;
+}
+
+// Internal representation of noise transform. It keeps track of the
+// transformed data and a temporary working buffer to use during the
+// transform.
+struct aom_noise_tx_t {
+ float *tx_block;
+ float *temp;
+ int block_size;
+ void (*fft)(const float *, float *, float *);
+ void (*ifft)(const float *, float *, float *);
+};
+
+struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size) {
+ struct aom_noise_tx_t *noise_tx =
+ (struct aom_noise_tx_t *)aom_malloc(sizeof(struct aom_noise_tx_t));
+ if (!noise_tx) return NULL;
+ memset(noise_tx, 0, sizeof(*noise_tx));
+ switch (block_size) {
+ case 2:
+ noise_tx->fft = aom_fft2x2_float;
+ noise_tx->ifft = aom_ifft2x2_float;
+ break;
+ case 4:
+ noise_tx->fft = aom_fft4x4_float;
+ noise_tx->ifft = aom_ifft4x4_float;
+ break;
+ case 8:
+ noise_tx->fft = aom_fft8x8_float;
+ noise_tx->ifft = aom_ifft8x8_float;
+ break;
+ case 16:
+ noise_tx->fft = aom_fft16x16_float;
+ noise_tx->ifft = aom_ifft16x16_float;
+ break;
+ case 32:
+ noise_tx->fft = aom_fft32x32_float;
+ noise_tx->ifft = aom_ifft32x32_float;
+ break;
+ default:
+ aom_free(noise_tx);
+ fprintf(stderr, "Unsupported block size %d\n", block_size);
+ return NULL;
+ }
+ noise_tx->block_size = block_size;
+ noise_tx->tx_block = (float *)aom_memalign(
+ 32, 2 * sizeof(*noise_tx->tx_block) * block_size * block_size);
+ noise_tx->temp = (float *)aom_memalign(
+ 32, 2 * sizeof(*noise_tx->temp) * block_size * block_size);
+ if (!noise_tx->tx_block || !noise_tx->temp) {
+ aom_noise_tx_free(noise_tx);
+ return NULL;
+ }
+ return noise_tx;
+}
+
+void aom_noise_tx_forward(struct aom_noise_tx_t *noise_tx, const float *data) {
+ noise_tx->fft(data, noise_tx->temp, noise_tx->tx_block);
+}
+
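+// Applies a Wiener filter to the transformed block: each coefficient whose
+// observed power p = re^2 + im^2 exceeds kBeta * psd is scaled by the Wiener
+// gain (p - psd) / p; coefficients near or below the noise floor are instead
+// attenuated by the constant factor (kBeta - 1) / kBeta.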
+void aom_noise_tx_filter(struct aom_noise_tx_t *noise_tx, const float *psd) {
+ const int block_size = noise_tx->block_size;
+ const float kBeta = 1.1f;
+ const float kEps = 1e-6f;
+ for (int y = 0; y < block_size; ++y) {
+ for (int x = 0; x < block_size; ++x) {
+ int i = y * block_size + x;
+ float *c = noise_tx->tx_block + 2 * i;
+ const float p = c[0] * c[0] + c[1] * c[1];
+      if (p > kBeta * psd[i] && p > kEps) {
+ noise_tx->tx_block[2 * i + 0] *= (p - psd[i]) / AOMMAX(p, kEps);
+ noise_tx->tx_block[2 * i + 1] *= (p - psd[i]) / AOMMAX(p, kEps);
+ } else {
+ noise_tx->tx_block[2 * i + 0] *= (kBeta - 1.0f) / kBeta;
+ noise_tx->tx_block[2 * i + 1] *= (kBeta - 1.0f) / kBeta;
+ }
+ }
+ }
+}
+
+void aom_noise_tx_inverse(struct aom_noise_tx_t *noise_tx, float *data) {
+ const int n = noise_tx->block_size * noise_tx->block_size;
+ noise_tx->ifft(noise_tx->tx_block, noise_tx->temp, data);
+ for (int i = 0; i < n; ++i) {
+ data[i] /= n;
+ }
+}
+
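+// Accumulates the power of the transformed data into psd. Only bins with
+// xb <= block_size / 2 are written; the rest are redundant under the
+// conjugate symmetry of a real-input FFT.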
+void aom_noise_tx_add_energy(const struct aom_noise_tx_t *noise_tx,
+ float *psd) {
+ const int block_size = noise_tx->block_size;
+ for (int yb = 0; yb < block_size; ++yb) {
+ for (int xb = 0; xb <= block_size / 2; ++xb) {
+ float *c = noise_tx->tx_block + 2 * (yb * block_size + xb);
+ psd[yb * block_size + xb] += c[0] * c[0] + c[1] * c[1];
+ }
+ }
+}
+
+void aom_noise_tx_free(struct aom_noise_tx_t *noise_tx) {
+ if (!noise_tx) return;
+ aom_free(noise_tx->tx_block);
+ aom_free(noise_tx->temp);
+ aom_free(noise_tx);
+}
double aom_normalized_cross_correlation(const double *a, const double *b,
int n) {
diff --git a/aom_dsp/noise_util.h b/aom_dsp/noise_util.h
index f3366d86b..ea4d9e3de 100644
--- a/aom_dsp/noise_util.h
+++ b/aom_dsp/noise_util.h
@@ -16,6 +16,44 @@
extern "C" {
#endif // __cplusplus
+// aom_noise_tx_t is an abstraction of a transform that is used for denoising.
+// It is meant to be lightweight and opaque: it holds the transformed data
+// internally, as the user should not manipulate the transformed data directly.
+struct aom_noise_tx_t;
+
+// Allocates and returns an aom_noise_tx_t useful for denoising blocks of the
+// given block_size. The resulting aom_noise_tx_t should be freed with
+// aom_noise_tx_free.
+struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size);
+void aom_noise_tx_free(struct aom_noise_tx_t *aom_noise_tx);
+
+// Transforms the given data and holds the result in the aom_noise_tx's
+// internal buffer. For compatibility with existing SIMD implementations,
+// "data" must be 32-byte aligned.
+void aom_noise_tx_forward(struct aom_noise_tx_t *aom_noise_tx,
+ const float *data);
+
+// Filters aom_noise_tx's internal data using the provided noise power spectral
+// density. The PSD must be at least block_size * block_size and should be
+// populated with a constant or via estimates taken from
+// aom_noise_tx_add_energy.
+void aom_noise_tx_filter(struct aom_noise_tx_t *aom_noise_tx, const float *psd);
+
+// Performs an inverse transform using the internal transform data.
+// For compatibility with existing SIMD implementations, "data" must be 32-byte
+// aligned.
+void aom_noise_tx_inverse(struct aom_noise_tx_t *aom_noise_tx, float *data);
+
+// Aggregates the power of the buffered transform data into the psd buffer.
+void aom_noise_tx_add_energy(const struct aom_noise_tx_t *aom_noise_tx,
+ float *psd);
+
+// Returns a default value suitable for denoising a transform of the given
+// block_size. The noise "factor" determines the strength of the noise to
+// be removed. A value of about 2.5 can be used for moderate denoising,
+// while a value of 5.0 can be used for a high level of denoising.
+float aom_noise_psd_get_default_value(int block_size, float factor);
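+
+// Illustrative usage sketch (not part of the API): "block" is assumed to be
+// a 32-byte aligned float buffer holding block_size * block_size windowed
+// samples.
+//
+//   struct aom_noise_tx_t *tx = aom_noise_tx_malloc(32);
+//   float psd[32 * 32];
+//   const float value = aom_noise_psd_get_default_value(32, 2.5f);
+//   for (int i = 0; i < 32 * 32; ++i) psd[i] = value;
+//   aom_noise_tx_forward(tx, block);  // block -> internal transform data
+//   aom_noise_tx_filter(tx, psd);     // Wiener filter in the transform domain
+//   aom_noise_tx_inverse(tx, block);  // block now holds denoised samples
+//   aom_noise_tx_free(tx);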
+
// Computes normalized cross correlation of two vectors a and b of length n.
double aom_normalized_cross_correlation(const double *a, const double *b,
int n);
diff --git a/aom_dsp/x86/fft_avx2.c b/aom_dsp/x86/fft_avx2.c
new file mode 100644
index 000000000..54da02253
--- /dev/null
+++ b/aom_dsp/x86/fft_avx2.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+extern void aom_transpose_float_sse2(const float *A, float *B, int n);
+extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output,
+ int n);
+
+// Generate the 1d forward transforms for float using _mm256
+GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps);
+GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps);
+GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps);
+
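+// Note: the final argument to aom_fft_2d_gen below is the SIMD vector width
+// in floats (8 lanes for the 256-bit AVX2 registers).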
+void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+void aom_fft16x16_float_avx2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_avx2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_avx2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+// Generate the 1d inverse transforms for float using _mm256
+GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps);
+GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps);
+GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps);
+
+void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2,
+ aom_ifft1d_8_avx2, aom_transpose_float_sse2, 8);
+}
+
+void aom_ifft16x16_float_avx2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+ aom_fft1d_16_avx2, aom_ifft1d_16_avx2,
+ aom_transpose_float_sse2, 8);
+}
+
+void aom_ifft32x32_float_avx2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+ aom_fft1d_32_avx2, aom_ifft1d_32_avx2,
+ aom_transpose_float_sse2, 8);
+}
diff --git a/aom_dsp/x86/fft_sse2.c b/aom_dsp/x86/fft_sse2.c
new file mode 100644
index 000000000..12bdc3e18
--- /dev/null
+++ b/aom_dsp/x86/fft_sse2.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <xmmintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+static INLINE void transpose4x4(const float *A, float *B, const int lda,
+ const int ldb) {
+ __m128 row1 = _mm_load_ps(&A[0 * lda]);
+ __m128 row2 = _mm_load_ps(&A[1 * lda]);
+ __m128 row3 = _mm_load_ps(&A[2 * lda]);
+ __m128 row4 = _mm_load_ps(&A[3 * lda]);
+ _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
+ _mm_store_ps(&B[0 * ldb], row1);
+ _mm_store_ps(&B[1 * ldb], row2);
+ _mm_store_ps(&B[2 * ldb], row3);
+ _mm_store_ps(&B[3 * ldb], row4);
+}
+
+void aom_transpose_float_sse2(const float *A, float *B, int n) {
+ for (int y = 0; y < n; y += 4) {
+ for (int x = 0; x < n; x += 4) {
+ transpose4x4(A + y * n + x, B + x * n + y, n, n);
+ }
+ }
+}
+
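+// Unpacks the packed output of the vectorized 2D real-input FFT into n x n
+// interleaved (real, imaginary) pairs, combining the quadrants of the packed
+// buffer and filling the redundant bottom rows via the conjugate symmetry of
+// a real-input FFT.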
+void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) {
+ const int n2 = n / 2;
+ output[0] = packed[0];
+ output[1] = 0;
+ output[2 * (n2 * n)] = packed[n2 * n];
+ output[2 * (n2 * n) + 1] = 0;
+
+ output[2 * n2] = packed[n2];
+ output[2 * n2 + 1] = 0;
+ output[2 * (n2 * n + n2)] = packed[n2 * n + n2];
+ output[2 * (n2 * n + n2) + 1] = 0;
+
+ for (int c = 1; c < n2; ++c) {
+ output[2 * (0 * n + c)] = packed[c];
+ output[2 * (0 * n + c) + 1] = packed[c + n2];
+ output[2 * (n2 * n + c) + 0] = packed[n2 * n + c];
+ output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2];
+ }
+ for (int r = 1; r < n2; ++r) {
+ output[2 * (r * n + 0)] = packed[r * n];
+ output[2 * (r * n + 0) + 1] = packed[(r + n2) * n];
+ output[2 * (r * n + n2) + 0] = packed[r * n + n2];
+ output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2];
+
+ for (int c = 1; c < AOMMIN(n2, 4); ++c) {
+ output[2 * (r * n + c)] =
+ packed[r * n + c] - packed[(r + n2) * n + c + n2];
+ output[2 * (r * n + c) + 1] =
+ packed[(r + n2) * n + c] + packed[r * n + c + n2];
+ }
+
+ for (int c = 4; c < n2; c += 4) {
+ __m128 real1 = _mm_load_ps(packed + r * n + c);
+ __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2);
+ __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c);
+ __m128 imag2 = _mm_load_ps(packed + r * n + c + n2);
+ real1 = _mm_sub_ps(real1, real2);
+ imag1 = _mm_add_ps(imag1, imag2);
+ _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1));
+ _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1));
+ }
+
+ int r2 = r + n2;
+ int r3 = n - r2;
+ output[2 * (r2 * n + 0)] = packed[r3 * n];
+ output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n];
+ output[2 * (r2 * n + n2)] = packed[r3 * n + n2];
+ output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2];
+ for (int c = 1; c < AOMMIN(4, n2); ++c) {
+ output[2 * (r2 * n + c)] =
+ packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2];
+ output[2 * (r2 * n + c) + 1] =
+ -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2];
+ }
+ for (int c = 4; c < n2; c += 4) {
+ __m128 real1 = _mm_load_ps(packed + r3 * n + c);
+ __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2);
+ __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c);
+ __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2);
+ real1 = _mm_add_ps(real1, real2);
+ imag1 = _mm_sub_ps(imag2, imag1);
+ _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1));
+ _mm_store_ps(output + 2 * (r2 * n + c + 2),
+ _mm_unpackhi_ps(real1, imag1));
+ }
+ }
+}
+
+// Generate definitions for 1d transforms using float and __m128
+GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
+GEN_FFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+
+void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+// Generate definitions for 1d inverse transforms using float and __m128
+GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
+GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+
+void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2,
+ aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2,
+ aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+ aom_fft1d_16_sse2, aom_ifft1d_16_sse2,
+ aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+ aom_fft1d_32_sse2, aom_ifft1d_32_sse2,
+ aom_transpose_float_sse2, 4);
+}
diff --git a/test/fft_test.cc b/test/fft_test.cc
new file mode 100644
index 000000000..6a3fd0d04
--- /dev/null
+++ b/test/fft_test.cc
@@ -0,0 +1,253 @@
+#include <math.h>
+
+#include <algorithm>
+#include <complex>
+#include <vector>
+
+#include "aom_dsp/fft_common.h"
+#include "aom_mem/aom_mem.h"
+#if ARCH_X86 || ARCH_X86_64
+#include "aom_ports/x86.h"
+#endif
+#include "av1/common/common.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+typedef void (*tform_fun_t)(const float *input, float *temp, float *output);
+
+// Simple 1D FFT implementation
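+// (radix-2 decimation-in-time: split into even/odd samples, recurse on each
+// half, then combine the halves with twiddle-factor butterflies)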
+template <typename InputType>
+void fft(const InputType *data, std::complex<float> *result, int n) {
+ if (n == 1) {
+ result[0] = data[0];
+ return;
+ }
+ std::vector<InputType> temp(n);
+ for (int k = 0; k < n / 2; ++k) {
+ temp[k] = data[2 * k];
+ temp[n / 2 + k] = data[2 * k + 1];
+ }
+ fft(&temp[0], result, n / 2);
+ fft(&temp[n / 2], result + n / 2, n / 2);
+ for (int k = 0; k < n / 2; ++k) {
+ std::complex<float> w = std::complex<float>((float)cos(2. * PI * k / n),
+ (float)-sin(2. * PI * k / n));
+ std::complex<float> a = result[k];
+ std::complex<float> b = result[n / 2 + k];
+ result[k] = a + w * b;
+ result[n / 2 + k] = a - w * b;
+ }
+}
+
+void transpose(std::vector<std::complex<float> > *data, int n) {
+ for (int y = 0; y < n; ++y) {
+ for (int x = y + 1; x < n; ++x) {
+ std::swap((*data)[y * n + x], (*data)[x * n + y]);
+ }
+ }
+}
+
+// Simple 2D FFT implementation
+template <class InputType>
+std::vector<std::complex<float> > fft2d(const InputType *input, int n) {
+ std::vector<std::complex<float> > rowfft(n * n);
+ std::vector<std::complex<float> > result(n * n);
+ for (int y = 0; y < n; ++y) {
+ fft(input + y * n, &rowfft[y * n], n);
+ }
+ transpose(&rowfft, n);
+ for (int y = 0; y < n; ++y) {
+ fft(&rowfft[y * n], &result[y * n], n);
+ }
+ transpose(&result, n);
+ return result;
+}
+
+struct FFTTestArg {
+ int n;
+ void (*fft)(const float *input, float *temp, float *output);
+ int flag;
+ FFTTestArg(int n_in, tform_fun_t fft_in, int flag_in)
+ : n(n_in), fft(fft_in), flag(flag_in) {}
+};
+
+class FFT2DTest : public ::testing::TestWithParam<FFTTestArg> {
+ protected:
+ void SetUp() {
+ int n = GetParam().n;
+ input_ = (float *)aom_memalign(32, sizeof(*input_) * n * n);
+ temp_ = (float *)aom_memalign(32, sizeof(*temp_) * n * n);
+ output_ = (float *)aom_memalign(32, sizeof(*output_) * n * n * 2);
+ memset(input_, 0, sizeof(*input_) * n * n);
+ memset(temp_, 0, sizeof(*temp_) * n * n);
+ memset(output_, 0, sizeof(*output_) * n * n * 2);
+#if ARCH_X86 || ARCH_X86_64
+ disabled_ = GetParam().flag != 0 && !(x86_simd_caps() & GetParam().flag);
+#else
+ disabled_ = GetParam().flag != 0;
+#endif
+ }
+ void TearDown() {
+ aom_free(input_);
+ aom_free(temp_);
+ aom_free(output_);
+ }
+ int disabled_;
+ float *input_;
+ float *temp_;
+ float *output_;
+};
+
+TEST_P(FFT2DTest, Correct) {
+ if (disabled_) return;
+
+ int n = GetParam().n;
+ for (int i = 0; i < n * n; ++i) {
+ input_[i] = 1;
+ std::vector<std::complex<float> > expected = fft2d<float>(&input_[0], n);
+ GetParam().fft(&input_[0], &temp_[0], &output_[0]);
+ for (int y = 0; y < n; ++y) {
+ for (int x = 0; x < (n / 2) + 1; ++x) {
+ EXPECT_NEAR(expected[y * n + x].real(), output_[2 * (y * n + x)], 1e-5);
+ EXPECT_NEAR(expected[y * n + x].imag(), output_[2 * (y * n + x) + 1],
+ 1e-5);
+ }
+ }
+ input_[i] = 0;
+ }
+}
+
+TEST_P(FFT2DTest, Benchmark) {
+ if (disabled_) return;
+
+ int n = GetParam().n;
+ float sum = 0;
+ for (int i = 0; i < 1000 * (64 - n); ++i) {
+ input_[i % (n * n)] = 1;
+ GetParam().fft(&input_[0], &temp_[0], &output_[0]);
+ sum += output_[0];
+ input_[i % (n * n)] = 0;
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ FFT2DTestC, FFT2DTest,
+ ::testing::Values(FFTTestArg(2, aom_fft2x2_float_c, 0),
+ FFTTestArg(4, aom_fft4x4_float_c, 0),
+ FFTTestArg(8, aom_fft8x8_float_c, 0),
+ FFTTestArg(16, aom_fft16x16_float_c, 0),
+ FFTTestArg(32, aom_fft32x32_float_c, 0)));
+#if ARCH_X86 || ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(
+ FFT2DTestSSE2, FFT2DTest,
+ ::testing::Values(FFTTestArg(4, aom_fft4x4_float_sse2, HAS_SSE2),
+ FFTTestArg(8, aom_fft8x8_float_sse2, HAS_SSE2),
+ FFTTestArg(16, aom_fft16x16_float_sse2, HAS_SSE2),
+ FFTTestArg(32, aom_fft32x32_float_sse2, HAS_SSE2)));
+
+INSTANTIATE_TEST_CASE_P(
+ FFT2DTestAVX2, FFT2DTest,
+ ::testing::Values(FFTTestArg(8, aom_fft8x8_float_avx2, HAS_AVX2),
+ FFTTestArg(16, aom_fft16x16_float_avx2, HAS_AVX2),
+ FFTTestArg(32, aom_fft32x32_float_avx2, HAS_AVX2)));
+#endif
+
+struct IFFTTestArg {
+ int n;
+ tform_fun_t ifft;
+ int flag;
+ IFFTTestArg(int n_in, tform_fun_t ifft_in, int flag_in)
+ : n(n_in), ifft(ifft_in), flag(flag_in) {}
+};
+
+class IFFT2DTest : public ::testing::TestWithParam<IFFTTestArg> {
+ protected:
+ void SetUp() {
+ int n = GetParam().n;
+ input_ = (float *)aom_memalign(32, sizeof(*input_) * n * n * 2);
+ temp_ = (float *)aom_memalign(32, sizeof(*temp_) * n * n * 2);
+ output_ = (float *)aom_memalign(32, sizeof(*output_) * n * n);
+ memset(input_, 0, sizeof(*input_) * n * n * 2);
+ memset(temp_, 0, sizeof(*temp_) * n * n * 2);
+ memset(output_, 0, sizeof(*output_) * n * n);
+#if ARCH_X86 || ARCH_X86_64
+ disabled_ = GetParam().flag != 0 && !(x86_simd_caps() & GetParam().flag);
+#else
+ disabled_ = GetParam().flag != 0;
+#endif
+ }
+ void TearDown() {
+ aom_free(input_);
+ aom_free(temp_);
+ aom_free(output_);
+ }
+ int disabled_;
+ float *input_;
+ float *temp_;
+ float *output_;
+};
+
+TEST_P(IFFT2DTest, Correctness) {
+ if (disabled_) return;
+ int n = GetParam().n;
+ ASSERT_GE(n, 2);
+ std::vector<float> expected(n * n);
+ std::vector<float> actual(n * n);
+  // Do the forward transform, then invert it to make sure we get back the
+  // expected input.
+ for (int y = 0; y < n; ++y) {
+ for (int x = 0; x < n; ++x) {
+ expected[y * n + x] = 1;
+ std::vector<std::complex<float> > input_c = fft2d(&expected[0], n);
+ for (int i = 0; i < n * n; ++i) {
+ input_[2 * i + 0] = input_c[i].real();
+ input_[2 * i + 1] = input_c[i].imag();
+ }
+ GetParam().ifft(&input_[0], &temp_[0], &output_[0]);
+
+ for (int yy = 0; yy < n; ++yy) {
+ for (int xx = 0; xx < n; ++xx) {
+ EXPECT_NEAR(expected[yy * n + xx], output_[yy * n + xx] / (n * n),
+ 1e-5);
+ }
+ }
+ expected[y * n + x] = 0;
+ }
+ }
+}
+
+TEST_P(IFFT2DTest, Benchmark) {
+ if (disabled_) return;
+ int n = GetParam().n;
+ float sum = 0;
+ for (int i = 0; i < 1000 * (64 - n); ++i) {
+ input_[i % (n * n)] = 1;
+ GetParam().ifft(&input_[0], &temp_[0], &output_[0]);
+ sum += output_[0];
+ input_[i % (n * n)] = 0;
+ }
+}
+INSTANTIATE_TEST_CASE_P(
+ IFFT2DTestC, IFFT2DTest,
+ ::testing::Values(IFFTTestArg(2, aom_ifft2x2_float_c, 0),
+ IFFTTestArg(4, aom_ifft4x4_float_c, 0),
+ IFFTTestArg(8, aom_ifft8x8_float_c, 0),
+ IFFTTestArg(16, aom_ifft16x16_float_c, 0),
+ IFFTTestArg(32, aom_ifft32x32_float_c, 0)));
+#if ARCH_X86 || ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(
+ IFFT2DTestSSE2, IFFT2DTest,
+ ::testing::Values(IFFTTestArg(4, aom_ifft4x4_float_sse2, HAS_SSE2),
+ IFFTTestArg(8, aom_ifft8x8_float_sse2, HAS_SSE2),
+ IFFTTestArg(16, aom_ifft16x16_float_sse2, HAS_SSE2),
+ IFFTTestArg(32, aom_ifft32x32_float_sse2, HAS_SSE2)));
+
+INSTANTIATE_TEST_CASE_P(
+ IFFT2DTestAVX2, IFFT2DTest,
+ ::testing::Values(IFFTTestArg(8, aom_ifft8x8_float_avx2, HAS_AVX2),
+ IFFTTestArg(16, aom_ifft16x16_float_avx2, HAS_AVX2),
+ IFFTTestArg(32, aom_ifft32x32_float_avx2, HAS_AVX2)));
+#endif
+} // namespace
diff --git a/test/noise_model_test.cc b/test/noise_model_test.cc
index d454c3125..26c4aa603 100644
--- a/test/noise_model_test.cc
+++ b/test/noise_model_test.cc
@@ -4,6 +4,7 @@
#include "aom_dsp/noise_model.h"
#include "aom_dsp/noise_util.h"
+#include "config/aom_dsp_rtcd.h"
#include "test/acm_random.h"
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -60,6 +61,45 @@ void noise_synth(libaom_test::ACMRandom *random, int lag, int n,
}
}
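+// Estimates the noise power spectral density by averaging the transform
+// energy of half-overlapped blocks (a Welch-style estimate), then mirrors the
+// result into the bins that aom_noise_tx_add_energy leaves empty by symmetry.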
+std::vector<float> get_noise_psd(double *noise, int width, int height,
+ int block_size) {
+  float *block =
+      (float *)aom_memalign(32, block_size * block_size * sizeof(*block));
+ std::vector<float> psd(block_size * block_size);
+ int num_blocks = 0;
+ struct aom_noise_tx_t *tx = aom_noise_tx_malloc(block_size);
+ for (int y = 0; y <= height - block_size; y += block_size / 2) {
+ for (int x = 0; x <= width - block_size; x += block_size / 2) {
+ for (int yy = 0; yy < block_size; ++yy) {
+ for (int xx = 0; xx < block_size; ++xx) {
+ block[yy * block_size + xx] = (float)noise[(y + yy) * width + x + xx];
+ }
+ }
+ aom_noise_tx_forward(tx, &block[0]);
+ aom_noise_tx_add_energy(tx, &psd[0]);
+ num_blocks++;
+ }
+ }
+ for (int yy = 0; yy < block_size; ++yy) {
+ for (int xx = 0; xx <= block_size / 2; ++xx) {
+ psd[yy * block_size + xx] /= num_blocks;
+ }
+ }
+ // Fill in the data that is missing due to symmetries
+ for (int xx = 1; xx < block_size / 2; ++xx) {
+ psd[(block_size - xx)] = psd[xx];
+ }
+ for (int yy = 1; yy < block_size; ++yy) {
+ for (int xx = 1; xx < block_size / 2; ++xx) {
+ psd[(block_size - yy) * block_size + (block_size - xx)] =
+ psd[yy * block_size + xx];
+ }
+ }
+ aom_noise_tx_free(tx);
+ aom_free(block);
+ return psd;
+}
+
} // namespace
TEST(NoiseStrengthSolver, GetCentersTwoBins) {
@@ -1098,3 +1138,193 @@ TEST(NoiseModelGetGrainParameters, GetGrainParametersReal) {
aom_noise_model_free(&model);
}
+
+template <typename T>
+class WienerDenoiseTest : public ::testing::Test, public T {
+ public:
+ static void SetUpTestCase() { aom_dsp_rtcd(); }
+
+ protected:
+ void SetUp() {
+ static const float kNoiseLevel = 5.f;
+ static const float kStd = 4.0;
+ static const double kMaxValue = (1 << T::kBitDepth) - 1;
+
+ chroma_sub_[0] = 1;
+ chroma_sub_[1] = 1;
+ stride_[0] = kWidth;
+ stride_[1] = kWidth / 2;
+ stride_[2] = kWidth / 2;
+ for (int k = 0; k < 3; ++k) {
+ data_[k].resize(kWidth * kHeight);
+ denoised_[k].resize(kWidth * kHeight);
+ noise_psd_[k].resize(kBlockSize * kBlockSize);
+ }
+
+ const double kCoeffsY[] = { 0.0406, -0.116, -0.078, -0.152, 0.0033, -0.093,
+ 0.048, 0.404, 0.2353, -0.035, -0.093, 0.441 };
+ const int kCoords[12][2] = {
+ { -2, -2 }, { -1, -2 }, { 0, -2 }, { 1, -2 }, { 2, -2 }, { -2, -1 },
+ { -1, -1 }, { 0, -1 }, { 1, -1 }, { 2, -1 }, { -2, 0 }, { -1, 0 }
+ };
+ const int kLag = 2;
+ const int kLength = 12;
+ libaom_test::ACMRandom random;
+ std::vector<double> noise(kWidth * kHeight);
+ noise_synth(&random, kLag, kLength, kCoords, kCoeffsY, &noise[0], kWidth,
+ kHeight);
+ noise_psd_[0] = get_noise_psd(&noise[0], kWidth, kHeight, kBlockSize);
+ for (int i = 0; i < kBlockSize * kBlockSize; ++i) {
+ noise_psd_[0][i] = (float)(noise_psd_[0][i] * kStd * kStd * kScaleNoise *
+ kScaleNoise / (kMaxValue * kMaxValue));
+ }
+
+ float psd_value =
+ aom_noise_psd_get_default_value(kBlockSizeChroma, kNoiseLevel);
+ for (int i = 0; i < kBlockSizeChroma * kBlockSizeChroma; ++i) {
+ noise_psd_[1][i] = psd_value;
+ noise_psd_[2][i] = psd_value;
+ }
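+    // The luma plane is a horizontal ramp (value = x) plus correlated
+    // synthesized noise; GradientTest below measures the residual noise of
+    // the denoised output against this analytic ramp.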
+ for (int y = 0; y < kHeight; ++y) {
+ for (int x = 0; x < kWidth; ++x) {
+ data_[0][y * stride_[0] + x] = (typename T::data_type_t)fclamp(
+ (x + noise[y * stride_[0] + x] * kStd) * kScaleNoise, 0, kMaxValue);
+ }
+ }
+
+ for (int c = 1; c < 3; ++c) {
+ for (int y = 0; y < (kHeight >> 1); ++y) {
+ for (int x = 0; x < (kWidth >> 1); ++x) {
+ data_[c][y * stride_[c] + x] = (typename T::data_type_t)fclamp(
+ (x + randn(&random, kStd)) * kScaleNoise, 0, kMaxValue);
+ }
+ }
+ }
+ for (int k = 0; k < 3; ++k) {
+ noise_psd_ptrs_[k] = &noise_psd_[k][0];
+ }
+ }
+ static const int kBlockSize = 32;
+ static const int kBlockSizeChroma = 16;
+ static const int kWidth = 256;
+ static const int kHeight = 256;
+ static const int kScaleNoise = 1 << (T::kBitDepth - 8);
+
+ std::vector<typename T::data_type_t> data_[3];
+ std::vector<typename T::data_type_t> denoised_[3];
+ std::vector<float> noise_psd_[3];
+ int chroma_sub_[2];
+ float *noise_psd_ptrs_[3];
+ int stride_[3];
+};
+
+TYPED_TEST_CASE_P(WienerDenoiseTest);
+
+TYPED_TEST_P(WienerDenoiseTest, InvalidBlockSize) {
+ const uint8_t *const data_ptrs[3] = {
+ reinterpret_cast<uint8_t *>(&this->data_[0][0]),
+ reinterpret_cast<uint8_t *>(&this->data_[1][0]),
+ reinterpret_cast<uint8_t *>(&this->data_[2][0]),
+ };
+ uint8_t *denoised_ptrs[3] = {
+ reinterpret_cast<uint8_t *>(&this->denoised_[0][0]),
+ reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
+ reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
+ };
+ EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+ this->kHeight, this->stride_,
+ this->chroma_sub_, this->noise_psd_ptrs_,
+ 18, this->kBitDepth, this->kUseHighBD));
+ EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+ this->kHeight, this->stride_,
+ this->chroma_sub_, this->noise_psd_ptrs_,
+ 48, this->kBitDepth, this->kUseHighBD));
+ EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+ this->kHeight, this->stride_,
+ this->chroma_sub_, this->noise_psd_ptrs_,
+ 64, this->kBitDepth, this->kUseHighBD));
+}
+
+TYPED_TEST_P(WienerDenoiseTest, InvalidChromaSubsampling) {
+ const uint8_t *const data_ptrs[3] = {
+ reinterpret_cast<uint8_t *>(&this->data_[0][0]),
+ reinterpret_cast<uint8_t *>(&this->data_[1][0]),
+ reinterpret_cast<uint8_t *>(&this->data_[2][0]),
+ };
+ uint8_t *denoised_ptrs[3] = {
+ reinterpret_cast<uint8_t *>(&this->denoised_[0][0]),
+ reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
+ reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
+ };
+ int chroma_sub[2] = { 1, 0 };
+ EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+ this->kHeight, this->stride_, chroma_sub,
+ this->noise_psd_ptrs_, 32, this->kBitDepth,
+ this->kUseHighBD));
+
+ chroma_sub[0] = 0;
+ chroma_sub[1] = 1;
+ EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+ this->kHeight, this->stride_, chroma_sub,
+ this->noise_psd_ptrs_, 32, this->kBitDepth,
+ this->kUseHighBD));
+}
+
+TYPED_TEST_P(WienerDenoiseTest, GradientTest) {
+ const int kWidth = this->kWidth;
+ const int kHeight = this->kHeight;
+ const int kBlockSize = this->kBlockSize;
+ const uint8_t *const data_ptrs[3] = {
+ reinterpret_cast<uint8_t *>(&this->data_[0][0]),
+ reinterpret_cast<uint8_t *>(&this->data_[1][0]),
+ reinterpret_cast<uint8_t *>(&this->data_[2][0]),
+ };
+ uint8_t *denoised_ptrs[3] = {
+ reinterpret_cast<uint8_t *>(&this->denoised_[0][0]),
+ reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
+ reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
+ };
+ const int ret = aom_wiener_denoise_2d(
+ data_ptrs, denoised_ptrs, kWidth, kHeight, this->stride_,
+ this->chroma_sub_, this->noise_psd_ptrs_, this->kBlockSize,
+ this->kBitDepth, this->kUseHighBD);
+ EXPECT_EQ(1, ret);
+
+  // Check the residual noise on the denoised image (measured against the
+  // analytical gradient) and make sure that it is less than what we added.
+ for (int c = 0; c < 3; ++c) {
+ std::vector<double> measured_noise(kWidth * kHeight);
+
+ double var = 0;
+    for (int x = 0; x < (kWidth >> (c > 0)); ++x) {
+      for (int y = 0; y < (kHeight >> (c > 0)); ++y) {
+ const double diff = this->denoised_[c][y * this->stride_[c] + x] -
+ x * this->kScaleNoise;
+ var += diff * diff;
+ measured_noise[y * kWidth + x] = diff;
+ }
+ }
+ var /= (kWidth * kHeight);
+ const double std = sqrt(std::max(0.0, var));
+ EXPECT_LE(std, 1.25f * this->kScaleNoise);
+ if (c == 0) {
+ std::vector<float> measured_psd =
+ get_noise_psd(&measured_noise[0], kWidth, kHeight, kBlockSize);
+ std::vector<double> measured_psd_d(kBlockSize * kBlockSize);
+ std::vector<double> noise_psd_d(kBlockSize * kBlockSize);
+ std::copy(measured_psd.begin(), measured_psd.end(),
+ measured_psd_d.begin());
+ std::copy(this->noise_psd_[0].begin(), this->noise_psd_[0].end(),
+ noise_psd_d.begin());
+ EXPECT_LT(aom_normalized_cross_correlation(
+ &measured_psd_d[0], &noise_psd_d[0], noise_psd_d.size()),
+ 0.35);
+ }
+ }
+}
+
+REGISTER_TYPED_TEST_CASE_P(WienerDenoiseTest, InvalidBlockSize,
+ InvalidChromaSubsampling, GradientTest);
+
+INSTANTIATE_TYPED_TEST_CASE_P(WienerDenoiseTestInstantiation,
+                              WienerDenoiseTest, AllBitDepthParams);
diff --git a/test/test.cmake b/test/test.cmake
index cf1b79292..143b88680 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -174,6 +174,7 @@ if(CONFIG_AV1_ENCODER)
"${AOM_ROOT}/test/blend_a64_mask_1d_test.cc"
"${AOM_ROOT}/test/blend_a64_mask_test.cc"
"${AOM_ROOT}/test/error_block_test.cc"
+ "${AOM_ROOT}/test/fft_test.cc"
"${AOM_ROOT}/test/fwht4x4_test.cc"
"${AOM_ROOT}/test/masked_sad_test.cc"
"${AOM_ROOT}/test/masked_variance_test.cc"