summary | refs | log | tree | commit | diff
path: root/libopus/celt/x86
diff options
context:
space:
mode:
Diffstat (limited to 'libopus/celt/x86')
-rw-r--r-- libopus/celt/x86/celt_lpc_sse.h 66
-rw-r--r-- libopus/celt/x86/celt_lpc_sse4_1.c 89
-rw-r--r-- libopus/celt/x86/pitch_sse.c 185
-rw-r--r-- libopus/celt/x86/pitch_sse.h 192
-rw-r--r-- libopus/celt/x86/pitch_sse2.c 95
-rw-r--r-- libopus/celt/x86/pitch_sse4_1.c 195
-rw-r--r-- libopus/celt/x86/vq_sse.h 50
-rw-r--r-- libopus/celt/x86/vq_sse2.c 217
-rw-r--r-- libopus/celt/x86/x86_celt_map.c 167
-rw-r--r-- libopus/celt/x86/x86cpu.c 157
-rw-r--r-- libopus/celt/x86/x86cpu.h 95
11 files changed, 1508 insertions, 0 deletions
diff --git a/libopus/celt/x86/celt_lpc_sse.h b/libopus/celt/x86/celt_lpc_sse.h
new file mode 100644
index 0000000..7d1ecf7
--- /dev/null
+++ b/libopus/celt/x86/celt_lpc_sse.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CELT_LPC_SSE_H
+#define CELT_LPC_SSE_H
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
+/* Marks celt_fir() as overridden by the macros below (presumably tested by
+   the generic CELT header so it skips its own definition -- confirm). */
+#define OVERRIDE_CELT_FIR
+
+/* SSE4.1 fixed-point FIR filter, implemented in celt_lpc_sse4_1.c.
+   Filters N samples of x with the ord taps in num and writes the result to
+   y; arch is forwarded to the xcorr_kernel() call made by the filter. */
+void celt_fir_sse4_1(
+    const opus_val16 *x,
+    const opus_val16 *num,
+    opus_val16 *y,
+    int N,
+    int ord,
+    int arch);
+
+#if defined(OPUS_X86_PRESUME_SSE4_1)
+/* With OPUS_X86_PRESUME_SSE4_1 the SSE4.1 routine is called directly.
+   The argument is parenthesised inside the void cast so that any expression
+   passed as arch is consumed whole. */
+#define celt_fir(x, num, y, N, ord, arch) \
+    ((void)(arch), celt_fir_sse4_1(x, num, y, N, ord, arch))
+
+#else
+
+/* Otherwise the call goes through a function-pointer table (defined
+   elsewhere) indexed by the arch value masked with OPUS_ARCHMASK. */
+extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
+    const opus_val16 *x,
+    const opus_val16 *num,
+    opus_val16 *y,
+    int N,
+    int ord,
+    int arch);
+
+# define celt_fir(x, num, y, N, ord, arch) \
+    ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, arch))
+
+#endif
+#endif
+
+#endif
diff --git a/libopus/celt/x86/celt_lpc_sse4_1.c b/libopus/celt/x86/celt_lpc_sse4_1.c
new file mode 100644
index 0000000..5478568
--- /dev/null
+++ b/libopus/celt/x86/celt_lpc_sse4_1.c
@@ -0,0 +1,89 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "pitch.h"
+#include "x86cpu.h"
+
+#if defined(FIXED_POINT)
+
+/* Fixed-point FIR filter (SSE4.1 build).
+ *
+ * For every i in [0, N):
+ *   y[i] = saturate16( x[i] + round( (sum over k in [0, ord) of
+ *                                     num[k] * x[i-1-k]) >> SIG_SHIFT ) )
+ * so x must be preceded by ord samples of history (x[-ord] is read).
+ *
+ * Outputs are produced four at a time: xcorr_kernel() delivers the four
+ * tap sums, which are rounded, shifted, added to x and packed to 16 bits
+ * with signed saturation.  Any remaining outputs use the scalar macros.
+ * arch is only forwarded to xcorr_kernel().
+ */
+void celt_fir_sse4_1(const opus_val16 *x,
+                     const opus_val16 *num,
+                     opus_val16 *y,
+                     int N,
+                     int ord,
+                     int arch)
+{
+    int n, tap;
+    VARDECL(opus_val16, rnum);
+
+    opus_int32 half;
+    __m128i roundVec;
+    SAVE_STACK;
+
+    ALLOC(rnum, ord, opus_val16);
+    /* Store the taps time-reversed so the filter becomes a correlation. */
+    for (tap = 0; tap < ord; tap++)
+    {
+        rnum[ord-1-tap] = num[tap];
+    }
+    /* Half of one output step, used for round-to-nearest before the shift. */
+    half = EXTEND32(1) << SIG_SHIFT >> 1;
+    roundVec = _mm_set1_epi32(half);
+
+    n = 0;
+    while (n < N-3)
+    {
+        opus_val32 acc[4] = {0};
+        __m128i out, in;
+
+        xcorr_kernel(rnum, x+n-ord, acc, ord, arch);
+
+        out = _mm_add_epi32(_mm_loadu_si128((__m128i *)acc), roundVec);
+        out = _mm_srai_epi32(out, SIG_SHIFT);
+        in = OP_CVTEPI16_EPI32_M64(x + n);
+        out = _mm_add_epi32(out, in);
+        /* Saturating pack; only the low four 16-bit results are stored. */
+        _mm_storel_epi64((__m128i *)(y + n), _mm_packs_epi32(out, out));
+        n += 4;
+    }
+    while (n < N)
+    {
+        opus_val32 acc = 0;
+        for (tap = 0; tap < ord; tap++)
+            acc = MAC16_16(acc, rnum[tap], x[n+tap-ord]);
+        y[n] = SATURATE16(ADD32(EXTEND32(x[n]), PSHR32(acc, SIG_SHIFT)));
+        n++;
+    }
+
+    RESTORE_STACK;
+}
+
+#endif
diff --git a/libopus/celt/x86/pitch_sse.c b/libopus/celt/x86/pitch_sse.c
new file mode 100644
index 0000000..20e7312
--- /dev/null
+++ b/libopus/celt/x86/pitch_sse.c
@@ -0,0 +1,185 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "macros.h"
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
+
+#include <xmmintrin.h>
+#include "arch.h"
+
+/* Four-lag cross-correlation kernel (SSE, float build).
+ *
+ * Adds to sum[k], for k = 0..3, the sum over j in [0, len) of x[j]*y[j+k].
+ * The loads touch y[0 .. len+2], so y must provide len+3 values.
+ * Two accumulators are used: taps 0 and 2 of each group go to the first
+ * (which starts from the incoming sum[]), taps 1 and 3 to the second.
+ */
+void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
+{
+    int pos = 0;
+    int left;
+    __m128 accA = _mm_loadu_ps(sum);
+    __m128 accB = _mm_setzero_ps();
+
+    while (pos < len-3)
+    {
+        __m128 xv  = _mm_loadu_ps(x+pos);
+        __m128 ylo = _mm_loadu_ps(y+pos);     /* y[pos   .. pos+3] */
+        __m128 yhi = _mm_loadu_ps(y+pos+3);   /* y[pos+3 .. pos+6] */
+        __m128 prod;
+
+        prod = _mm_mul_ps(_mm_shuffle_ps(xv,xv,0x00), ylo);
+        accA = _mm_add_ps(accA, prod);
+        /* 0x49 selects y[pos+1 .. pos+4] out of the two loads. */
+        prod = _mm_mul_ps(_mm_shuffle_ps(xv,xv,0x55), _mm_shuffle_ps(ylo,yhi,0x49));
+        accB = _mm_add_ps(accB, prod);
+        /* 0x9e selects y[pos+2 .. pos+5] out of the two loads. */
+        prod = _mm_mul_ps(_mm_shuffle_ps(xv,xv,0xaa), _mm_shuffle_ps(ylo,yhi,0x9e));
+        accA = _mm_add_ps(accA, prod);
+        prod = _mm_mul_ps(_mm_shuffle_ps(xv,xv,0xff), yhi);
+        accB = _mm_add_ps(accB, prod);
+
+        pos += 4;
+    }
+
+    /* Up to three leftover taps, alternating accumulators as above. */
+    left = len - pos;
+    if (left > 0)
+    {
+        accA = _mm_add_ps(accA, _mm_mul_ps(_mm_load1_ps(x+pos), _mm_loadu_ps(y+pos)));
+    }
+    if (left > 1)
+    {
+        accB = _mm_add_ps(accB, _mm_mul_ps(_mm_load1_ps(x+pos+1), _mm_loadu_ps(y+pos+1)));
+    }
+    if (left > 2)
+    {
+        accA = _mm_add_ps(accA, _mm_mul_ps(_mm_load1_ps(x+pos+2), _mm_loadu_ps(y+pos+2)));
+    }
+
+    _mm_storeu_ps(sum, _mm_add_ps(accA, accB));
+}
+
+
+/* Two inner products sharing the same left operand (SSE, float build):
+ *   *xy1 = sum over i in [0, N) of x[i] * y01[i]
+ *   *xy2 = sum over i in [0, N) of x[i] * y02[i]
+ * Full groups of four are accumulated per lane and then reduced; leftover
+ * elements are added with MAC16_16 directly onto the stored results.
+ */
+void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+                         int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+    int k = 0;
+    __m128 acc1 = _mm_setzero_ps();
+    __m128 acc2 = _mm_setzero_ps();
+
+    while (k < N-3)
+    {
+        __m128 xv = _mm_loadu_ps(x+k);
+        acc1 = _mm_add_ps(acc1, _mm_mul_ps(xv, _mm_loadu_ps(y01+k)));
+        acc2 = _mm_add_ps(acc2, _mm_mul_ps(xv, _mm_loadu_ps(y02+k)));
+        k += 4;
+    }
+
+    /* Reduce each accumulator: add the upper pair onto the lower pair,
+       then lane 1 onto lane 0, and store lane 0. */
+    acc1 = _mm_add_ps(acc1, _mm_movehl_ps(acc1, acc1));
+    acc1 = _mm_add_ss(acc1, _mm_shuffle_ps(acc1, acc1, 0x55));
+    _mm_store_ss(xy1, acc1);
+
+    acc2 = _mm_add_ps(acc2, _mm_movehl_ps(acc2, acc2));
+    acc2 = _mm_add_ss(acc2, _mm_shuffle_ps(acc2, acc2, 0x55));
+    _mm_store_ss(xy2, acc2);
+
+    while (k < N)
+    {
+        *xy1 = MAC16_16(*xy1, x[k], y01[k]);
+        *xy2 = MAC16_16(*xy2, x[k], y02[k]);
+        k++;
+    }
+}
+
+/* Inner product of x and y over N elements (SSE, float build).
+ * Full groups of four are accumulated per lane and reduced to one value;
+ * the remaining elements are added with MAC16_16.  Returns the total.
+ */
+opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y,
+                               int N)
+{
+    int k = 0;
+    float total;
+    __m128 acc = _mm_setzero_ps();
+
+    /* FIXME: We should probably go 8-way and use 2 sums. */
+    while (k < N-3)
+    {
+        acc = _mm_add_ps(acc, _mm_mul_ps(_mm_loadu_ps(x+k), _mm_loadu_ps(y+k)));
+        k += 4;
+    }
+
+    /* Reduce: upper pair onto lower pair, then lane 1 onto lane 0. */
+    acc = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));
+    acc = _mm_add_ss(acc, _mm_shuffle_ps(acc, acc, 0x55));
+    _mm_store_ss(&total, acc);
+
+    while (k < N)
+    {
+        total = MAC16_16(total, x[k], y[k]);
+        k++;
+    }
+    return total;
+}
+
+void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N,
+ opus_val16 g10, opus_val16 g11, opus_val16 g12)
+{ /* Constant-gain comb filter (SSE, float build).  For each output index i:
+   *   y[i] = x[i] + g10*x[i-T]
+   *               + g11*(x[i-T+1] + x[i-T-1])
+   *               + g12*(x[i-T+2] + x[i-T-2])
+   * Four outputs are produced per iteration; reads reach back to x[-T-2].
+   * Unless CUSTOM_MODES is defined there is no scalar tail, so the last
+   * N%4 outputs are not written (callers presumably pass N as a multiple
+   * of 4 -- confirm). */
+ int i;
+ __m128 x0v;
+ __m128 g10v, g11v, g12v;
+ g10v = _mm_load1_ps(&g10); /* broadcast each gain to all four lanes */
+ g11v = _mm_load1_ps(&g11);
+ g12v = _mm_load1_ps(&g12);
+ x0v = _mm_loadu_ps(&x[-T-2]); /* first window, i.e. xp[0..3] for i == 0 */
+ for (i=0;i<N-3;i+=4)
+ {
+ __m128 yi, yi2, x1v, x2v, x3v, x4v;
+ const opus_val32 *xp = &x[i-T-2]; /* xp[k] is x[i-T-2+k] */
+ yi = _mm_loadu_ps(x+i);
+ x4v = _mm_loadu_ps(xp+4); /* xp[4..7]; becomes x0v of the next iteration */
+#if 0
+ /* Slower version with all loads */
+ x1v = _mm_loadu_ps(xp+1);
+ x2v = _mm_loadu_ps(xp+2);
+ x3v = _mm_loadu_ps(xp+3);
+#else
+ x2v = _mm_shuffle_ps(x0v, x4v, 0x4e); /* xp[2..5], built from x0v and x4v */
+ x1v = _mm_shuffle_ps(x0v, x2v, 0x99); /* xp[1..4] */
+ x3v = _mm_shuffle_ps(x2v, x4v, 0x99); /* xp[3..6] */
+#endif
+
+ yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v)); /* x[i] + g10*x[i-T] */
+#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
+ yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
+ yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+#else
+ /* Use partial sums */
+ yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
+ _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+ yi = _mm_add_ps(yi, yi2);
+#endif
+ x0v=x4v; /* slide the window forward by four samples */
+ _mm_storeu_ps(y+i, yi);
+ }
+#ifdef CUSTOM_MODES
+ for (;i<N;i++) /* scalar tail for the remaining N%4 samples */
+ {
+ y[i] = x[i]
+ + MULT16_32_Q15(g10,x[i-T])
+ + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
+ + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
+ }
+#endif
+}
+
+
+#endif
diff --git a/libopus/celt/x86/pitch_sse.h b/libopus/celt/x86/pitch_sse.h
new file mode 100644
index 0000000..e5f87ab
--- /dev/null
+++ b/libopus/celt/x86/pitch_sse.h
@@ -0,0 +1,192 @@
+/* Copyright (c) 2013 Jean-Marc Valin and John Ridges
+ Copyright (c) 2014, Cisco Systems, INC MingXiang WeiZhou MinPeng YanWang*/
+/**
+ @file pitch_sse.h
+ @brief Pitch analysis
+ */
+
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef PITCH_SSE_H
+#define PITCH_SSE_H
+
+#if defined(HAVE_CONFIG_H)
+#include "config.h"
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
+/* Fixed-point SSE4.1 kernel, implemented in pitch_sse4_1.c:
+   sum[k] += sum over j in [0, len) of x[j]*y[j+k], for k = 0..3. */
+void xcorr_kernel_sse4_1(
+    const opus_int16 *x,
+    const opus_int16 *y,
+    opus_val32 sum[4],
+    int len);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
+/* Float SSE kernel with the same contract, implemented in pitch_sse.c. */
+void xcorr_kernel_sse(
+    const opus_val16 *x,
+    const opus_val16 *y,
+    opus_val32 sum[4],
+    int len);
+#endif
+
+/* xcorr_kernel() selection.  When the matching OPUS_X86_PRESUME_* macro is
+   defined the kernel is called directly and arch is only evaluated and
+   discarded; the argument is parenthesised inside the void cast so that any
+   expression passed as arch is consumed whole. */
+#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT)
+#define OVERRIDE_XCORR_KERNEL
+#define xcorr_kernel(x, y, sum, len, arch) \
+    ((void)(arch), xcorr_kernel_sse4_1(x, y, sum, len))
+
+#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT)
+#define OVERRIDE_XCORR_KERNEL
+#define xcorr_kernel(x, y, sum, len, arch) \
+    ((void)(arch), xcorr_kernel_sse(x, y, sum, len))
+
+#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+
+/* Otherwise dispatch through a function-pointer table (defined elsewhere)
+   indexed by the arch value masked with OPUS_ARCHMASK. */
+extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
+    const opus_val16 *x,
+    const opus_val16 *y,
+    opus_val32 sum[4],
+    int len);
+
+#define OVERRIDE_XCORR_KERNEL
+#define xcorr_kernel(x, y, sum, len, arch) \
+    ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len))
+
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
+/* Fixed-point inner product using SSE4.1, implemented in pitch_sse4_1.c. */
+opus_val32 celt_inner_prod_sse4_1(
+    const opus_int16 *x,
+    const opus_int16 *y,
+    int N);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT)
+/* Fixed-point inner product using SSE2, implemented in pitch_sse2.c. */
+opus_val32 celt_inner_prod_sse2(
+    const opus_int16 *x,
+    const opus_int16 *y,
+    int N);
+#endif
+
+/* Float inner product using SSE, implemented in pitch_sse.c.  The guard is
+   OPUS_X86_MAY_HAVE_SSE (not SSE2) to match the guard around the definition
+   and around the macros below that reference the function. */
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
+opus_val32 celt_inner_prod_sse(
+    const opus_val16 *x,
+    const opus_val16 *y,
+    int N);
+#endif
+
+
+/* celt_inner_prod() selection.  When the matching OPUS_X86_PRESUME_* macro
+   is defined the routine is called directly and arch is only evaluated and
+   discarded; the argument is parenthesised inside the void cast so that any
+   expression passed as arch is consumed whole. */
+#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT)
+#define OVERRIDE_CELT_INNER_PROD
+#define celt_inner_prod(x, y, N, arch) \
+    ((void)(arch), celt_inner_prod_sse4_1(x, y, N))
+
+#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT) && !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#define OVERRIDE_CELT_INNER_PROD
+#define celt_inner_prod(x, y, N, arch) \
+    ((void)(arch), celt_inner_prod_sse2(x, y, N))
+
+#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT)
+#define OVERRIDE_CELT_INNER_PROD
+#define celt_inner_prod(x, y, N, arch) \
+    ((void)(arch), celt_inner_prod_sse(x, y, N))
+
+
+#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
+    (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+
+/* Otherwise dispatch through a function-pointer table (defined elsewhere)
+   indexed by the arch value masked with OPUS_ARCHMASK. */
+extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+    const opus_val16 *x,
+    const opus_val16 *y,
+    int N);
+
+#define OVERRIDE_CELT_INNER_PROD
+#define celt_inner_prod(x, y, N, arch) \
+    ((*CELT_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N))
+
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) /* float builds that may use SSE */
+
+#define OVERRIDE_DUAL_INNER_PROD /* dual_inner_prod() is redefined below */
+#define OVERRIDE_COMB_FILTER_CONST /* comb_filter_const() is redefined below */
+
+#undef dual_inner_prod /* drop any earlier definition before redefining */
+#undef comb_filter_const
+
+void dual_inner_prod_sse(const opus_val16 *x, /* implemented in pitch_sse.c */
+ const opus_val16 *y01,
+ const opus_val16 *y02,
+ int N,
+ opus_val32 *xy1,
+ opus_val32 *xy2);
+
+void comb_filter_const_sse(opus_val32 *y, /* implemented in pitch_sse.c */
+ opus_val32 *x,
+ int T,
+ int N,
+ opus_val16 g10,
+ opus_val16 g11,
+ opus_val16 g12);
+
+
+#if defined(OPUS_X86_PRESUME_SSE) /* direct calls; arch is evaluated and discarded */
+# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
+ ((void)(arch),dual_inner_prod_sse(x, y01, y02, N, xy1, xy2))
+
+# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
+ ((void)(arch),comb_filter_const_sse(y, x, T, N, g10, g11, g12))
+#else /* dispatch through tables indexed by (arch) & OPUS_ARCHMASK */
+
+extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( /* table defined elsewhere */
+ const opus_val16 *x,
+ const opus_val16 *y01,
+ const opus_val16 *y02,
+ int N,
+ opus_val32 *xy1,
+ opus_val32 *xy2);
+
+#define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
+ ((*DUAL_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2))
+
+extern void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])( /* table defined elsewhere */
+ opus_val32 *y,
+ opus_val32 *x,
+ int T,
+ int N,
+ opus_val16 g10,
+ opus_val16 g11,
+ opus_val16 g12);
+
+#define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
+ ((*COMB_FILTER_CONST_IMPL[(arch) & OPUS_ARCHMASK])(y, x, T, N, g10, g11, g12))
+
+#define NON_STATIC_COMB_FILTER_CONST_C /* NOTE(review): presumably gives the C comb_filter_const external linkage so the table can point at it -- confirm */
+
+#endif
+#endif
+
+#endif
diff --git a/libopus/celt/x86/pitch_sse2.c b/libopus/celt/x86/pitch_sse2.c
new file mode 100644
index 0000000..a0e7d1b
--- /dev/null
+++ b/libopus/celt/x86/pitch_sse2.c
@@ -0,0 +1,95 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#include "macros.h"
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT)
+/* Fixed-point inner product of x and y over N 16-bit samples (SSE2).
+ * Runs 16 samples per iteration into two 32-bit accumulators, then at most
+ * one 8-sample step, reduces the lanes to a single value and finishes the
+ * remaining samples with silk_SMLABB.  Returns the 32-bit total.
+ */
+opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y,
+                                int N)
+{
+    opus_int pos, blockEnd;
+    opus_int32 total;
+
+    __m128i accA, accB;
+    __m128i xv, yv;
+
+    total = 0;
+    blockEnd = N & ~15;   /* N rounded down to a multiple of 16 */
+
+    accA = _mm_setzero_si128();
+    accB = _mm_setzero_si128();
+
+    for (pos = 0; pos < blockEnd; pos += 16)
+    {
+        xv = _mm_loadu_si128((__m128i *)(x + pos));
+        yv = _mm_loadu_si128((__m128i *)(y + pos));
+        accA = _mm_add_epi32(accA, _mm_madd_epi16(xv, yv));
+
+        xv = _mm_loadu_si128((__m128i *)(x + pos + 8));
+        yv = _mm_loadu_si128((__m128i *)(y + pos + 8));
+        accB = _mm_add_epi32(accB, _mm_madd_epi16(xv, yv));
+    }
+
+    accA = _mm_add_epi32(accA, accB);
+
+    if (N - pos >= 8)
+    {
+        xv = _mm_loadu_si128((__m128i *)(x + pos));
+        yv = _mm_loadu_si128((__m128i *)(y + pos));
+        accA = _mm_add_epi32(accA, _mm_madd_epi16(xv, yv));
+        pos += 8;
+    }
+
+    /* Reduce: upper 64 bits onto the lower 64, then the second 32-bit
+       lane onto the first. */
+    accA = _mm_add_epi32(accA, _mm_unpackhi_epi64(accA, accA));
+    accA = _mm_add_epi32(accA, _mm_shufflelo_epi16(accA, 0x0E));
+    total += _mm_cvtsi128_si32(accA);
+
+    while (pos < N)
+    {
+        total = silk_SMLABB(total, x[pos], y[pos]);
+        pos++;
+    }
+
+    return total;
+}
+#endif
diff --git a/libopus/celt/x86/pitch_sse4_1.c b/libopus/celt/x86/pitch_sse4_1.c
new file mode 100644
index 0000000..a092c68
--- /dev/null
+++ b/libopus/celt/x86/pitch_sse4_1.c
@@ -0,0 +1,195 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#include "macros.h"
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
+#include <smmintrin.h>
+#include "x86cpu.h"
+
+/* Fixed-point inner product of x and y over N 16-bit samples (SSE4.1).
+ * Runs 16 samples per iteration into two 32-bit accumulators, then at most
+ * one 8-sample step and one 4-sample step, reduces the lanes to a single
+ * value and finishes the remaining samples with silk_SMLABB.
+ * Returns the 32-bit total.
+ */
+opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
+                                  int N)
+{
+    opus_int pos, blockEnd;
+    opus_int32 total;
+    __m128i accA, accB;
+    __m128i xv, yv;
+
+    total = 0;
+    blockEnd = N & ~15;   /* N rounded down to a multiple of 16 */
+
+    accA = _mm_setzero_si128();
+    accB = _mm_setzero_si128();
+
+    for (pos = 0; pos < blockEnd; pos += 16)
+    {
+        xv = _mm_loadu_si128((__m128i *)(x + pos));
+        yv = _mm_loadu_si128((__m128i *)(y + pos));
+        accA = _mm_add_epi32(accA, _mm_madd_epi16(xv, yv));
+
+        xv = _mm_loadu_si128((__m128i *)(x + pos + 8));
+        yv = _mm_loadu_si128((__m128i *)(y + pos + 8));
+        accB = _mm_add_epi32(accB, _mm_madd_epi16(xv, yv));
+    }
+
+    accA = _mm_add_epi32(accA, accB);
+
+    if (N - pos >= 8)
+    {
+        xv = _mm_loadu_si128((__m128i *)(x + pos));
+        yv = _mm_loadu_si128((__m128i *)(y + pos));
+        accA = _mm_add_epi32(accA, _mm_madd_epi16(xv, yv));
+        pos += 8;
+    }
+
+    if (N - pos >= 4)
+    {
+        /* Four samples widened to 32 bits, multiplied lane by lane. */
+        xv = OP_CVTEPI16_EPI32_M64(&x[pos]);
+        yv = OP_CVTEPI16_EPI32_M64(&y[pos]);
+        accA = _mm_add_epi32(accA, _mm_mullo_epi32(xv, yv));
+        pos += 4;
+    }
+
+    /* Reduce: upper 64 bits onto the lower 64, then the second 32-bit
+       lane onto the first. */
+    accA = _mm_add_epi32(accA, _mm_unpackhi_epi64(accA, accA));
+    accA = _mm_add_epi32(accA, _mm_shufflelo_epi16(accA, 0x0E));
+
+    total += _mm_cvtsi128_si32(accA);
+
+    while (pos < N)
+    {
+        total = silk_SMLABB(total, x[pos], y[pos]);
+        pos++;
+    }
+
+    return total;
+}
+
+/* Four-lag cross-correlation kernel for 16-bit fixed-point data (SSE4.1).
+ *
+ * On return sum[k] has been increased by the sum over j in [0, len) of
+ * x[j] * y[j+k], for k = 0..3.
+ *
+ * x   : len samples; nothing past x[len-1] is read (this relies on
+ *       OP_CVTEPI16_EPI32_M64 from x86cpu.h loading exactly four 16-bit
+ *       values, as its name suggests -- confirm).
+ * y   : the loads below touch y[0 .. len+2], so it must hold len+3 samples.
+ * sum : four running totals, read and written in place with unaligned
+ *       accesses.
+ * len : number of taps; asserted to be at least 3.
+ */
+void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len)
+{
+    int j;
+
+    __m128i vecX, vecX0, vecX1, vecX2, vecX3;
+    __m128i vecY0, vecY1, vecY2, vecY3;
+    __m128i sum0, sum1, sum2, sum3, vecSum;
+    __m128i initSum;
+
+    celt_assert(len >= 3);
+
+    sum0 = _mm_setzero_si128();
+    sum1 = _mm_setzero_si128();
+    sum2 = _mm_setzero_si128();
+    sum3 = _mm_setzero_si128();
+
+    /* Eight taps per iteration: one multiply-add per lag, each against y
+       shifted by that lag. */
+    for (j=0;j<(len-7);j+=8)
+    {
+        vecX = _mm_loadu_si128((__m128i *)(&x[j + 0]));
+        vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0]));
+        vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1]));
+        vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2]));
+        vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3]));
+
+        sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
+        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
+        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
+        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
+    }
+
+    /* Reduce each per-lag accumulator to a single value in lane 0. */
+    sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
+    sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));
+
+    sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
+    sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));
+
+    sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
+    sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));
+
+    sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
+    sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));
+
+    /* Gather the four lag totals into one vector: lane k holds lag k. */
+    vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
+                                _mm_unpacklo_epi32(sum2, sum3));
+
+    /* Four taps per iteration: each x value is broadcast and multiplied by
+       the four y values that follow its position, so lane k gets lag k. */
+    for (;j<(len-3);j+=4)
+    {
+        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
+        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
+        vecX1 = _mm_shuffle_epi32(vecX, 0x55);
+        vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
+        vecX3 = _mm_shuffle_epi32(vecX, 0xff);
+
+        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
+        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
+        vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);
+
+        sum0 = _mm_mullo_epi32(vecX0, vecY0);
+        sum1 = _mm_mullo_epi32(vecX1, vecY1);
+        sum2 = _mm_mullo_epi32(vecX2, vecY2);
+        sum3 = _mm_mullo_epi32(vecX3, vecY3);
+
+        sum0 = _mm_add_epi32(sum0, sum1);
+        sum2 = _mm_add_epi32(sum2, sum3);
+        vecSum = _mm_add_epi32(vecSum, sum0);
+        vecSum = _mm_add_epi32(vecSum, sum2);
+    }
+
+    /* One tap per iteration.  x[j] is broadcast from a scalar read: only
+       x[j] is known to be in range here, and a four-sample vector load at
+       &x[j] would read up to three samples past the end of x.  The integer
+       promotion sign-extends the 16-bit sample, matching the widening load
+       used above. */
+    for (;j<len;j++)
+    {
+        vecX0 = _mm_set1_epi32(x[j]);
+
+        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+
+        sum0 = _mm_mullo_epi32(vecX0, vecY0);
+        vecSum = _mm_add_epi32(vecSum, sum0);
+    }
+
+    /* Accumulate onto the caller's running totals. */
+    initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
+    initSum = _mm_add_epi32(initSum, vecSum);
+    _mm_storeu_si128((__m128i *)sum, initSum);
+}
+#endif
diff --git a/libopus/celt/x86/vq_sse.h b/libopus/celt/x86/vq_sse.h
new file mode 100644
index 0000000..b4efe8f
--- /dev/null
+++ b/libopus/celt/x86/vq_sse.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef VQ_SSE_H
+#define VQ_SSE_H
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT) /* float builds that may use SSE2 */
+#define OVERRIDE_OP_PVQ_SEARCH /* op_pvq_search() is defined by the macros below */
+
+opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch); /* implemented in vq_sse2.c */
+
+#if defined(OPUS_X86_PRESUME_SSE2) /* direct call, no table lookup */
+#define op_pvq_search(x, iy, K, N, arch) \
+ (op_pvq_search_sse2(x, iy, K, N, arch))
+
+#else /* dispatch through a table indexed by (arch) & OPUS_ARCHMASK */
+
+extern opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])( /* table defined elsewhere */
+ celt_norm *_X, int *iy, int K, int N, int arch);
+
+# define op_pvq_search(X, iy, K, N, arch) \
+ ((*OP_PVQ_SEARCH_IMPL[(arch) & OPUS_ARCHMASK])(X, iy, K, N, arch))
+
+#endif
+#endif
+
+#endif
diff --git a/libopus/celt/x86/vq_sse2.c b/libopus/celt/x86/vq_sse2.c
new file mode 100644
index 0000000..7750428
--- /dev/null
+++ b/libopus/celt/x86/vq_sse2.c
@@ -0,0 +1,217 @@
+/* Copyright (c) 2007-2008 CSIRO
+ Copyright (c) 2007-2009 Xiph.Org Foundation
+ Copyright (c) 2007-2016 Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "vq.h"
+#include "x86cpu.h"
+
+
+#ifndef FIXED_POINT
+/* SSE2 float PVQ search: places K pulses over the N bins of _X, writes the signed counts to iy[] and returns yy. */
+opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
+{
+ int i, j;
+ int pulsesLeft;
+ float xy, yy;
+ VARDECL(celt_norm, y); /* twice the pulse count of each bin, kept as float */
+ VARDECL(celt_norm, X); /* working copy of _X that ends up holding magnitudes only */
+ VARDECL(float, signy); /* per-bin compare mask: all ones where the input was negative */
+ __m128 signmask;
+ __m128 sums;
+ __m128i fours;
+ SAVE_STACK;
+
+ (void)arch; /* not used by this implementation */
+ /* All bits set to zero, except for the sign bit. */
+ signmask = _mm_set_ps1(-0.f);
+ fours = _mm_set_epi32(4, 4, 4, 4);
+ ALLOC(y, N+3, celt_norm); /* +3: the 4-wide accesses below start at j < N and can reach index N+2 */
+ ALLOC(X, N+3, celt_norm);
+ ALLOC(signy, N+3, float);
+
+ OPUS_COPY(X, _X, N);
+ X[N] = X[N+1] = X[N+2] = 0; /* zero padding, so it adds nothing to the magnitude sum */
+ sums = _mm_setzero_ps();
+ for (j=0;j<N;j+=4)
+ {
+ __m128 x4, s4;
+ x4 = _mm_loadu_ps(&X[j]);
+ s4 = _mm_cmplt_ps(x4, _mm_setzero_ps()); /* all-ones lanes where x < 0 */
+ /* Get rid of the sign */
+ x4 = _mm_andnot_ps(signmask, x4);
+ sums = _mm_add_ps(sums, x4);
+ /* Clear y and iy in case we don't do the projection. */
+ _mm_storeu_ps(&y[j], _mm_setzero_ps());
+ _mm_storeu_si128((__m128i*)&iy[j], _mm_setzero_si128()); /* NOTE(review): can write iy[N..N+2]; assumes the caller pads iy by 3 ints -- confirm */
+ _mm_storeu_ps(&X[j], x4);
+ _mm_storeu_ps(&signy[j], s4);
+ }
+ sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2))); /* horizontal add: fold the two 64-bit halves */
+ sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(2, 3, 0, 1))); /* fold neighbours: every lane now holds the total */
+
+ xy = yy = 0;
+
+ pulsesLeft = K;
+
+ /* Do a pre-search by projecting on the pyramid */
+ if (K > (N>>1))
+ {
+ __m128i pulses_sum;
+ __m128 yy4, xy4;
+ __m128 rcp4;
+ opus_val32 sum = _mm_cvtss_f32(sums);
+ /* If X is too small, just replace it with a pulse at 0 */
+ /* Prevents infinities and NaNs from causing too many pulses
+ to be allocated. 64 is an approximation of infinity here. */
+ if (!(sum > EPSILON && sum < 64)) /* negated form: a NaN sum fails both comparisons and lands here too */
+ {
+ X[0] = QCONST16(1.f,14);
+ j=1; do
+ X[j]=0;
+ while (++j<N);
+ sums = _mm_set_ps1(1.f);
+ }
+ /* Using K+e with e < 1 guarantees we cannot get more than K pulses. */
+ rcp4 = _mm_mul_ps(_mm_set_ps1((float)(K+.8)), _mm_rcp_ps(sums));
+ xy4 = yy4 = _mm_setzero_ps();
+ pulses_sum = _mm_setzero_si128();
+ for (j=0;j<N;j+=4)
+ {
+ __m128 rx4, x4, y4;
+ __m128i iy4;
+ x4 = _mm_loadu_ps(&X[j]);
+ rx4 = _mm_mul_ps(x4, rcp4);
+ iy4 = _mm_cvttps_epi32(rx4); /* truncating conversion: pulse count per bin */
+ pulses_sum = _mm_add_epi32(pulses_sum, iy4);
+ _mm_storeu_si128((__m128i*)&iy[j], iy4);
+ y4 = _mm_cvtepi32_ps(iy4);
+ xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
+ yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
+ /* double the y[] vector so we don't have to do it in the search loop. */
+ _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
+ }
+ pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(1, 0, 3, 2))); /* horizontal add of the four lane counts */
+ pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(2, 3, 0, 1)));
+ pulsesLeft -= _mm_cvtsi128_si32(pulses_sum); /* pulses already placed by the projection */
+ xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(1, 0, 3, 2)));
+ xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(2, 3, 0, 1)));
+ xy = _mm_cvtss_f32(xy4);
+ yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(1, 0, 3, 2)));
+ yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(2, 3, 0, 1)));
+ yy = _mm_cvtss_f32(yy4);
+ }
+ X[N] = X[N+1] = X[N+2] = -100; /* NOTE(review): presumably keeps the three padding lanes from being picked by the search below -- verify */
+ y[N] = y[N+1] = y[N+2] = 100;
+ celt_sig_assert(pulsesLeft>=0);
+
+ /* This should never happen, but just in case it does (e.g. on silence)
+ we fill the first bin with pulses. */
+ if (pulsesLeft > N+3)
+ {
+ opus_val16 tmp = (opus_val16)pulsesLeft;
+ yy = MAC16_16(yy, tmp, tmp);
+ yy = MAC16_16(yy, tmp, y[0]);
+ iy[0] += pulsesLeft;
+ pulsesLeft=0; /* nothing left for the search loop below */
+ }
+
+ for (i=0;i<pulsesLeft;i++) /* one pulse is placed per iteration */
+ {
+ int best_id;
+ __m128 xy4, yy4;
+ __m128 max, max2;
+ __m128i count;
+ __m128i pos;
+ /* The squared magnitude term gets added anyway, so we might as well
+ add it outside the loop */
+ yy = ADD16(yy, 1);
+ xy4 = _mm_load1_ps(&xy);
+ yy4 = _mm_load1_ps(&yy);
+ max = _mm_setzero_ps();
+ pos = _mm_setzero_si128();
+ count = _mm_set_epi32(3, 2, 1, 0); /* bin index carried by each lane, lowest lane first */
+ for (j=0;j<N;j+=4)
+ {
+ __m128 x4, y4, r4;
+ x4 = _mm_loadu_ps(&X[j]);
+ y4 = _mm_loadu_ps(&y[j]);
+ x4 = _mm_add_ps(x4, xy4);
+ y4 = _mm_add_ps(y4, yy4);
+ y4 = _mm_rsqrt_ps(y4);
+ r4 = _mm_mul_ps(x4, y4); /* score of each bin: (xy + X[j]) * rsqrt(yy + y[j]) */
+ /* Update the index of the max. */
+ pos = _mm_max_epi16(pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max)))); /* NOTE(review): 16-bit max on 32-bit index lanes; assumes indices fit in a signed 16-bit value -- confirm */
+ /* Update the max. */
+ max = _mm_max_ps(max, r4);
+ /* Update the indices (+4) */
+ count = _mm_add_epi32(count, fours);
+ }
+ /* Horizontal max */
+ max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2)));
+ max2 = _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1)));
+ /* Now that max2 contains the max at all positions, look at which value(s) of the
+ partial max is equal to the global max. */
+ pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2)));
+ pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos)); /* reduce the surviving indices: high half against low half */
+ pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2))); /* then the two low 32-bit lanes against each other */
+ best_id = _mm_cvtsi128_si32(pos);
+
+ /* Updating the sums of the new pulse(s) */
+ xy = ADD32(xy, EXTEND32(X[best_id]));
+ /* We're multiplying y[j] by two so we don't have to do it here */
+ yy = ADD16(yy, y[best_id]);
+
+ /* Only now that we've made the final choice, update y/iy */
+ /* Multiplying y[j] by 2 so we don't have to do it everywhere else */
+ y[best_id] += 2;
+ iy[best_id]++;
+ }
+
+ /* Put the original sign back */
+ for (j=0;j<N;j+=4)
+ {
+ __m128i y4;
+ __m128i s4;
+ y4 = _mm_loadu_si128((__m128i*)&iy[j]);
+ s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j]));
+ y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4); /* (iy + s) ^ s: s == -1 negates iy in two's complement, s == 0 leaves it unchanged */
+ _mm_storeu_si128((__m128i*)&iy[j], y4);
+ }
+ RESTORE_STACK;
+ return yy;
+}
+
+#endif
diff --git a/libopus/celt/x86/x86_celt_map.c b/libopus/celt/x86/x86_celt_map.c
new file mode 100644
index 0000000..d39d88e
--- /dev/null
+++ b/libopus/celt/x86/x86_celt_map.c
@@ -0,0 +1,167 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if defined(HAVE_CONFIG_H)
+#include "config.h"
+#endif
+
+#include "x86/x86cpu.h"
+#include "celt_lpc.h"
+#include "pitch.h"
+#include "pitch_sse.h"
+#include "vq.h"
+
+#if defined(OPUS_HAVE_RTCD)
+
+# if defined(FIXED_POINT)
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)
+
+void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( /* fixed-point celt_fir variants; slot order follows the arch levels labelled below */
+ const opus_val16 *x,
+ const opus_val16 *num,
+ opus_val16 *y,
+ int N,
+ int ord,
+ int arch
+) = {
+ celt_fir_c, /* non-sse */
+ celt_fir_c, /* sse: C version */
+ celt_fir_c, /* sse2: C version */
+ MAY_HAVE_SSE4_1(celt_fir), /* sse4.1 */
+ MAY_HAVE_SSE4_1(celt_fir) /* avx */
+};
+
+void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( /* fixed-point xcorr_kernel variants; slot order follows the arch levels labelled below */
+ const opus_val16 *x,
+ const opus_val16 *y,
+ opus_val32 sum[4],
+ int len
+) = {
+ xcorr_kernel_c, /* non-sse */
+ xcorr_kernel_c, /* sse: C version */
+ xcorr_kernel_c, /* sse2: C version */
+ MAY_HAVE_SSE4_1(xcorr_kernel), /* sse4.1 */
+ MAY_HAVE_SSE4_1(xcorr_kernel) /* avx */
+};
+
+#endif
+
+#if (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
+ (!defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2))
+
+opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( /* fixed-point celt_inner_prod variants; slot order follows the arch levels labelled below */
+ const opus_val16 *x,
+ const opus_val16 *y,
+ int N
+) = {
+ celt_inner_prod_c, /* non-sse */
+ celt_inner_prod_c, /* sse: C version */
+ MAY_HAVE_SSE2(celt_inner_prod), /* sse2 */
+ MAY_HAVE_SSE4_1(celt_inner_prod), /* sse4.1 */
+ MAY_HAVE_SSE4_1(celt_inner_prod) /* avx */
+};
+
+#endif /* run-time choice between the SSE4.1 and/or SSE2 inner product is needed */
+
+# else
+
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)
+
+void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( /* float xcorr_kernel variants; every slot above non-sse uses the SSE version */
+ const opus_val16 *x,
+ const opus_val16 *y,
+ opus_val32 sum[4],
+ int len
+) = {
+ xcorr_kernel_c, /* non-sse */
+ MAY_HAVE_SSE(xcorr_kernel), /* sse */
+ MAY_HAVE_SSE(xcorr_kernel), /* sse2 */
+ MAY_HAVE_SSE(xcorr_kernel), /* sse4.1 */
+ MAY_HAVE_SSE(xcorr_kernel) /* avx */
+};
+
+opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( /* float celt_inner_prod variants; every slot above non-sse uses the SSE version */
+ const opus_val16 *x,
+ const opus_val16 *y,
+ int N
+) = {
+ celt_inner_prod_c, /* non-sse */
+ MAY_HAVE_SSE(celt_inner_prod), /* sse */
+ MAY_HAVE_SSE(celt_inner_prod), /* sse2 */
+ MAY_HAVE_SSE(celt_inner_prod), /* sse4.1 */
+ MAY_HAVE_SSE(celt_inner_prod) /* avx */
+};
+
+void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( /* float dual_inner_prod variants; every slot above non-sse uses the SSE version */
+ const opus_val16 *x,
+ const opus_val16 *y01,
+ const opus_val16 *y02,
+ int N,
+ opus_val32 *xy1,
+ opus_val32 *xy2
+) = {
+ dual_inner_prod_c, /* non-sse */
+ MAY_HAVE_SSE(dual_inner_prod), /* sse */
+ MAY_HAVE_SSE(dual_inner_prod), /* sse2 */
+ MAY_HAVE_SSE(dual_inner_prod), /* sse4.1 */
+ MAY_HAVE_SSE(dual_inner_prod) /* avx */
+};
+
+void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])( /* float comb_filter_const variants; every slot above non-sse uses the SSE version */
+ opus_val32 *y,
+ opus_val32 *x,
+ int T,
+ int N,
+ opus_val16 g10,
+ opus_val16 g11,
+ opus_val16 g12
+) = {
+ comb_filter_const_c, /* non-sse */
+ MAY_HAVE_SSE(comb_filter_const), /* sse */
+ MAY_HAVE_SSE(comb_filter_const), /* sse2 */
+ MAY_HAVE_SSE(comb_filter_const), /* sse4.1 */
+ MAY_HAVE_SSE(comb_filter_const) /* avx */
+};
+
+
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2) /* SSE2 is possible but has to be detected at run time */
+opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])( /* float op_pvq_search variants, read by the op_pvq_search() macro in vq_sse.h */
+ celt_norm *_X, int *iy, int K, int N, int arch
+) = {
+ op_pvq_search_c, /* non-sse */
+ op_pvq_search_c, /* sse: C version */
+ MAY_HAVE_SSE2(op_pvq_search), /* sse2 */
+ MAY_HAVE_SSE2(op_pvq_search), /* sse4.1: SSE2 version */
+ MAY_HAVE_SSE2(op_pvq_search) /* avx: SSE2 version */
+};
+#endif /* OPUS_X86_MAY_HAVE_SSE2 && !OPUS_X86_PRESUME_SSE2 */
+
+#endif
+#endif
diff --git a/libopus/celt/x86/x86cpu.c b/libopus/celt/x86/x86cpu.c
new file mode 100644
index 0000000..080eb25
--- /dev/null
+++ b/libopus/celt/x86/x86cpu.c
@@ -0,0 +1,157 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "cpu_support.h"
+#include "macros.h"
+#include "main.h"
+#include "pitch.h"
+#include "x86cpu.h"
+
+#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+ (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
+ (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
+ (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))
+
+
+#if defined(_MSC_VER)
+
+#include <intrin.h>
+static _inline void cpuid(unsigned int CPUInfo[4], unsigned int InfoType) /* MSVC build: CPUID query for leaf InfoType, results written to CPUInfo */
+{
+ __cpuid((int*)CPUInfo, InfoType); /* the intrinsic takes int*, hence the cast from unsigned int* */
+}
+
+#else
+
+#if defined(CPU_INFO_BY_C)
+#include <cpuid.h>
+#endif
+
+static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType) /* non-MSVC build: CPUID for leaf InfoType; CPUInfo receives EAX, EBX, ECX, EDX in that order */
+{
+#if defined(CPU_INFO_BY_ASM)
+#if defined(__i386__) && defined(__PIC__)
+/* %ebx is the PIC register in 32-bit code, so it must not be clobbered. */
+ __asm__ __volatile__ (
+ "xchg %%ebx, %1\n" /* park the caller's %ebx in operand 1 */
+ "cpuid\n"
+ "xchg %%ebx, %1\n": /* restore %ebx; operand 1 now holds CPUID's EBX result */
+ "=a" (CPUInfo[0]),
+ "=r" (CPUInfo[1]),
+ "=c" (CPUInfo[2]),
+ "=d" (CPUInfo[3]) :
+ "0" (InfoType) /* leaf number goes in through the same register as output 0 (EAX) */
+ );
+#else
+ __asm__ __volatile__ (
+ "cpuid":
+ "=a" (CPUInfo[0]),
+ "=b" (CPUInfo[1]),
+ "=c" (CPUInfo[2]),
+ "=d" (CPUInfo[3]) :
+ "0" (InfoType) /* leaf number goes in through the same register as output 0 (EAX) */
+ );
+#endif
+#elif defined(CPU_INFO_BY_C)
+ __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3])); /* return value is not checked */
+#endif /* with neither macro defined the body is empty and CPUInfo is left untouched */
+}
+
+#endif
+
+typedef struct CPU_Feature{ /* 0/1 flags filled in by opus_cpu_feature_check() from CPUID leaf 1 */
+ /* SIMD: 128-bit */
+ int HW_SSE; /* EDX bit 25 */
+ int HW_SSE2; /* EDX bit 26 */
+ int HW_SSE41; /* ECX bit 19 */
+ /* SIMD: 256-bit */
+ int HW_AVX; /* ECX bit 28 */
+} CPU_Feature;
+
+static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
+{
+   /* Fills *cpu_feature with 0/1 flags taken from CPUID leaf 1; every flag is
+      0 when leaf 0 reports a highest leaf number below 1. */
+   unsigned int regs[4] = {0};
+   unsigned int edx = 0;
+   unsigned int ecx = 0;
+
+   /* Leaf 0: regs[0] is compared against 1 before leaf 1 is queried. */
+   cpuid(regs, 0);
+   if (regs[0] >= 1)
+   {
+      cpuid(regs, 1);
+      ecx = regs[2];
+      edx = regs[3];
+   }
+
+   cpu_feature->HW_SSE = ((edx >> 25) & 1u) != 0;
+   cpu_feature->HW_SSE2 = ((edx >> 26) & 1u) != 0;
+   cpu_feature->HW_SSE41 = ((ecx >> 19) & 1u) != 0;
+   cpu_feature->HW_AVX = ((ecx >> 28) & 1u) != 0;
+}
+
+int opus_select_arch(void)
+{
+   /* Returns how many of SSE, SSE2, SSE4.1 and AVX are available, counting
+      in that order and stopping at the first one the CPU lacks:
+      0 = none, 1 = SSE, 2 = SSE2, 3 = SSE4.1, 4 = AVX. */
+   CPU_Feature detected;
+   int ladder[4];
+   int level;
+
+   opus_cpu_feature_check(&detected);
+
+   /* Lowest instruction set first; a later entry only counts when every
+      earlier one is present too. */
+   ladder[0] = detected.HW_SSE;
+   ladder[1] = detected.HW_SSE2;
+   ladder[2] = detected.HW_SSE41;
+   ladder[3] = detected.HW_AVX;
+
+   level = 0;
+   while (level < 4)
+   {
+      if (!ladder[level])
+      {
+         /* First missing feature: the levels counted so far are the answer. */
+         break;
+      }
+      level++;
+   }
+
+   /* Either the loop stopped early or all four features are present
+      (level == 4). */
+
+   return level;
+}
+
+#endif
diff --git a/libopus/celt/x86/x86cpu.h b/libopus/celt/x86/x86cpu.h
new file mode 100644
index 0000000..1e2bf17
--- /dev/null
+++ b/libopus/celt/x86/x86cpu.h
@@ -0,0 +1,95 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(X86CPU_H)
+# define X86CPU_H
+
+# if defined(OPUS_X86_MAY_HAVE_SSE) /* MAY_HAVE_SSE(f) names f_sse when SSE code is built in */
+# define MAY_HAVE_SSE(name) name ## _sse
+# else /* no SSE code: the same name maps to the C version */
+# define MAY_HAVE_SSE(name) name ## _c
+# endif /* OPUS_X86_MAY_HAVE_SSE */
+
+# if defined(OPUS_X86_MAY_HAVE_SSE2) /* MAY_HAVE_SSE2(f) names f_sse2 when SSE2 code is built in */
+# define MAY_HAVE_SSE2(name) name ## _sse2
+# else /* no SSE2 code: the same name maps to the C version */
+# define MAY_HAVE_SSE2(name) name ## _c
+# endif /* OPUS_X86_MAY_HAVE_SSE2 */
+
+# if defined(OPUS_X86_MAY_HAVE_SSE4_1) /* MAY_HAVE_SSE4_1(f) names f_sse4_1 when SSE4.1 code is built in */
+# define MAY_HAVE_SSE4_1(name) name ## _sse4_1
+# else /* no SSE4.1 code: the same name maps to the C version */
+# define MAY_HAVE_SSE4_1(name) name ## _c
+# endif /* OPUS_X86_MAY_HAVE_SSE4_1 */
+
+# if defined(OPUS_X86_MAY_HAVE_AVX) /* MAY_HAVE_AVX(f) names f_avx when AVX code is built in */
+# define MAY_HAVE_AVX(name) name ## _avx
+# else /* no AVX code: the same name maps to the C version */
+# define MAY_HAVE_AVX(name) name ## _c
+# endif /* OPUS_X86_MAY_HAVE_AVX */
+
+# if defined(OPUS_HAVE_RTCD) /* run-time CPU detection builds only */
+int opus_select_arch(void); /* implemented in x86cpu.c; returns 0 (no SSE) up to 4 (SSE, SSE2, SSE4.1 and AVX all present) */
+# endif /* OPUS_HAVE_RTCD */
+
+/*gcc appears to emit MOVDQA's to load the argument of an _mm_cvtepi8_epi32()
+ or _mm_cvtepi16_epi32() when optimizations are disabled, even though the
+ actual PMOVSXWD instruction takes an m32 or m64. Unlike a normal memory
+ reference, these require 16-byte alignment and load a full 16 bytes (instead
+ of 4 or 8), possibly reading out of bounds.
+
+ We can insert an explicit MOVD or MOVQ using _mm_cvtsi32_si128() or
+ _mm_loadl_epi64(), which should have the same semantics as an m32 or m64
+ reference in the PMOVSXWD instruction itself, but gcc is not smart enough to
+ optimize this out when optimizations ARE enabled.
+
+ Clang, in contrast, requires us to do this always for _mm_cvtepi8_epi32
+ (which is fair, since technically the compiler is always allowed to do the
+ dereference before invoking the function implementing the intrinsic).
+ However, it is smart enough to eliminate the extra MOVD instruction.
+ For _mm_cvtepi16_epi32, it does the right thing, though does *not* optimize out
+ the extra MOVQ if it's specified explicitly */
+
+# if defined(__clang__) || !defined(__OPTIMIZE__) /* clang, or any compiler with optimizations off: explicit 32-bit load first */
+# define OP_CVTEPI8_EPI32_M32(x) \
+ (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x))))
+# else /* optimizing non-clang build: plain __m128i dereference, see the explanation above */
+# define OP_CVTEPI8_EPI32_M32(x) \
+ (_mm_cvtepi8_epi32(*(__m128i *)(x)))
+#endif /* __clang__ || !__OPTIMIZE__ */
+
+/* Similar reasoning about the instruction sequence as for the 32-bit macro above
+ applies to this 64-bit load. */
+# if defined(__clang__) || !defined(__OPTIMIZE__) /* clang, or any compiler with optimizations off: explicit 64-bit load first */
+# define OP_CVTEPI16_EPI32_M64(x) \
+ (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
+# else /* optimizing non-clang build: plain __m128i dereference */
+# define OP_CVTEPI16_EPI32_M64(x) \
+ (_mm_cvtepi16_epi32(*(__m128i *)(x)))
+# endif /* __clang__ || !__OPTIMIZE__ */
+
+#endif