diff options
Diffstat (limited to 'media/libopus/celt/x86/pitch_sse.c')
-rw-r--r-- | media/libopus/celt/x86/pitch_sse.c | 185 |
1 files changed, 185 insertions, 0 deletions
diff --git a/media/libopus/celt/x86/pitch_sse.c b/media/libopus/celt/x86/pitch_sse.c new file mode 100644 index 000000000..20e73126b --- /dev/null +++ b/media/libopus/celt/x86/pitch_sse.c @@ -0,0 +1,185 @@ +/* Copyright (c) 2014, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "macros.h" +#include "celt_lpc.h" +#include "stack_alloc.h" +#include "mathops.h" +#include "pitch.h" + +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) + +#include <xmmintrin.h> +#include "arch.h" + +void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len) +{ + int j; + __m128 xsum1, xsum2; + xsum1 = _mm_loadu_ps(sum); + xsum2 = _mm_setzero_ps(); + + for (j = 0; j < len-3; j += 4) + { + __m128 x0 = _mm_loadu_ps(x+j); + __m128 yj = _mm_loadu_ps(y+j); + __m128 y3 = _mm_loadu_ps(y+j+3); + + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj)); + xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55), + _mm_shuffle_ps(yj,y3,0x49))); + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa), + _mm_shuffle_ps(yj,y3,0x9e))); + xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3)); + } + if (j < len) + { + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); + if (++j < len) + { + xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); + if (++j < len) + { + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); + } + } + } + _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); +} + + +void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, + int N, opus_val32 *xy1, opus_val32 *xy2) +{ + int i; + __m128 xsum1, xsum2; + xsum1 = _mm_setzero_ps(); + xsum2 = _mm_setzero_ps(); + for (i=0;i<N-3;i+=4) + { + __m128 xi = _mm_loadu_ps(x+i); + __m128 y1i = _mm_loadu_ps(y01+i); + __m128 y2i = _mm_loadu_ps(y02+i); + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i)); + xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i)); + } + /* Horizontal sum */ + xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1)); + xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55)); + _mm_store_ss(xy1, xsum1); + xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2)); + xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55)); + _mm_store_ss(xy2, xsum2); + for (;i<N;i++) + { + *xy1 = MAC16_16(*xy1, x[i], y01[i]); + *xy2 = MAC16_16(*xy2, x[i], y02[i]); + } +} + +opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y, + int N) +{ + int i; + float xy; + __m128 sum; + sum = _mm_setzero_ps(); + /* FIXME: We should probably go 8-way and use 2 sums. */ + for (i=0;i<N-3;i+=4) + { + __m128 xi = _mm_loadu_ps(x+i); + __m128 yi = _mm_loadu_ps(y+i); + sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi)); + } + /* Horizontal sum */ + sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); + sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55)); + _mm_store_ss(&xy, sum); + for (;i<N;i++) + { + xy = MAC16_16(xy, x[i], y[i]); + } + return xy; +} + +void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N, + opus_val16 g10, opus_val16 g11, opus_val16 g12) +{ + int i; + __m128 x0v; + __m128 g10v, g11v, g12v; + g10v = _mm_load1_ps(&g10); + g11v = _mm_load1_ps(&g11); + g12v = _mm_load1_ps(&g12); + x0v = _mm_loadu_ps(&x[-T-2]); + for (i=0;i<N-3;i+=4) + { + __m128 yi, yi2, x1v, x2v, x3v, x4v; + const opus_val32 *xp = &x[i-T-2]; + yi = _mm_loadu_ps(x+i); + x4v = _mm_loadu_ps(xp+4); +#if 0 + /* Slower version with all loads */ + x1v = _mm_loadu_ps(xp+1); + x2v = _mm_loadu_ps(xp+2); + x3v = _mm_loadu_ps(xp+3); +#else + x2v = _mm_shuffle_ps(x0v, x4v, 0x4e); + x1v = _mm_shuffle_ps(x0v, x2v, 0x99); + x3v = _mm_shuffle_ps(x2v, x4v, 0x99); +#endif + + yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v)); +#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */ + yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v))); + yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); +#else + /* Use partial sums */ + yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)), + _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); + yi = _mm_add_ps(yi, yi2); +#endif + x0v=x4v; + _mm_storeu_ps(y+i, yi); + } +#ifdef CUSTOM_MODES + for (;i<N;i++) + { + y[i] = x[i] + + MULT16_32_Q15(g10,x[i-T]) + + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1])) + + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2])); + } +#endif +} + + +#endif |