diff options
author | trav90 <travawine@palemoon.org> | 2018-10-19 23:05:00 -0500 |
---|---|---|
committer | trav90 <travawine@palemoon.org> | 2018-10-19 23:05:03 -0500 |
commit | d2499ead93dc4298c0882fe98902acb1b5209f99 (patch) | |
tree | cb0b942aed59e5108f9a3e9d64e7b77854383421 /third_party/aom/aom_dsp/x86/obmc_variance_sse4.c | |
parent | 41fbdea457bf50c0a43e1c27c5cbf7f0a3a9eb33 (diff) | |
download | UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.tar UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.tar.gz UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.tar.lz UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.tar.xz UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.zip |
Update libaom to commit ID 1e227d41f0616de9548a673a83a21ef990b62591
Diffstat (limited to 'third_party/aom/aom_dsp/x86/obmc_variance_sse4.c')
-rw-r--r-- | third_party/aom/aom_dsp/x86/obmc_variance_sse4.c | 41 |
1 file changed, 1 insertion, 40 deletions
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c index 2e2f6e09f..72eda0e57 100644 --- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c +++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c @@ -19,7 +19,7 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" -#include "aom_dsp/x86/obmc_intrinsic_ssse3.h" +#include "aom_dsp/x86/obmc_intrinsic_sse4.h" #include "aom_dsp/x86/synonyms.h" //////////////////////////////////////////////////////////////////////////////// @@ -36,45 +36,6 @@ void aom_var_filter_block2d_bil_second_pass_ssse3( unsigned int pixel_step, unsigned int output_height, unsigned int output_width, const uint8_t *filter); -static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride, - const int32_t *wsrc, const int32_t *mask, - unsigned int *const sse, int *const sum, - const int h) { - const int pre_step = pre_stride - 4; - int n = 0; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_d = _mm_setzero_si128(); - - assert(IS_POWER_OF_TWO(h)); - - do { - const __m128i v_p_b = xx_loadl_32(pre + n); - const __m128i v_m_d = xx_load_128(mask + n); - const __m128i v_w_d = xx_load_128(wsrc + n); - - const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b); - - // Values in both pre and mask fit in 15 bits, and are packed at 32 bit - // boundaries. We use pmaddwd, as it has lower latency on Haswell - // than pmulld but produces the same result with these inputs. 
- const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); - - const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); - const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12); - const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d); - - v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); - v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); - - n += 4; - - if (n % 4 == 0) pre += pre_step; - } while (n < 4 * h); - - *sum = xx_hsum_epi32_si32(v_sum_d); - *sse = xx_hsum_epi32_si32(v_sse_d); -} - static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *const sse, int *const sum, |