summary | refs | log | tree | commit | diff | stats
path: root/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/aom/aom_dsp/x86/obmc_variance_sse4.c')
-rw-r--r-- third_party/aom/aom_dsp/x86/obmc_variance_sse4.c 41
1 files changed, 1 insertions, 40 deletions
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
index 2e2f6e09f..72eda0e57 100644
--- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
@@ -19,7 +19,7 @@
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
#include "aom_dsp/x86/synonyms.h"
////////////////////////////////////////////////////////////////////////////////
@@ -36,45 +36,6 @@ void aom_var_filter_block2d_bil_second_pass_ssse3(
unsigned int pixel_step, unsigned int output_height,
unsigned int output_width, const uint8_t *filter);
-static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
- const int32_t *wsrc, const int32_t *mask,
- unsigned int *const sse, int *const sum,
- const int h) {
- const int pre_step = pre_stride - 4;
- int n = 0;
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_d = _mm_setzero_si128();
-
- assert(IS_POWER_OF_TWO(h));
-
- do {
- const __m128i v_p_b = xx_loadl_32(pre + n);
- const __m128i v_m_d = xx_load_128(mask + n);
- const __m128i v_w_d = xx_load_128(wsrc + n);
-
- const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
-
- // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
- // boundaries. We use pmaddwd, as it has lower latency on Haswell
- // than pmulld but produces the same result with these inputs.
- const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
-
- const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
- const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
- const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
-
- v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
- v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
-
- n += 4;
-
- if (n % 4 == 0) pre += pre_step;
- } while (n < 4 * h);
-
- *sum = xx_hsum_epi32_si32(v_sum_d);
- *sse = xx_hsum_epi32_si32(v_sse_d);
-}
-
static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
const int32_t *wsrc, const int32_t *mask,
unsigned int *const sse, int *const sum,