summaryrefslogtreecommitdiffstats
path: root/third_party/aom/av1/common/x86/convolve_2d_sse2.c
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/aom/av1/common/x86/convolve_2d_sse2.c')
-rw-r--r--third_party/aom/av1/common/x86/convolve_2d_sse2.c26
1 files changed, 20 insertions, 6 deletions
diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
index 46c2674ca..e4d352c0e 100644
--- a/third_party/aom/av1/common/x86/convolve_2d_sse2.c
+++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
@@ -31,6 +31,7 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
int i, j;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int do_average = conv_params->do_average;
const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
const __m128i zero = _mm_setzero_si128();
@@ -181,9 +182,15 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
// Accumulate values into the destination buffer
__m128i *const p = (__m128i *)&dst[i * dst_stride + j];
- _mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round));
- _mm_storeu_si128(p + 1,
- _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
+ if (do_average) {
+ _mm_storeu_si128(p + 0,
+ _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round));
+ _mm_storeu_si128(p + 1,
+ _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
+ } else {
+ _mm_storeu_si128(p + 0, res_lo_round);
+ _mm_storeu_si128(p + 1, res_hi_round);
+ }
}
}
}
@@ -204,6 +211,7 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
int i, j;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int do_average = conv_params->do_average;
const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
const __m128i zero = _mm_setzero_si128();
@@ -357,9 +365,15 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
// Accumulate values into the destination buffer
__m128i *const p = (__m128i *)&dst[i * dst_stride + j];
- _mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round));
- _mm_storeu_si128(p + 1,
- _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
+ if (do_average) {
+ _mm_storeu_si128(p + 0,
+ _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round));
+ _mm_storeu_si128(p + 1,
+ _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
+ } else {
+ _mm_storeu_si128(p + 0, res_lo_round);
+ _mm_storeu_si128(p + 1, res_hi_round);
+ }
}
}
}