summary refs log tree commit diff stats
path: root/third_party/aom/aom_dsp/x86/variance_sse2.c
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/aom/aom_dsp/x86/variance_sse2.c')
-rw-r--r-- third_party/aom/aom_dsp/x86/variance_sse2.c 166
1 files changed, 153 insertions, 13 deletions
diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c
index 7e3c5d5db..3c37e77c0 100644
--- a/third_party/aom/aom_dsp/x86/variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/variance_sse2.c
@@ -16,6 +16,7 @@
#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"
+#include "aom_dsp/blend.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_ports/mem.h"
@@ -485,7 +486,8 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
int mi_row, int mi_col, const MV *const mv,
uint8_t *comp_pred, int width, int height,
int subpel_x_q3, int subpel_y_q3,
- const uint8_t *ref, int ref_stride) {
+ const uint8_t *ref, int ref_stride,
+ int subpel_search) {
// expect xd == NULL only in tests
if (xd != NULL) {
const MB_MODE_INFO *mi = xd->mi[0];
@@ -553,7 +555,7 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
// Get convolve parameters.
- ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd);
+ ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
const InterpFilters filters =
av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
@@ -570,7 +572,10 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
}
const InterpFilterParams *filter =
- av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+ (subpel_search == 1)
+ ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
+ : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+ int filter_taps = (subpel_search == 1) ? 4 : SUBPEL_TAPS;
if (!subpel_x_q3 && !subpel_y_q3) {
if (width >= 16) {
@@ -632,15 +637,25 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
const int16_t *const kernel_y =
av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
- const int intermediate_height =
- (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+ const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
+ uint8_t *temp_start_horiz =
+ (subpel_search == 1) ? temp + (filter_taps >> 1) * MAX_SB_SIZE : temp;
+ uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
+ int intermediate_height =
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
- aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1),
- ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
- width, intermediate_height);
- aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
- MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
- width, height);
+ // TODO(Deepa): Remove the memset below when we have
+ // 4 tap simd for sse2 and ssse3.
+ if (subpel_search == 1) {
+ memset(temp_start_vert - 3 * MAX_SB_SIZE, 0, width);
+ memset(temp_start_vert - 2 * MAX_SB_SIZE, 0, width);
+ memset(temp_start_vert + (height + 2) * MAX_SB_SIZE, 0, width);
+ memset(temp_start_vert + (height + 3) * MAX_SB_SIZE, 0, width);
+ }
+ aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
+ kernel_x, 16, NULL, -1, width, intermediate_height);
+ aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
+ kernel_y, 16, width, height);
}
}
@@ -648,11 +663,11 @@ void aom_comp_avg_upsampled_pred_sse2(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride) {
+ int ref_stride, int subpel_search) {
int n;
int i;
aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
- subpel_x_q3, subpel_y_q3, ref, ref_stride);
+ subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
/*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
assert(!(width * height & 15));
n = width * height >> 4;
@@ -664,3 +679,128 @@ void aom_comp_avg_upsampled_pred_sse2(
pred += 16;
}
}
+/* Builds a masked compound prediction into comp_pred.
+ *
+ * If either subpel_x_q3 or subpel_y_q3 is non-zero, aom_upsampled_pred() is
+ * called first to fill comp_pred from ref (presumably with the sub-pel
+ * interpolated reference; see that function), and comp_pred, with a stride of
+ * width, then stands in for ref. aom_comp_mask_pred() is finally called with
+ * pred, the chosen reference, mask, mask_stride and invert_mask, and is given
+ * comp_pred as its destination.
+ *
+ * xd, cm, mi_row, mi_col, mv and subpel_search are only forwarded to
+ * aom_upsampled_pred(); this function does not inspect them itself.
+ */
+void aom_comp_mask_upsampled_pred_sse2(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+ int subpel_search) {
+ if (subpel_x_q3 | subpel_y_q3) {  // any fractional offset present
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
+ ref = comp_pred;  // the call below now reads its reference from comp_pred
+ ref_stride = width;  // comp_pred rows are packed at a stride of width
+ }
+ aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
+ mask_stride, invert_mask);
+}
+/* Blends eight 16-bit lanes of s0 and s1 using the per-lane weights in a.
+ *
+ * With M = 1 << AOM_BLEND_A64_ROUND_BITS, each output lane i is
+ *   (s0[i] * a[i] + s1[i] * (M - a[i]) + (M >> 1)) >> AOM_BLEND_A64_ROUND_BITS
+ * The products and the sum are formed in 32-bit lanes by _mm_madd_epi16, the
+ * shift is arithmetic, and the eight results are packed back to 16 bits with
+ * signed saturation by _mm_packs_epi32.
+ *
+ * NOTE(review): _mm_madd_epi16 treats its inputs as signed 16-bit values, so
+ * this assumes samples and weights stay below 0x8000 -- confirm with callers.
+ */
+static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0,
+ const __m128i s1,
+ const __m128i a) {
+ const __m128i alpha_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i round_const =
+ _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m128i a_inv = _mm_sub_epi16(alpha_max, a);  // complementary weight
+
+ const __m128i s_lo = _mm_unpacklo_epi16(s0, s1);  // s0/s1 pairs, lanes 0..3
+ const __m128i a_lo = _mm_unpacklo_epi16(a, a_inv);  // matching a/a_inv pairs
+ const __m128i pred_lo = _mm_madd_epi16(s_lo, a_lo);  // s0*a + s1*a_inv
+ const __m128i pred_l = _mm_srai_epi32(_mm_add_epi32(pred_lo, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i s_hi = _mm_unpackhi_epi16(s0, s1);  // same for lanes 4..7
+ const __m128i a_hi = _mm_unpackhi_epi16(a, a_inv);
+ const __m128i pred_hi = _mm_madd_epi16(s_hi, a_hi);
+ const __m128i pred_h = _mm_srai_epi32(_mm_add_epi32(pred_hi, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i comp = _mm_packs_epi32(pred_l, pred_h);  // back to 8 x 16-bit
+
+ return comp;
+}
+/* Masked blend of two 16-bit-sample predictions, one row at a time.
+ *
+ * comp_pred8, pred8 and ref8 are converted with CONVERT_TO_SHORTPTR and then
+ * accessed as uint16_t samples. Per row, comp_pred and pred advance by width,
+ * ref advances by ref_stride and mask advances by mask_stride. mask supplies
+ * one byte per sample, which is zero-extended to 16 bits and used as the
+ * weight passed to highbd_comp_mask_pred_line_sse2().
+ *
+ * invert_mask chooses which input receives the mask weight: when it is zero
+ * the mask weights ref and the complement weights pred; when it is non-zero
+ * the roles are swapped.
+ *
+ * NOTE(review): only width 8, 16 and 32 have a code path here. For any other
+ * width the function returns without writing comp_pred -- confirm that callers
+ * never pass other widths.
+ * NOTE(review): the row loops are do/while, so height is assumed to be >= 1.
+ */
+void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ int i = 0;
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ const uint16_t *src0 = invert_mask ? pred : ref;  // input weighted by mask
+ const uint16_t *src1 = invert_mask ? ref : pred;  // input weighted by complement
+ const int stride0 = invert_mask ? width : ref_stride;  // pred rows use width
+ const int stride1 = invert_mask ? ref_stride : width;
+ const __m128i zero = _mm_setzero_si128();
+
+ if (width == 8) {
+ do {
+ const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
+ const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
+ const __m128i m_8 = _mm_loadl_epi64((const __m128i *)mask);  // 8 mask bytes
+ const __m128i m_16 = _mm_unpacklo_epi8(m_8, zero);  // widen to 16-bit lanes
+
+ const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m_16);
+
+ _mm_storeu_si128((__m128i *)comp_pred, comp);
+
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ } else if (width == 16) {
+ do {
+ const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
+ const __m128i s2 = _mm_loadu_si128((const __m128i *)(src0 + 8));
+ const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
+ const __m128i s3 = _mm_loadu_si128((const __m128i *)(src1 + 8));
+
+ const __m128i m_8 = _mm_loadu_si128((const __m128i *)mask);  // 16 mask bytes
+ const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);  // weights for samples 0..7
+ const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);  // weights for samples 8..15
+
+ const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+ const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+
+ _mm_storeu_si128((__m128i *)comp_pred, comp);
+ _mm_storeu_si128((__m128i *)(comp_pred + 8), comp1);
+
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ } else if (width == 32) {
+ do {
+ for (int j = 0; j < 2; j++) {  // two 16-sample halves of each 32-wide row
+ const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0 + j * 16));
+ const __m128i s2 =
+ _mm_loadu_si128((const __m128i *)(src0 + 8 + j * 16));
+ const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1 + j * 16));
+ const __m128i s3 =
+ _mm_loadu_si128((const __m128i *)(src1 + 8 + j * 16));
+
+ const __m128i m_8 = _mm_loadu_si128((const __m128i *)(mask + j * 16));
+ const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
+ const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
+
+ const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+ const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+
+ _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp);
+ _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1);
+ }
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ }
+}