Diffstat (limited to 'third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c')
-rw-r--r-- | third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c | 334 |
1 file changed, 334 insertions, 0 deletions
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
new file mode 100644
index 000000000..5166e9e0a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
@@ -0,0 +1,334 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+#include "aom_ports/mem.h"
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+
+static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) {
+  __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr);
+  __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride));
+  return _mm_unpacklo_epi64(temp1, temp2);
+}
+
+static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) {
+  __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t *)ptr);
+  __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride));
+  __m128i temp3 = _mm_unpacklo_epi32(temp1, temp2);
+  temp1 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 2));
+  temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 3));
+  temp1 = _mm_unpacklo_epi32(temp1, temp2);
+  return _mm_unpacklo_epi64(temp3, temp1);
+}
+
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
+                                            const uint8_t *b_ptr, int b_stride,
+                                            const uint8_t *m_ptr, int m_stride,
+                                            int width, int height);
+
+static INLINE unsigned int masked_sad8xh_ssse3(
+    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
+    const uint8_t *m_ptr, int m_stride, int height);
+
+static INLINE unsigned int masked_sad4xh_ssse3(
+    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
+    const uint8_t *m_ptr, int m_stride, int height);
+
+#define MASKSADMXN_SSSE3(m, n)                                                \
+  unsigned int aom_masked_sad##m##x##n##_ssse3(                               \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                 \
+      int ref_stride, const uint8_t *msk, int msk_stride) {                   \
+    return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk,            \
+                            msk_stride, m, n);                                \
+  }
+
+#if CONFIG_EXT_PARTITION
+MASKSADMXN_SSSE3(128, 128)
+MASKSADMXN_SSSE3(128, 64)
+MASKSADMXN_SSSE3(64, 128)
+#endif  // CONFIG_EXT_PARTITION
+MASKSADMXN_SSSE3(64, 64)
+MASKSADMXN_SSSE3(64, 32)
+MASKSADMXN_SSSE3(32, 64)
+MASKSADMXN_SSSE3(32, 32)
+MASKSADMXN_SSSE3(32, 16)
+MASKSADMXN_SSSE3(16, 32)
+MASKSADMXN_SSSE3(16, 16)
+MASKSADMXN_SSSE3(16, 8)
+
+#define MASKSAD8XN_SSSE3(n)                                                   \
+  unsigned int aom_masked_sad8x##n##_ssse3(                                   \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                 \
+      int ref_stride, const uint8_t *msk, int msk_stride) {                   \
+    return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk,         \
+                               msk_stride, n);                                \
+  }
+
+MASKSAD8XN_SSSE3(16)
+MASKSAD8XN_SSSE3(8)
+MASKSAD8XN_SSSE3(4)
+
+#define MASKSAD4XN_SSSE3(n)                                                   \
+  unsigned int aom_masked_sad4x##n##_ssse3(                                   \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                 \
+      int ref_stride, const uint8_t *msk, int msk_stride) {                   \
+    return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk,         \
+                               msk_stride, n);                                \
+  }
+
+MASKSAD4XN_SSSE3(8)
+MASKSAD4XN_SSSE3(4)
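The macros above only stamp out one entry point per block size; the real work happens in the three kernels that follow. As a point of reference for what each generated function computes, here is a minimal scalar sketch; the name masked_sad_ref and the test-helper framing are ours, not part of this patch. It assumes, as the kernels do, that mask values lie in [0, 64], and applies the same rounded division by 64 as the final (sum + 31) >> 6 step in the SIMD code.

#include <stdint.h>
#include <stdlib.h>

unsigned int masked_sad_ref(const uint8_t *src, int src_stride,
                            const uint8_t *ref, int ref_stride,
                            const uint8_t *msk, int msk_stride, int width,
                            int height) {
  unsigned int sum = 0;
  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x++)
      sum += msk[x] * (unsigned int)abs(src[x] - ref[x]);
    src += src_stride;
    ref += ref_stride;
    msk += msk_stride;
  }
  return (sum + 31) >> 6;  // round to nearest while dividing by 64
}

Because the mask maxes out at 64, the result is a weighted SAD normalized back to the ordinary SAD scale: a mask of all 64s reproduces a plain SAD.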
+
+// For width a multiple of 16
+// Assumes values in m are <= 64
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
+                                            const uint8_t *b_ptr, int b_stride,
+                                            const uint8_t *m_ptr, int m_stride,
+                                            int width, int height) {
+  int y, x;
+  __m128i a, b, m, temp1, temp2;
+  __m128i res = _mm_setzero_si128();
+  __m128i one = _mm_set1_epi16(1);
+  // For each row
+  for (y = 0; y < height; y++) {
+    // Covering the full width
+    for (x = 0; x < width; x += 16) {
+      // Load a, b, m in xmm registers
+      a = _mm_loadu_si128((const __m128i *)(a_ptr + x));
+      b = _mm_loadu_si128((const __m128i *)(b_ptr + x));
+      m = _mm_loadu_si128((const __m128i *)(m_ptr + x));
+
+      // Calculate the absolute difference between a & b
+      temp1 = _mm_subs_epu8(a, b);
+      temp2 = _mm_subs_epu8(b, a);
+      temp1 = _mm_or_si128(temp1, temp2);
+
+      // Multiply by m and add adjacent pairs together
+      temp2 = _mm_maddubs_epi16(temp1, m);
+      // Widen the row result to 32-bit integers & add to running total
+      res = _mm_add_epi32(res, _mm_madd_epi16(temp2, one));
+    }
+    // Move onto the next row
+    a_ptr += a_stride;
+    b_ptr += b_stride;
+    m_ptr += m_stride;
+  }
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  // sad = (sad + 31) >> 6;
+  return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+
+static INLINE unsigned int masked_sad8xh_ssse3(
+    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
+    const uint8_t *m_ptr, int m_stride, int height) {
+  int y;
+  __m128i a, b, m, temp1, temp2, row_res;
+  __m128i res = _mm_setzero_si128();
+  __m128i one = _mm_set1_epi16(1);
+  // Add the masked SAD for 2 rows at a time
+  for (y = 0; y < height; y += 2) {
+    // Load a, b, m in xmm registers
+    a = width8_load_2rows(a_ptr, a_stride);
+    b = width8_load_2rows(b_ptr, b_stride);
+    m = width8_load_2rows(m_ptr, m_stride);
+
+    // Calculate the absolute difference between a & b
+    temp1 = _mm_subs_epu8(a, b);
+    temp2 = _mm_subs_epu8(b, a);
+    temp1 = _mm_or_si128(temp1, temp2);
+
+    // Multiply by m and add adjacent pairs together
+    row_res = _mm_maddubs_epi16(temp1, m);
+
+    // Widen the row result to 32-bit integers & add to running total
+    res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
+
+    // Move onto the next rows
+    a_ptr += a_stride * 2;
+    b_ptr += b_stride * 2;
+    m_ptr += m_stride * 2;
+  }
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  // sad = (sad + 31) >> 6;
+  return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+
+static INLINE unsigned int masked_sad4xh_ssse3(
+    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
+    const uint8_t *m_ptr, int m_stride, int height) {
+  int y;
+  __m128i a, b, m, temp1, temp2, row_res;
+  __m128i res = _mm_setzero_si128();
+  __m128i one = _mm_set1_epi16(1);
+  // Add the masked SAD for 4 rows at a time
+  for (y = 0; y < height; y += 4) {
+    // Load a, b, m in xmm registers
+    a = width4_load_4rows(a_ptr, a_stride);
+    b = width4_load_4rows(b_ptr, b_stride);
+    m = width4_load_4rows(m_ptr, m_stride);
+
+    // Calculate the absolute difference between a & b
+    temp1 = _mm_subs_epu8(a, b);
+    temp2 = _mm_subs_epu8(b, a);
+    temp1 = _mm_or_si128(temp1, temp2);
+
+    // Multiply by m and add adjacent pairs together
+    row_res = _mm_maddubs_epi16(temp1, m);
+
+    // Widen the row result to 32-bit integers & add to running total
+    res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
+
+    // Move onto the next rows
+    a_ptr += a_stride * 4;
+    b_ptr += b_stride * 4;
+    m_ptr += m_stride * 4;
+  }
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  // sad = (sad + 31) >> 6;
+  return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
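Two SSE idioms carry these low-bit-depth kernels. First, the absolute difference of unsigned bytes is formed as (a -sat b) | (b -sat a): whichever saturating subtraction goes the "wrong" way clamps to zero, so the OR keeps only |a - b|. Second, _mm_maddubs_epi16 multiplies unsigned bytes by signed bytes and sums adjacent pairs into 16-bit lanes; with |a - b| <= 255 and m <= 64 each pair sums to at most 2 * 255 * 64 = 32640 < 32767, so the saturating add inside maddubs never actually saturates. A small self-contained check of the absolute-difference identity (the buffer contents and main are illustrative; compile with SSE2 enabled):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  uint8_t a[16], b[16], out[16];
  for (int i = 0; i < 16; i++) {
    a[i] = (uint8_t)(rand() & 0xff);
    b[i] = (uint8_t)(rand() & 0xff);
  }
  __m128i va = _mm_loadu_si128((const __m128i *)a);
  __m128i vb = _mm_loadu_si128((const __m128i *)b);
  // Each saturating subtract zeroes the "wrong" direction, so OR-ing the
  // two results leaves the per-byte absolute difference.
  __m128i d = _mm_or_si128(_mm_subs_epu8(va, vb), _mm_subs_epu8(vb, va));
  _mm_storeu_si128((__m128i *)out, d);
  for (int i = 0; i < 16; i++)
    if (out[i] != (a[i] > b[i] ? a[i] - b[i] : b[i] - a[i])) return 1;
  printf("absolute-difference identity holds\n");
  return 0;
}

The follow-on _mm_madd_epi16 with a vector of ones simply widens those 16-bit partial sums to 32 bits while adding another pair, which is why the accumulator can run the whole block without overflow.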
+
+#if CONFIG_HIGHBITDEPTH
+static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr,
+                                               int stride) {
+  __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr);
+  __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride));
+  return _mm_unpacklo_epi64(temp1, temp2);
+}
+
+static INLINE unsigned int highbd_masked_sad_ssse3(
+    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
+    const uint8_t *m_ptr, int m_stride, int width, int height);
+
+static INLINE unsigned int highbd_masked_sad4xh_ssse3(
+    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
+    const uint8_t *m_ptr, int m_stride, int height);
+
+#define HIGHBD_MASKSADMXN_SSSE3(m, n)                                         \
+  unsigned int aom_highbd_masked_sad##m##x##n##_ssse3(                        \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                 \
+      int ref_stride, const uint8_t *msk, int msk_stride) {                   \
+    return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk,     \
+                                   msk_stride, m, n);                         \
+  }
+
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASKSADMXN_SSSE3(128, 128)
+HIGHBD_MASKSADMXN_SSSE3(128, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 128)
+#endif  // CONFIG_EXT_PARTITION
+HIGHBD_MASKSADMXN_SSSE3(64, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 64)
+HIGHBD_MASKSADMXN_SSSE3(32, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 16)
+HIGHBD_MASKSADMXN_SSSE3(16, 32)
+HIGHBD_MASKSADMXN_SSSE3(16, 16)
+HIGHBD_MASKSADMXN_SSSE3(16, 8)
+HIGHBD_MASKSADMXN_SSSE3(8, 16)
+HIGHBD_MASKSADMXN_SSSE3(8, 8)
+HIGHBD_MASKSADMXN_SSSE3(8, 4)
+
+#define HIGHBD_MASKSAD4XN_SSSE3(n)                                            \
+  unsigned int aom_highbd_masked_sad4x##n##_ssse3(                            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                 \
+      int ref_stride, const uint8_t *msk, int msk_stride) {                   \
+    return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk,  \
+                                      msk_stride, n);                         \
+  }
+
+HIGHBD_MASKSAD4XN_SSSE3(8)
+HIGHBD_MASKSAD4XN_SSSE3(4)
+
+// For width a multiple of 8
+// Assumes values in m are <= 64
+static INLINE unsigned int highbd_masked_sad_ssse3(
+    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
+    const uint8_t *m_ptr, int m_stride, int width, int height) {
+  int y, x;
+  __m128i a, b, m, temp1, temp2;
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
+  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
+  __m128i res = _mm_setzero_si128();
+  // For each row
+  for (y = 0; y < height; y++) {
+    // Covering the full width
+    for (x = 0; x < width; x += 8) {
+      // Load a, b, m in xmm registers
+      a = _mm_loadu_si128((const __m128i *)(a_ptr + x));
+      b = _mm_loadu_si128((const __m128i *)(b_ptr + x));
+      m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(m_ptr + x)),
+                            _mm_setzero_si128());
+
+      // Calculate the absolute difference between a & b
+      temp1 = _mm_subs_epu16(a, b);
+      temp2 = _mm_subs_epu16(b, a);
+      temp1 = _mm_or_si128(temp1, temp2);
+
+      // Multiply by m, add adjacent pairs, and accumulate the running total
+      res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
+    }
+    // Move onto the next row
+    a_ptr += a_stride;
+    b_ptr += b_stride;
+    m_ptr += m_stride;
+  }
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  // sad = (sad + 31) >> 6;
+  return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
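The high-bit-depth kernel follows the same shape on 16-bit pixels: the 8-bit mask is zero-extended to 16 bits, and _mm_madd_epi16 does the multiply and pairwise widening in one step. This is safe because a pixel difference of at most 4095 (12-bit input) times a mask of at most 64 stays well inside the 32-bit intermediate products. A hedged scalar model of this path, with highbd_masked_sad_ref as a hypothetical helper name, not part of the patch; the real code receives uint8_t pointers and recovers the uint16_t buffers through CONVERT_TO_SHORTPTR, which this sketch takes as already done:

#include <stdint.h>

// Hypothetical scalar reference for the high-bit-depth path. Pixels are
// 16-bit; the mask is still 8-bit with values assumed <= 64.
unsigned int highbd_masked_sad_ref(const uint16_t *src, int src_stride,
                                   const uint16_t *ref, int ref_stride,
                                   const uint8_t *msk, int msk_stride,
                                   int width, int height) {
  unsigned int sum = 0;
  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x++) {
      int diff = src[x] - ref[x];  // fits in int for up to 16-bit pixels
      sum += (unsigned int)(msk[x] * (diff < 0 ? -diff : diff));
    }
    src += src_stride;
    ref += ref_stride;
    msk += msk_stride;
  }
  return (sum + 31) >> 6;  // same rounded divide-by-64 as the SIMD code
}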
+
+static INLINE unsigned int highbd_masked_sad4xh_ssse3(
+    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
+    const uint8_t *m_ptr, int m_stride, int height) {
+  int y;
+  __m128i a, b, m, temp1, temp2;
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
+  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
+  __m128i res = _mm_setzero_si128();
+  // Add the masked SAD for 2 rows at a time
+  for (y = 0; y < height; y += 2) {
+    // Load a, b, m in xmm registers
+    a = highbd_width4_load_2rows(a_ptr, a_stride);
+    b = highbd_width4_load_2rows(b_ptr, b_stride);
+    temp1 = _mm_loadl_epi64((const __m128i *)m_ptr);
+    temp2 = _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride));
+    m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2),
+                          _mm_setzero_si128());
+
+    // Calculate the absolute difference between a & b
+    temp1 = _mm_subs_epu16(a, b);
+    temp2 = _mm_subs_epu16(b, a);
+    temp1 = _mm_or_si128(temp1, temp2);
+
+    // Multiply by m, add adjacent pairs, and accumulate the running total
+    res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
+
+    // Move onto the next rows
+    a_ptr += a_stride * 2;
+    b_ptr += b_stride * 2;
+    m_ptr += m_stride * 2;
+  }
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  // sad = (sad + 31) >> 6;
+  return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+#endif  // CONFIG_HIGHBITDEPTH
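Finally, a sketch of how one of the generated entry points might be exercised against the scalar reference from the first sketch; check_16x16, the buffer layout, and the random fill are illustrative, while aom_masked_sad16x16_ssse3 itself is generated by MASKSADMXN_SSSE3(16, 16) above:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

// Generated by MASKSADMXN_SSSE3(16, 16) in the file above.
unsigned int aom_masked_sad16x16_ssse3(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride,
                                       const uint8_t *msk, int msk_stride);

// Scalar reference from the earlier sketch.
unsigned int masked_sad_ref(const uint8_t *src, int src_stride,
                            const uint8_t *ref, int ref_stride,
                            const uint8_t *msk, int msk_stride, int width,
                            int height);

static void check_16x16(void) {
  uint8_t src[16 * 16], ref[16 * 16], msk[16 * 16];
  for (int i = 0; i < 16 * 16; i++) {
    src[i] = (uint8_t)(rand() & 0xff);
    ref[i] = (uint8_t)(rand() & 0xff);
    msk[i] = (uint8_t)(rand() % 65);  // the kernels assume mask values <= 64
  }
  assert(aom_masked_sad16x16_ssse3(src, 16, ref, 16, msk, 16) ==
         masked_sad_ref(src, 16, ref, 16, msk, 16, 16, 16));
}

int main(void) {
  check_16x16();
  return 0;
}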