path: root/third_party/aom/aom_dsp/x86
diff options
Diffstat (limited to 'third_party/aom/aom_dsp/x86')
51 files changed, 5060 insertions, 2751 deletions
diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
index 401fbdc48..5f5bf5f14 100644
--- a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
+++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
@@ -22,6 +22,13 @@ filter8_1dfunction aom_filter_block1d8_h8_sse2;
filter8_1dfunction aom_filter_block1d4_v8_sse2;
filter8_1dfunction aom_filter_block1d4_h8_sse2;
+#define aom_filter_block1d16_h4_sse2 aom_filter_block1d16_h8_sse2  // each 4-tap SSE2 name below expands to the matching 8-tap kernel
+#define aom_filter_block1d16_v4_sse2 aom_filter_block1d16_v8_sse2
+#define aom_filter_block1d8_h4_sse2 aom_filter_block1d8_h8_sse2
+#define aom_filter_block1d8_v4_sse2 aom_filter_block1d8_v8_sse2
+#define aom_filter_block1d4_h4_sse2 aom_filter_block1d4_h8_sse2
+#define aom_filter_block1d4_v4_sse2 aom_filter_block1d4_v8_sse2
filter8_1dfunction aom_filter_block1d16_v2_sse2;
filter8_1dfunction aom_filter_block1d16_h2_sse2;
filter8_1dfunction aom_filter_block1d8_v2_sse2;
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
index f3fe50372..94b5da171 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -74,6 +74,87 @@ static INLINE void xx_store2_mi128(const uint8_t *output_ptr,
_mm256_extractf128_si256(*a, 1));
+static void aom_filter_block1d4_h4_avx2(  // horizontal pass, 4 output bytes per row, using only taps 2..5 of the 8-entry kernel
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;  // start 3 bytes left of the output position; filt1Reg selects which loaded bytes get filtered
+ addFilterReg32 = _mm256_set1_epi16(32);  // rounding term added before the >> 6 below
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);  // 8 x int16 coefficients
+ filtersReg = _mm_srai_epi16(filtersReg, 1);  // halve every coefficient before narrowing to bytes
+ // convert the 16-bit (short) coefficients to 8-bit (byte) with signed
+ // saturation; both 64-bit halves of the 128-bit register hold the same data.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+ firstFilters =
+ _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));  // byte indices 2,3,4,5: taps 2..5 repeated in every 32 bits
+ filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2));  // aligned load of a shuffle mask defined outside this block
+ // double the source and destination strides: each loop iteration handles two rows
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load 16 bytes from each of two consecutive source rows
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+ // rearrange the source bytes with the shuffle mask
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+ srcRegFilt32b1_1 =
+ _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());  // sum adjacent 16-bit pair products (saturating)
+ // add the rounding term (saturating), then shift each 16-bit value right by 6
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+ // shrink each 16-bit value to 8 bits; the first lane contains the first
+ // convolve result and the second lane contains the second convolve result
+ srcRegFilt32b1_1 =
+ _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+ src_ptr += src_stride;
+ xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);  // helper not shown here; presumably writes 4 bytes to each of the two output rows
+ output_ptr += dst_stride;
+ }
+ // if output_height is odd, one row is left over (i == 1):
+ // filter it with 128-bit operations and write only 4 bytes
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+ // rearrange the source bytes with the low half of the shuffle mask
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+ srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
+ // add the rounding term (saturating), then shift each 16-bit value right by 6
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+ // shrink each 16-bit value to 8 bits with unsigned saturation;
+ // only a single row is held in this 128-bit register
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+ // save 4 bytes
+ *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);  // NOTE(review): store through a uint32_t* cast of a uint8_t* - confirm alignment/aliasing is acceptable here
+ }
static void aom_filter_block1d4_h8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
@@ -179,6 +260,100 @@ static void aom_filter_block1d4_h8_avx2(
+static void aom_filter_block1d8_h4_avx2(  // horizontal pass, 8 output bytes per row, using only taps 2..5 of the 8-entry kernel
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt2Reg, filt3Reg;
+ __m256i secondFilters, thirdFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, filtersReg32;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;  // start 3 bytes left of the output position; filt2Reg/filt3Reg select which loaded bytes get filtered
+ addFilterReg32 = _mm256_set1_epi16(32);  // rounding term added before the >> 6 below
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);  // 8 x int16 coefficients
+ filtersReg = _mm_srai_epi16(filtersReg, 1);  // halve every coefficient before narrowing to bytes
+ // convert the 16-bit (short) coefficients to 8-bit (byte) with signed
+ // saturation; both 64-bit halves of the 128-bit register hold the same data.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+ // duplicate only the second 16 bits (third and fourth byte, taps 2 and 3)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte, taps 4 and 5)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));  // aligned loads of shuffle masks from a table defined outside this block
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ // double the source and destination strides: each loop iteration handles two rows
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load 16 bytes from each of two consecutive source rows
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+ // rearrange the source bytes once per tap pair
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);  // saturating sum of the two tap-pair partial results
+ // add the rounding term (saturating), then shift each 16-bit value right by 6
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+ // shrink each 16-bit value to 8 bits with unsigned saturation
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1);
+ src_ptr += src_stride;
+ xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);  // helper not shown here; presumably writes 8 bytes to each of the two output rows
+ output_ptr += dst_stride;
+ }
+ // if output_height is odd, one row is left over (i == 1):
+ // filter it with 128-bit operations and write only 8 bytes
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+ __m128i srcRegFilt2, srcRegFilt3;
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+ // rearrange the source bytes with the low halves of the shuffle masks
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters));
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3);
+ // add the rounding term (saturating), then shift each 16-bit value right by 6
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+ // shrink each 16-bit value to 8 bits with unsigned saturation
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+ // save 8 bytes
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
+ }
static void aom_filter_block1d8_h8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
@@ -311,6 +486,121 @@ static void aom_filter_block1d8_h8_avx2(
+static void aom_filter_block1d16_h4_avx2(  // horizontal pass, 16 output bytes per row, using only taps 2..5 of the 8-entry kernel
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt2Reg, filt3Reg;
+ __m256i secondFilters, thirdFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, srcReg32b2, filtersReg32;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;  // start 3 bytes left of the output position; filt2Reg/filt3Reg select which loaded bytes get filtered
+ addFilterReg32 = _mm256_set1_epi16(32);  // rounding term added before the >> 6 below
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);  // 8 x int16 coefficients
+ filtersReg = _mm_srai_epi16(filtersReg, 1);  // halve every coefficient before narrowing to bytes
+ // convert the 16-bit (short) coefficients to 8-bit (byte) with signed
+ // saturation; both 64-bit halves of the 128-bit register hold the same data.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+ // duplicate only the second 16 bits (third and fourth byte, taps 2 and 3)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte, taps 4 and 5)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));  // aligned loads of shuffle masks from a table defined outside this block
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ // double the source and destination strides: each loop iteration handles two rows
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load 16 bytes from each of two consecutive source rows
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+ // rearrange the source bytes once per tap pair
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);  // saturating sum of the two tap-pair partial results
+ // read both rows again, 8 bytes further right
+ // (the first 8 bytes of each load overlap the earlier read)
+ srcReg32b2 =
+ xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
+ // rearrange the source bytes once per tap pair
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+ // add the rounding term (saturating), then shift each 16-bit value right by 6
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+ srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
+ // shrink each 16-bit value to 8 bits; the first lane contains the first
+ // convolve result and the second lane contains the second convolve result
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+ src_ptr += src_stride;
+ xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);  // helper only partly visible here; presumably writes one 128-bit lane to each of the two output rows
+ output_ptr += dst_stride;
+ }
+ // if output_height is odd, one row is left over (i == 1):
+ // filter it alone and write only 16 bytes
+ if (i > 0) {
+ __m256i srcReg1, srcReg12;
+ __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1;
+ srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr));  // reads 32 bytes; NOTE(review): only the first 24 are used below - confirm the extra 8 are always readable
+ srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94);  // 0x94 picks qwords 0,1,1,2: low lane = bytes 0..15, high lane = bytes 8..23
+ // rearrange the source bytes once per tap pair
+ srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg);
+ srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg);
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters);
+ srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters);
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3);
+ // add the rounding term (saturating), then shift each 16-bit value right by 6
+ srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32);
+ srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6);
+ // shrink each 16-bit value to 8 bits with unsigned saturation; the low
+ // lane holds the left 8 results and the high lane the right 8 results
+ // of this single row
+ srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1);
+ srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8);  // 0x8 picks qwords 0,2,0,0: gathers all 16 results into the low 128 bits
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr,  // aligned store: requires output_ptr to be 16-byte aligned
+ _mm256_castsi256_si128(srcRegFilt1_1));
+ }
static void aom_filter_block1d16_h8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
@@ -507,6 +797,92 @@ static void aom_filter_block1d16_h8_avx2(
+static void aom_filter_block1d8_v4_avx2(  // vertical pass, 8 output bytes per row, using only taps 2..5 of the 8-entry kernel
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg45_56_lo;
+ __m256i resReg23_34_lo, resReg45_56_lo;
+ __m256i resReglo, resReg;
+ __m256i secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ addFilterReg32 = _mm256_set1_epi16(32);  // rounding term added before the >> 6 below
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);  // 8 x int16 coefficients
+ // halve the coefficients, then convert 16 bit (short) to 8 bit (byte)
+ // with the same data in both 64-bit halves of the 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+ // duplicate only the second 16 bits (third and fourth byte, taps 2 and 3)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte, taps 4 and 5)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+ // double the source and destination strides: each loop iteration handles two rows
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+ srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);  // rows 2 and 3; rows 0 and 1 are never read
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));  // 8 bytes of row 4 in the low lane
+ // pair row 3 (high lane of srcReg23) with row 4 in one 256 bit register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);  // byte-interleave rows 2/3 (low lane) and rows 3/4 (high lane)
+ for (i = output_height; i > 1; i -= 2) {  // NOTE(review): no odd-height tail is visible in this view - confirm callers only pass even heights
+ // load 8 bytes each of rows 5 and 6 and place every two
+ // consecutive rows in the same 256 bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+ // byte-interleave rows 4/5 (low lane) and rows 5/6 (high lane)
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
+ resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
+ // add and saturate the results together
+ resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
+ // add the rounding term (saturating), then shift each 16-bit value right by 6
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+ // shrink each 16-bit value to 8 bits; the first lane contains the first
+ // convolve result and the second lane contains the second convolve
+ // result
+ resReg = _mm256_packus_epi16(resReglo, resReglo);
+ src_ptr += src_stride;
+ xx_storeu2_epi64(output_ptr, out_pitch, &resReg);  // helper not shown here; presumably writes 8 bytes to each of the two output rows
+ output_ptr += dst_stride;
+ // carry rows 4/5/6 over: they become rows 2/3/4 of the next iteration
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg4x = srcReg6x;
+ }
static void aom_filter_block1d8_v8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
@@ -659,6 +1035,104 @@ static void aom_filter_block1d8_v8_avx2(
+static void aom_filter_block1d16_v4_avx2(  // vertical pass, 16 output bytes per row, using only taps 2..5 of the 8-entry kernel
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi;
+ __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi;
+ __m256i resReglo, resReghi, resReg;
+ __m256i secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ addFilterReg32 = _mm256_set1_epi16(32);  // rounding term added before the >> 6 below
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);  // 8 x int16 coefficients
+ // halve the coefficients, then convert 16 bit (short) to 8 bit (byte)
+ // with the same data in both 64-bit halves of the 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+ // duplicate only the second 16 bits (third and fourth byte, taps 2 and 3)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte, taps 4 and 5)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+ // double the source and destination strides: each loop iteration handles two rows
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+ srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);  // rows 2 and 3; rows 0 and 1 are never read
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));  // 16 bytes of row 4 in the low lane
+ // pair row 3 (high lane of srcReg23) with row 4 in one 256 bit register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);  // byte-interleave rows 2/3 and 3/4: low 8 columns
+ srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34);  // same interleave for the high 8 columns
+ for (i = output_height; i > 1; i -= 2) {  // NOTE(review): no odd-height tail is visible in this view - confirm callers only pass even heights
+ // load 16 bytes each of rows 5 and 6 and place every two
+ // consecutive rows in the same 256 bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+ // byte-interleave rows 4/5 (low lane) and rows 5/6 (high lane)
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+ srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56);
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
+ resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
+ // add and saturate the results together
+ resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters);
+ resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters);
+ // add and saturate the results together
+ resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi);
+ // add the rounding term (saturating), then shift each 16-bit value right by 6
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReghi = _mm256_adds_epi16(resReghi, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+ resReghi = _mm256_srai_epi16(resReghi, 6);
+ // shrink each 16-bit value to 8 bits; the first lane contains the first
+ // convolve result and the second lane contains the second convolve
+ // result
+ resReg = _mm256_packus_epi16(resReglo, resReghi);
+ src_ptr += src_stride;
+ xx_store2_mi128(output_ptr, out_pitch, &resReg);  // helper only partly visible here; presumably writes one 128-bit lane to each of the two output rows
+ output_ptr += dst_stride;
+ // carry rows 4/5/6 over: they become rows 2/3/4 of the next iteration
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg23_34_hi = srcReg45_56_hi;
+ srcReg4x = srcReg6x;
+ }
static void aom_filter_block1d16_v8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
@@ -854,6 +1328,88 @@ static void aom_filter_block1d16_v8_avx2(
+static void aom_filter_block1d4_v4_avx2(  // vertical pass, 4 output bytes per row, using only taps 2..5 of the 8-entry kernel
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg45_56_lo;
+ __m256i srcReg2345_3456_lo;
+ __m256i resReglo, resReg;
+ __m256i firstFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ addFilterReg32 = _mm256_set1_epi16(32);  // rounding term added before the >> 6 below
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);  // 8 x int16 coefficients
+ // halve the coefficients, then convert 16 bit (short) to 8 bit (byte)
+ // with the same data in both 64-bit halves of the 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+ firstFilters =
+ _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));  // byte indices 2,3,4,5: taps 2..5 repeated in every 32 bits
+ // double the source and destination strides: each loop iteration handles two rows
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+ srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);  // rows 2 and 3; rows 0 and 1 are never read
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));  // 8 bytes of row 4 in the low lane
+ // pair row 3 (high lane of srcReg23) with row 4 in one 256 bit register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);  // byte-interleave rows 2/3 (low lane) and rows 3/4 (high lane)
+ for (i = output_height; i > 1; i -= 2) {  // NOTE(review): no odd-height tail is visible in this view - confirm callers only pass even heights
+ // load 8 bytes each of rows 5 and 6 and place every two
+ // consecutive rows in the same 256 bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+ // byte-interleave rows 4/5 (low lane) and rows 5/6 (high lane)
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+ srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);  // group each column's 4 row bytes together: rows 2..5 (low lane), rows 3..6 (high lane)
+ // multiply the 4 grouped row bytes with the 4 taps and add the results
+ resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
+ resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256());
+ // add the rounding term (saturating), then shift each 16-bit value right by 6
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+ // shrink each 16-bit value to 8 bits; the first lane contains the first
+ // convolve result and the second lane contains the second convolve
+ // result
+ resReg = _mm256_packus_epi16(resReglo, resReglo);
+ src_ptr += src_stride;
+ xx_storeu2_epi32(output_ptr, out_pitch, &resReg);  // helper not shown here; presumably writes 4 bytes to each of the two output rows
+ output_ptr += dst_stride;
+ // carry rows 4/5/6 over: they become rows 2/3/4 of the next iteration
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg4x = srcReg6x;
+ }
filter8_1dfunction aom_filter_block1d4_v8_ssse3;
filter8_1dfunction aom_filter_block1d16_v2_ssse3;
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
index 6bcb4a512..325a21b76 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -287,6 +287,13 @@ filter8_1dfunction aom_filter_block1d8_h8_ssse3;
filter8_1dfunction aom_filter_block1d4_v8_ssse3;
filter8_1dfunction aom_filter_block1d4_h8_ssse3;
+#define aom_filter_block1d16_h4_ssse3 aom_filter_block1d16_h8_ssse3  // each 4-tap SSSE3 name below expands to the matching 8-tap kernel
+#define aom_filter_block1d16_v4_ssse3 aom_filter_block1d16_v8_ssse3
+#define aom_filter_block1d8_h4_ssse3 aom_filter_block1d8_h8_ssse3
+#define aom_filter_block1d8_v4_ssse3 aom_filter_block1d8_v8_ssse3
+#define aom_filter_block1d4_h4_ssse3 aom_filter_block1d4_h8_ssse3
+#define aom_filter_block1d4_v4_ssse3 aom_filter_block1d4_v8_ssse3
filter8_1dfunction aom_filter_block1d16_v2_ssse3;
filter8_1dfunction aom_filter_block1d16_h2_ssse3;
filter8_1dfunction aom_filter_block1d8_v2_ssse3;
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
new file mode 100644
index 000000000..67fb4d32b
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
@@ -0,0 +1,900 @@
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <smmintrin.h> // SSE4.1
+#include <immintrin.h> // AVX2
+#include <assert.h>
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/blend_sse4.h"
+#include "aom_dsp/x86/blend_mask_sse4.h"
+#include "config/aom_dsp_rtcd.h"
+static INLINE void blend_a64_d16_mask_w16_avx2(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval,
+ int shift) {
+ const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
+ const __m256i s0_0 = yy_loadu_256(src0);
+ const __m256i s1_0 = yy_loadu_256(src1);
+ __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
+ _mm256_unpacklo_epi16(*m0, max_minus_m0));
+ __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
+ _mm256_unpackhi_epi16(*m0, max_minus_m0));
+ res0_lo =
+ _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
+ res0_hi =
+ _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
+ const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
+ __m256i res = _mm256_packus_epi16(res0, res0);
+ res = _mm256_permute4x64_epi64(res, 0xd8);
+ _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res));
+static INLINE void blend_a64_d16_mask_w32_avx2(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset,
+ const __m256i *v_maxval, int shift) {
+ const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
+ const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1);
+ const __m256i s0_0 = yy_loadu_256(src0);
+ const __m256i s0_1 = yy_loadu_256(src0 + 16);
+ const __m256i s1_0 = yy_loadu_256(src1);
+ const __m256i s1_1 = yy_loadu_256(src1 + 16);
+ __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
+ _mm256_unpacklo_epi16(*m0, max_minus_m0));
+ __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
+ _mm256_unpackhi_epi16(*m0, max_minus_m0));
+ __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1),
+ _mm256_unpacklo_epi16(*m1, max_minus_m1));
+ __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1),
+ _mm256_unpackhi_epi16(*m1, max_minus_m1));
+ res0_lo =
+ _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
+ res0_hi =
+ _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
+ res1_lo =
+ _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift);
+ res1_hi =
+ _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift);
+ const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
+ const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi);
+ __m256i res = _mm256_packus_epi16(res0, res1);
+ res = _mm256_permute4x64_epi64(res, 0xd8);
+ _mm256_storeu_si256((__m256i *)(dst), res);
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ const __m128i m = xx_loadu_128(mask);
+ const __m256i m0 = _mm256_cvtepu8_epi16(m);
+ blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 32) {
+ const __m256i m = yy_loadu_256(mask + j);
+ const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m));
+ const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1));
+ blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i two_w = _mm256_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ const __m256i m_i00 = yy_loadu_256(mask);
+ const __m256i m_i10 = yy_loadu_256(mask + mask_stride);
+ const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
+ const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
+ const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
+ blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i two_w = _mm256_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 32) {
+ const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
+ const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
+ const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j);
+ const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32);
+ const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
+ const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11);
+ const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
+ const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b);
+ const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
+ const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2);
+ blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i zeros = _mm256_setzero_si256();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
+ const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
+ const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
+ blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i zeros = _mm256_setzero_si256();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 32) {
+ const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
+ const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
+ const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
+ const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b);
+ const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
+ const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros);
+ blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + j);
+ const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
+ const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
+ const __m256i m0 = _mm256_cvtepu8_epi16(m_ac);
+ blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i zeros = _mm256_setzero_si256();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 32) {
+ const __m256i m_i00 = yy_loadu_256(mask + j);
+ const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j);
+ const __m256i m_ac =
+ _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros);
+ const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac));
+ const __m256i m1 =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1));
+ blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+// Entry point for the low-bitdepth "d16" masked blend: combines two
+// CONV_BUF_TYPE intermediate buffers (src0, src1) into the 8-bit dst using
+// an 8-bit mask, dispatching on block width and on the mask subsampling
+// flags.
+//
+//   w, h        block size; both must be powers of two and >= 4 (asserted).
+//   subw, subh  1 when the mask is stored at twice the block resolution
+//               horizontally / vertically, 0 otherwise.
+//   conv_params supplies round_0 / round_1, from which the rounding offset
+//               and the final shift are derived.
+//
+// Widths 4 and 8 are forwarded to the SSE4.1 kernels, width 16 to the AVX2
+// w16 kernels, and every other width to the AVX2 w32 kernels, which walk
+// each row in steps of 32 pixels.
+void aom_lowbd_blend_a64_d16_mask_avx2(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+    ConvolveParams *conv_params) {
+  const int bd = 8;
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
+  // The kernels subtract this offset from the mask-weighted sum *before*
+  // shifting right by `shift`, so the offset must carry the same
+  // AOM_BLEND_A64_ROUND_BITS scaling that `shift` removes.
+  const int round_offset =
+      ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
+       (1 << (round_bits - 1)))
+      << AOM_BLEND_A64_ROUND_BITS;
+
+  const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
+  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 4);
+  assert(w >= 4);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+  // Same offset broadcast twice: 128-bit for the SSE4.1 kernels, 256-bit for
+  // the AVX2 kernels.
+  const __m128i v_round_offset = _mm_set1_epi32(round_offset);
+  const __m256i y_round_offset = _mm256_set1_epi32(round_offset);
+
+  if (subw == 0 && subh == 0) {
+    switch (w) {
+      case 4:
+        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      case 8:
+        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      case 16:
+        lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &y_round_offset, shift);
+        break;
+      default:
+        lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, w, &y_round_offset, shift);
+        break;
+    }
+  } else if (subw == 1 && subh == 1) {
+    switch (w) {
+      case 4:
+        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      case 8:
+        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      case 16:
+        lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &y_round_offset, shift);
+        break;
+      default:
+        lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, w, &y_round_offset, shift);
+        break;
+    }
+  } else if (subw == 1 && subh == 0) {
+    switch (w) {
+      case 4:
+        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      case 8:
+        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      case 16:
+        lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, w, &y_round_offset, shift);
+        break;
+      default:
+        lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, w, &y_round_offset, shift);
+        break;
+    }
+  } else {
+    // Remaining combination: vertical-only mask subsampling.
+    switch (w) {
+      case 4:
+        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      case 8:
+        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      case 16:
+        lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, w, &y_round_offset, shift);
+        break;
+      default:
+        lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, w, &y_round_offset, shift);
+        break;
+    }
+  }
+}
+static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1,
+ const __m256i *v_m0_b,
+ const __m256i *v_m1_b,
+ const int32_t bits) {
+ const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0));
+ const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1));
+ const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8);
+ const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8);
+ const __m256i v_p0_w =
+ _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b),
+ _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
+ const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
+ const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w);
+ const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8);
+ return v_res;
+static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1,
+ const __m256i *v_m0_b,
+ const __m256i *v_m1_b,
+ const int32_t bits) {
+ const __m256i v_s0_b = yy_loadu_256(src0);
+ const __m256i v_s1_b = yy_loadu_256(src1);
+ const __m256i v_p0_w =
+ _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b),
+ _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
+ const __m256i v_p1_w =
+ _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b),
+ _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b));
+ const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
+ const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits);
+ const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w);
+ return v_res;
+// Blends a 16-pixel-wide block whose mask is subsampled in both directions.
+// For every output row, two consecutive mask rows of 32 bytes are summed and
+// adjacent columns are combined, so each blend weight comes from a 2x2 group
+// of mask bytes.
+//
+// h is the number of output rows; the do/while form requires h >= 1. Each
+// iteration advances dst/src0/src1 by one row and mask by two rows.
+static INLINE void blend_a64_mask_sx_sy_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h) {
+  const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
+  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  do {
+    const __m256i v_ral_b = yy_loadu_256(mask);
+    const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride);
+    // Vertical sum per byte.
+    const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
+    // Even-indexed bytes widened to 16-bit lanes.
+    const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
+    // Odd-indexed bytes: shift down one byte, then widen the same way.
+    const __m256i v_rvsbl_w =
+        _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
+    const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
+
+    // NOTE(review): yy_roundn_epu16(x, 2) is presumably a rounding right
+    // shift by 2 (i.e. the average of the four summed bytes) - confirm in
+    // synonyms_avx2.h.
+    const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2);
+    const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w);
+    // Complementary weight for src1.
+    const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+    const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
+                                             AOM_BLEND_A64_ROUND_BITS);
+
+    // Only the low 128 bits hold the 16 output pixels.
+    xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b));
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+static INLINE void blend_a64_mask_sx_sy_w32n_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
+ do {
+ int c;
+ for (c = 0; c < w; c += 32) {
+ const __m256i v_ral_b = yy_loadu_256(mask + 2 * c);
+ const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32);
+ const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c);
+ const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32);
+ const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
+ const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b);
+ const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
+ const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b);
+ const __m256i v_rvsbl_w =
+ _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
+ const __m256i v_rvsbh_w =
+ _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b);
+ const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
+ const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w);
+ const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2);
+ const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2);
+ const __m256i v_m0_b =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+ const __m256i v_res_b = blend_32_u8_avx2(
+ src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+ yy_storeu_256(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+static INLINE void blend_a64_mask_sx_sy_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ switch (w) {
+ case 4:
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+ xx_storel_32(dst, v_res_b);
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+ break;
+ case 8:
+ do {
+ const __m128i v_ra_b = xx_loadu_128(mask);
+ const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+ xx_storel_64(dst, v_res_b);
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+ break;
+ case 16:
+ blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h);
+ break;
+ default:
+ blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ break;
+ }
+// Blends a 16-pixel-wide block whose mask is subsampled horizontally only:
+// each blend weight is the rounded average (_mm256_avg_epu8) of two adjacent
+// mask bytes, so one output row consumes 32 mask bytes from a single row.
+//
+// h is the number of output rows; the do/while form requires h >= 1.
+static INLINE void blend_a64_mask_sx_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h) {
+  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m256i v_zmask_b = _mm256_set1_epi16(0xff);
+  do {
+    const __m256i v_rl_b = yy_loadu_256(mask);
+    // Average every byte with its right-hand neighbour; the even-indexed
+    // results are the per-pair averages.
+    const __m256i v_al_b =
+        _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1));
+
+    // Keep only the even-indexed averages, one per 16-bit lane, then pack
+    // them back down to bytes.
+    const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b);
+    const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256());
+    // Complementary weight for src1.
+    const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+    const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
+                                             AOM_BLEND_A64_ROUND_BITS);
+
+    // Only the low 128 bits hold the 16 output pixels.
+    xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b));
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+static INLINE void blend_a64_mask_sx_w32n_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle);
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ int c;
+ for (c = 0; c < w; c += 32) {
+ const __m256i v_r0_b = yy_loadu_256(mask + 2 * c);
+ const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32);
+ const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b);
+ const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b);
+ const __m256i v_al_b =
+ _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8));
+ const __m256i v_ah_b =
+ _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8));
+ const __m256i v_m0_b =
+ _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+ const __m256i v_res_b = blend_32_u8_avx2(
+ src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+ yy_storeu_256(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+static INLINE void blend_a64_mask_sx_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ switch (w) {
+ case 4:
+ do {
+ const __m128i v_r_b = xx_loadl_64(mask);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+ xx_storel_32(dst, v_res_b);
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ case 8:
+ do {
+ const __m128i v_r_b = xx_loadu_128(mask);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+ xx_storel_64(dst, v_res_b);
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ case 16:
+ blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h);
+ break;
+ default:
+ blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ break;
+ }
+// Blends a 16-pixel-wide block whose mask is subsampled vertically only:
+// each blend weight is the rounded average (_mm_avg_epu8) of the two mask
+// bytes in the same column of two consecutive mask rows.
+//
+// h is the number of output rows; the do/while form requires h >= 1. Each
+// iteration advances dst/src0/src1 by one row and mask by two rows.
+static INLINE void blend_a64_mask_sy_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h) {
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  do {
+    const __m128i v_ra_b = xx_loadu_128(mask);
+    const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+    // Byte-wise subtraction, as in every sibling kernel. A 16-bit subtract
+    // yields the same bytes only while each weight is <= the max alpha;
+    // otherwise it would borrow across byte boundaries.
+    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+    const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+    xx_storeu_128(dst, v_res_b);
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+static INLINE void blend_a64_mask_sy_w32n_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ int c;
+ for (c = 0; c < w; c += 32) {
+ const __m256i v_ra_b = yy_loadu_256(mask + c);
+ const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride);
+ const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+ const __m256i v_res_b = blend_32_u8_avx2(
+ src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+ yy_storeu_256(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+static INLINE void blend_a64_mask_sy_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ switch (w) {
+ case 4:
+ do {
+ const __m128i v_ra_b = xx_loadl_32(mask);
+ const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+ xx_storel_32(dst, v_res_b);
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+ break;
+ case 8:
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+ xx_storel_64(dst, v_res_b);
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+ break;
+ case 16:
+ blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h);
+ break;
+ default:
+ blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ }
+static INLINE void blend_a64_mask_w32n_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ int c;
+ for (c = 0; c < w; c += 32) {
+ const __m256i v_m0_b = yy_loadu_256(mask + c);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+ const __m256i v_res_b = blend_32_u8_avx2(
+ src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+ yy_storeu_256(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+static INLINE void blend_a64_mask_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ switch (w) {
+ case 4:
+ do {
+ const __m128i v_m0_b = xx_loadl_32(mask);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+ xx_storel_32(dst, v_res_b);
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ case 8:
+ do {
+ const __m128i v_m0_b = xx_loadl_64(mask);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+ xx_storel_64(dst, v_res_b);
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ case 16:
+ do {
+ const __m128i v_m0_b = xx_loadu_128(mask);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+ xx_storeu_128(dst, v_res_b);
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ default:
+ blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ }
+void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w,
+ int h, int subx, int suby) {
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, mask_stride, w, h, subx, suby);
+ } else {
+ if (subx & suby) {
+ blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ } else if (subx) {
+ blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ } else if (suby) {
+ blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ } else {
+ blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, mask_stride, w, h);
+ }
+ }
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
index 49c20b467..9d6b4c2f7 100644
--- a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
+++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -20,6 +20,7 @@
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/blend_sse4.h"
+#include "aom_dsp/x86/blend_mask_sse4.h"
#include "config/aom_dsp_rtcd.h"
@@ -32,19 +33,13 @@ static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int w, int h) {
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
do {
const __m128i v_m0_b = xx_loadl_32(mask);
- const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
- const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
- const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
xx_storel_32(dst, v_res_b);
dst += dst_stride;
@@ -59,19 +54,13 @@ static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int w, int h) {
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
do {
const __m128i v_m0_b = xx_loadl_64(mask);
- const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
- const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
- const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
xx_storel_64(dst, v_res_b);
dst += dst_stride;
@@ -85,23 +74,17 @@ static void blend_a64_mask_w16n_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
do {
int c;
for (c = 0; c < w; c += 16) {
- const __m128i v_m0l_b = xx_loadl_64(mask + c);
- const __m128i v_m0h_b = xx_loadl_64(mask + c + 8);
- const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b);
- const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b);
- const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
- const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
- const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
- const __m128i v_resh_w =
- blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
+ const __m128i v_m0_b = xx_loadu_128(mask + c);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
xx_storeu_128(dst + c, v_res_b);
@@ -120,23 +103,20 @@ static void blend_a64_mask_sx_w4_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
- 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
do {
const __m128i v_r_b = xx_loadl_64(mask);
- const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
- const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
- const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
- const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
xx_storel_32(dst, v_res_b);
dst += dst_stride;
@@ -150,22 +130,20 @@ static void blend_a64_mask_sx_w8_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
- 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
do {
const __m128i v_r_b = xx_loadu_128(mask);
- const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
- const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
- const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
xx_storel_64(dst, v_res_b);
@@ -180,28 +158,24 @@ static void blend_a64_mask_sx_w16n_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
- 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
do {
int c;
for (c = 0; c < w; c += 16) {
- const __m128i v_rl_b = xx_loadu_128(mask + 2 * c);
- const __m128i v_rh_b = xx_loadu_128(mask + 2 * c + 16);
- const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1));
- const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1));
- const __m128i v_m0l_w = _mm_and_si128(v_al_b, v_zmask_b);
- const __m128i v_m0h_w = _mm_and_si128(v_ah_b, v_zmask_b);
- const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
- const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
- const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
- const __m128i v_resh_w =
- blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
- const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+ const __m128i v_r0_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b);
+ const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
xx_storeu_128(dst + c, v_res_b);
@@ -220,21 +194,18 @@ static void blend_a64_mask_sy_w4_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
do {
const __m128i v_ra_b = xx_loadl_32(mask);
const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
- const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
- const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
- const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
xx_storel_32(dst, v_res_b);
@@ -249,21 +220,16 @@ static void blend_a64_mask_sy_w8_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
do {
const __m128i v_ra_b = xx_loadl_64(mask);
const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
- const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
- const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
- const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
- const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
xx_storel_64(dst, v_res_b);
@@ -278,26 +244,18 @@ static void blend_a64_mask_sy_w16n_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_zero = _mm_setzero_si128();
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
do {
int c;
for (c = 0; c < w; c += 16) {
const __m128i v_ra_b = xx_loadu_128(mask + c);
const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
- const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b);
- const __m128i v_m0h_w = _mm_unpackhi_epi8(v_a_b, v_zero);
- const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
- const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
- const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
- const __m128i v_resh_w =
- blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
- const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
xx_storeu_128(dst + c, v_res_b);
@@ -316,27 +274,24 @@ static void blend_a64_mask_sx_sy_w4_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
- 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
do {
const __m128i v_ra_b = xx_loadl_64(mask);
const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
- const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
- const __m128i v_rvsb_w =
- _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
- const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
- const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
xx_storel_32(dst, v_res_b);
@@ -351,27 +306,25 @@ static void blend_a64_mask_sx_sy_w8_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
- 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
do {
const __m128i v_ra_b = xx_loadu_128(mask);
const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
- const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
- const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
- const __m128i v_rvsb_w =
- _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
- const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
- const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
xx_storel_64(dst, v_res_b);
@@ -388,8 +341,8 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1(
const uint8_t *mask, uint32_t mask_stride, int w, int h) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
do {
int c;
for (c = 0; c < w; c += 16) {
@@ -410,14 +363,11 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1(
const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
- const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
- const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
- const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
- const __m128i v_resh_w =
- blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
xx_storeu_128(dst + c, v_res_b);
@@ -921,24 +871,140 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
-static INLINE void blend_a64_d16_mask(uint8_t *dst, const CONV_BUF_TYPE *src0,
- const CONV_BUF_TYPE *src1,
- const __m128i *m,
- const __m128i *v_round_offset,
- const __m128i *v_maxval, int round_bits) {
- const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
- const __m128i s0 = xx_loadl_64(src0);
- const __m128i s1 = xx_loadl_64(src1);
- const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1);
- const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m);
- const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m);
- const __m128i res_b = _mm_srli_epi32(res_a, AOM_BLEND_A64_ROUND_BITS);
- const __m128i res_c = _mm_sub_epi32(res_b, *v_round_offset);
- const __m128i res_d = xx_roundn_epi32(res_c, round_bits);
- const __m128i res_e = _mm_packs_epi32(res_d, res_d);
- const __m128i res = _mm_packus_epi16(res_e, res_e);
- xx_storel_32(dst, res);
+// Blends 16 consecutive samples of src0/src1 and stores 16 bytes to dst.
+// m0 holds the 16-bit mask weights for samples 0..7 and m1 for samples 8..15.
+// For every sample the code forms s0 * m + s1 * (*v_maxval - m) as a 32-bit
+// value, subtracts *v_round_offset, shifts right arithmetically by `shift`,
+// then narrows with saturation to 16 and finally to unsigned 8 bits.
+// NOTE(review): the `+ 8` element offset for the second 128-bit load assumes
+// CONV_BUF_TYPE is a 16-bit type -- confirm.
+static INLINE void blend_a64_d16_mask_w16_sse41(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset,
+ const __m128i *v_maxval, int shift) {
+ // Complementary weights applied to src1.
+ const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0);
+ const __m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1);
+ const __m128i s0_0 = xx_loadu_128(src0);
+ const __m128i s0_1 = xx_loadu_128(src0 + 8);
+ const __m128i s1_0 = xx_loadu_128(src1);
+ const __m128i s1_1 = xx_loadu_128(src1 + 8);
+ // Interleaving (s0, s1) with (m, max - m) lets one madd produce the full
+ // weighted sum of each sample pair in a 32-bit lane.
+ __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0),
+ _mm_unpacklo_epi16(*m0, max_minus_m0));
+ __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0),
+ _mm_unpackhi_epi16(*m0, max_minus_m0));
+ __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1),
+ _mm_unpacklo_epi16(*m1, max_minus_m1));
+ __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1),
+ _mm_unpackhi_epi16(*m1, max_minus_m1));
+ // Offset removal happens before the shift, so the offset must already be
+ // expressed at the pre-shift scale by the caller.
+ res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift);
+ res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift);
+ res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift);
+ res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift);
+ // 32 -> 16 (signed saturation) -> 8 (unsigned saturation).
+ const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi);
+ const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi);
+ const __m128i res = _mm_packus_epi16(res0, res1);
+ _mm_storeu_si128((__m128i *)(dst), res);
+// Row/column driver for the case where the mask is used at full resolution:
+// one mask byte per output pixel. Processes h rows in steps of 16 columns;
+// the inner loop runs while j < w with j += 16, so w is expected to be a
+// multiple of 16 (the visible caller routes w == 4 and w == 8 elsewhere).
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ // Load 16 mask bytes and zero-extend each half to eight 16-bit weights.
+ const __m128i m = xx_loadu_128(mask + j);
+ const __m128i m0 = _mm_cvtepu8_epi16(m);
+ const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8));
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ // Every buffer, including the mask, advances by one row.
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+// Row/column driver for a mask that has twice the width and twice the height
+// of the output: each output weight is the rounded mean of a 2x2 group of
+// mask bytes, (a + b + c + d + 2) >> 2. Processes h output rows in steps of
+// 16 columns.
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ // 32 mask bytes from each of two consecutive mask rows cover 16 outputs.
+ const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
+ const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
+ const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j);
+ const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16);
+ // Vertical sum of the two rows (unsigned saturating byte add).
+ const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10);
+ const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11);
+ // Multiplying by 1 and adding adjacent bytes gives the horizontal pair
+ // sum as 16-bit lanes, i.e. the 2x2 total.
+ const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b);
+ const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b);
+ // Rounded divide by four.
+ const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2);
+ const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2);
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ // The mask consumes two rows per output row.
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+// Row/column driver for a mask that has twice the width of the output but the
+// same height: each output weight is the rounded mean of two horizontally
+// adjacent mask bytes, (a + b + 1) >> 1. Processes h rows in steps of 16
+// columns.
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ // 32 mask bytes from the current mask row cover 16 outputs.
+ const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
+ const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
+ // Horizontal pair sums as 16-bit lanes.
+ const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b);
+ const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b);
+ // Averaging with zero computes (sum + 1) >> 1 in a single instruction.
+ const __m128i m0 = _mm_avg_epu16(m0_ac, zeros);
+ const __m128i m1 = _mm_avg_epu16(m1_ac, zeros);
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ // The mask advances one row per output row.
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+// Row/column driver for a mask that has twice the height of the output but
+// the same width: each output weight is the rounded mean of two vertically
+// adjacent mask bytes, (a + c + 1) >> 1. Processes h output rows in steps of
+// 16 columns.
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + j);
+ const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
+ // Saturating byte add of the two rows, then average with zero to get
+ // (sum + 1) >> 1 while staying in 8-bit lanes.
+ // NOTE(review): exact only while the byte sum stays below the 255
+ // saturation point -- presumably guaranteed by the mask range; confirm.
+ const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
+ // Zero-extend each half to eight 16-bit weights.
+ const __m128i m0 = _mm_cvtepu8_epi16(m_ac);
+ const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8));
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ // The mask consumes two rows per output row.
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
void aom_lowbd_blend_a64_d16_mask_sse4_1(
@@ -947,12 +1013,15 @@ void aom_lowbd_blend_a64_d16_mask_sse4_1(
const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
ConvolveParams *conv_params) {
const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1));
const int round_bits =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ // The blend helpers subtract this offset and only then apply one combined
+ // right shift by `shift` (= round_bits + AOM_BLEND_A64_ROUND_BITS), so the
+ // offset, including the folded-in rounding term 1 << (round_bits - 1), has
+ // to be pre-scaled by AOM_BLEND_A64_ROUND_BITS.
+ const int round_offset =
+ ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
+ (1 << (round_bits - 1)))
+ << AOM_BLEND_A64_ROUND_BITS;
+ const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
@@ -961,69 +1030,80 @@ void aom_lowbd_blend_a64_d16_mask_sse4_1(
- const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i v_ro_a = xx_loadl_32(&round_offset);
- const __m128i v_round_offset = _mm_shuffle_epi32(v_ro_a, 0);
- const __m128i one_w = _mm_set1_epi16(1);
- const __m128i one_b = _mm_set1_epi8(1);
- const __m128i two_w = _mm_set1_epi16(2);
+ const __m128i v_round_offset = _mm_set1_epi32(round_offset);
if (subw == 0 && subh == 0) {
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 4) {
- const __m128i m0 = xx_loadl_32(&mask[i * mask_stride + j]);
- const __m128i m = _mm_cvtepu8_epi16(m0);
- blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
- &src1[i * src1_stride + j], &m, &v_round_offset,
- &v_maxval, round_bits);
- }
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
} else if (subw == 1 && subh == 1) {
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 4) {
- const __m128i m_i0 =
- xx_loadl_64(&mask[(2 * i) * mask_stride + (2 * j)]);
- const __m128i m_i1 =
- xx_loadl_64(&mask[(2 * i + 1) * mask_stride + (2 * j)]);
- const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
- const __m128i m_bd = _mm_maddubs_epi16(m_i1, one_b);
- const __m128i m_acbd = _mm_add_epi16(m_ac, m_bd);
- const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
- const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
- blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
- &src1[i * src1_stride + j], &m, &v_round_offset,
- &v_maxval, round_bits);
- }
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
} else if (subw == 1 && subh == 0) {
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 4) {
- const __m128i m_i0 = xx_loadl_64(&mask[i * mask_stride + (2 * j)]);
- const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
- const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w);
- const __m128i m = _mm_srli_epi16(m_ac_1, 1);
- blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
- &src1[i * src1_stride + j], &m, &v_round_offset,
- &v_maxval, round_bits);
- }
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
} else {
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 4) {
- const __m128i m_i0 = xx_loadl_64(&mask[(2 * i) * mask_stride + j]);
- const __m128i m_i1 = xx_loadl_64(&mask[(2 * i + 1) * mask_stride + j]);
- const __m128i m_i01 = _mm_unpacklo_epi8(m_i0, m_i1);
- const __m128i m_ac = _mm_maddubs_epi16(m_i01, one_b);
- const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w);
- const __m128i m = _mm_srli_epi16(m_ac_1, 1);
- blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
- &src1[i * src1_stride + j], &m, &v_round_offset,
- &v_maxval, round_bits);
- }
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
index 59506bdfe..064910232 100644
--- a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
+++ b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
@@ -39,7 +39,7 @@ static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
const __m128i v_m0_w = _mm_set1_epi16(*mask);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
- const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+ const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
@@ -64,7 +64,7 @@ static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
const __m128i v_m0_w = _mm_set1_epi16(*mask);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
- const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+ const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
@@ -90,9 +90,9 @@ static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
const __m128i v_m0_w = _mm_set1_epi16(*mask);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
for (c = 0; c < w; c += 16) {
- const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0_w, v_m1_w);
+ const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w);
const __m128i v_resh_w =
- blend_8(src0 + c + 8, src1 + c + 8, v_m0_w, v_m1_w);
+ blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
diff --git a/third_party/aom/aom_dsp/x86/blend_mask_sse4.h b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h
new file mode 100644
index 000000000..c071fdcfc
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h
@@ -0,0 +1,237 @@
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <smmintrin.h> // SSE4.1
+#include <assert.h>
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "config/aom_dsp_rtcd.h"
+static INLINE void blend_a64_d16_mask_w4_sse41( // Blends one row of src0/src1 with weights m and (max - m) and stores 32 bits to dst.
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
+ int shift) {
+ const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); // complementary weight applied to src1
+ const __m128i s0 = xx_loadl_64(src0); // presumably the low 64 bits, i.e. 4 samples if CONV_BUF_TYPE is 16-bit - TODO confirm
+ const __m128i s1 = xx_loadl_64(src1);
+ const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1); // interleave into (s0, s1) 16-bit pairs
+ const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m); // matching (m, max - m) pairs
+ const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m); // s0 * m + s1 * (max - m) in each 32-bit lane
+ const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset); // subtract the caller-supplied offset
+ const __m128i res_d = _mm_srai_epi32(res_c, shift); // arithmetic right shift by the caller-supplied amount
+ const __m128i res_e = _mm_packs_epi32(res_d, res_d); // signed-saturate 32 -> 16 bits
+ const __m128i res = _mm_packus_epi16(res_e, res_e); // unsigned-saturate 16 -> 8 bits
+ xx_storel_32(dst, res); // presumably writes the low 32 bits (4 output bytes) - TODO confirm helper
+static INLINE void blend_a64_d16_mask_w8_sse41( // Same blend as the w4 kernel but on a full 128-bit load per source; stores 64 bits to dst.
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
+ int shift) {
+ const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); // complementary weight applied to src1
+ const __m128i s0 = xx_loadu_128(src0); // presumably an unaligned 128-bit load - TODO confirm helper
+ const __m128i s1 = xx_loadu_128(src1);
+ __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1), // low four lanes: s0 * m + s1 * (max - m)
+ _mm_unpacklo_epi16(*m, max_minus_m));
+ __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1), // high four lanes, same formula
+ _mm_unpackhi_epi16(*m, max_minus_m));
+ res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift); // remove offset, then arithmetic shift
+ res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift);
+ const __m128i res_e = _mm_packs_epi32(res_lo, res_hi); // signed-saturate both halves into eight 16-bit lanes
+ const __m128i res = _mm_packus_epi16(res_e, res_e); // unsigned-saturate to bytes
+ _mm_storel_epi64((__m128i *)(dst), res); // write the low 64 bits (8 output bytes)
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( // Row loop for the w4 kernel; mask bytes are used as-is (one mask row per output row).
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); // handed to the kernel, which weights src1 by (max - m)
+ for (int i = 0; i < h; ++i) { // one output row per iteration
+ const __m128i m0 = xx_loadl_32(mask); // presumably the low 32 bits, i.e. 4 mask bytes - TODO confirm helper
+ const __m128i m = _mm_cvtepu8_epi16(m0); // zero-extend mask bytes to 16-bit lanes
+ blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride; // advance every pointer by one row
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( // Row loop for the w8 kernel; mask bytes are used as-is (one mask row per output row).
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); // handed to the kernel, which weights src1 by (max - m)
+ for (int i = 0; i < h; ++i) { // one output row per iteration
+ const __m128i m0 = xx_loadl_64(mask); // presumably the low 64 bits, i.e. 8 mask bytes - TODO confirm helper
+ const __m128i m = _mm_cvtepu8_epi16(m0); // zero-extend mask bytes to 16-bit lanes
+ blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride; // advance every pointer by one row
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( // Row loop for the w4 kernel; each mask weight is the rounded mean of a 2x2 block of mask bytes.
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); // handed to the kernel, which weights src1 by (max - m)
+ const __m128i one_b = _mm_set1_epi8(1); // unit multipliers so maddubs reduces to a sum of adjacent bytes
+ const __m128i two_w = _mm_set1_epi16(2); // rounding term for the >> 2 below
+ for (int i = 0; i < h; ++i) { // one output row per iteration
+ const __m128i m_i0 = xx_loadl_64(mask); // current mask row (presumably 8 bytes - TODO confirm helper)
+ const __m128i m_i1 = xx_loadl_64(mask + mask_stride); // mask row directly below
+ const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); // vertical sum; saturates at 255 (assumes mask values keep it from clipping - TODO confirm)
+ const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); // add horizontally adjacent bytes into 16-bit lanes
+ const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
+ const __m128i m = _mm_srli_epi16(m_acbd_2, 2); // (2x2 sum + 2) >> 2
+ blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1; // two mask rows are consumed per output row
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( // Row loop for the w8 kernel; each mask weight is the rounded mean of a 2x2 block of mask bytes.
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); // handed to the kernel, which weights src1 by (max - m)
+ const __m128i one_b = _mm_set1_epi8(1); // unit multipliers so maddubs reduces to a sum of adjacent bytes
+ const __m128i two_w = _mm_set1_epi16(2); // rounding term for the >> 2 below
+ for (int i = 0; i < h; ++i) { // one output row per iteration
+ const __m128i m_i0 = xx_loadu_128(mask); // current mask row (presumably 16 bytes, unaligned - TODO confirm helper)
+ const __m128i m_i1 = xx_loadu_128(mask + mask_stride); // mask row directly below
+ const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); // vertical sum; saturates at 255 (assumes mask values keep it from clipping - TODO confirm)
+ const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); // add horizontally adjacent bytes into eight 16-bit lanes
+ const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
+ const __m128i m = _mm_srli_epi16(m_acbd_2, 2); // (2x2 sum + 2) >> 2
+ blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1; // two mask rows are consumed per output row
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( // Row loop for the w4 kernel; each mask weight is the rounded mean of two horizontally adjacent mask bytes.
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); // handed to the kernel, which weights src1 by (max - m)
+ const __m128i one_b = _mm_set1_epi8(1); // unit multipliers so maddubs reduces to a sum of adjacent bytes
+ const __m128i zeros = _mm_setzero_si128(); // second operand of the avg below, turning it into (x + 1) >> 1
+ for (int i = 0; i < h; ++i) { // one output row per iteration
+ const __m128i m_i0 = xx_loadl_64(mask); // current mask row (presumably 8 bytes - TODO confirm helper)
+ const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); // add horizontally adjacent bytes into 16-bit lanes
+ const __m128i m = _mm_avg_epu16(m_ac, zeros); // (pair sum + 1) >> 1
+ blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride; // advance every pointer by one row
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( // Row loop for the w8 kernel; each mask weight is the rounded mean of two horizontally adjacent mask bytes.
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); // handed to the kernel, which weights src1 by (max - m)
+ const __m128i one_b = _mm_set1_epi8(1); // unit multipliers so maddubs reduces to a sum of adjacent bytes
+ const __m128i zeros = _mm_setzero_si128(); // second operand of the avg below, turning it into (x + 1) >> 1
+ for (int i = 0; i < h; ++i) { // one output row per iteration
+ const __m128i m_i0 = xx_loadu_128(mask); // current mask row (presumably 16 bytes, unaligned - TODO confirm helper)
+ const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); // add horizontally adjacent bytes into eight 16-bit lanes
+ const __m128i m = _mm_avg_epu16(m_ac, zeros); // (pair sum + 1) >> 1
+ blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride; // advance every pointer by one row
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( // Row loop for the w4 kernel; each mask weight is the rounded mean of two vertically adjacent mask bytes.
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); // handed to the kernel, which weights src1 by (max - m)
+ const __m128i zeros = _mm_setzero_si128(); // second operand of the avg below, turning it into (x + 1) >> 1
+ for (int i = 0; i < h; ++i) { // one output row per iteration
+ const __m128i m_i0 = xx_loadl_64(mask); // current mask row (presumably 8 bytes loaded; the w4 kernel reads only the low 4 lanes)
+ const __m128i m_i1 = xx_loadl_64(mask + mask_stride); // mask row directly below
+ const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); // vertical sum; saturates at 255 (assumes mask values keep it from clipping - TODO confirm)
+ const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); // (row sum + 1) >> 1, then zero-extend to 16 bits
+ blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1; // two mask rows are consumed per output row
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( // Row loop for the w8 kernel; each mask weight is the rounded mean of two vertically adjacent mask bytes.
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); // handed to the kernel, which weights src1 by (max - m)
+ const __m128i zeros = _mm_setzero_si128(); // second operand of the avg below, turning it into (x + 1) >> 1
+ for (int i = 0; i < h; ++i) { // one output row per iteration
+ const __m128i m_i0 = xx_loadl_64(mask); // current mask row (presumably 8 bytes - TODO confirm helper)
+ const __m128i m_i1 = xx_loadl_64(mask + mask_stride); // mask row directly below
+ const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); // vertical sum; saturates at 255 (assumes mask values keep it from clipping - TODO confirm)
+ const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); // (row sum + 1) >> 1, then zero-extend to 16 bits
+ blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1; // two mask rows are consumed per output row
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h
index 4880438bc..8d9b32510 100644
--- a/third_party/aom/aom_dsp/x86/blend_sse4.h
+++ b/third_party/aom/aom_dsp/x86/blend_sse4.h
@@ -9,42 +9,44 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-#ifndef AOM_DSP_X86_BLEND_SSE4_H_
-#define AOM_DSP_X86_BLEND_SSE4_H_
+#ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_
+#define AOM_AOM_DSP_X86_BLEND_SSE4_H_
#include "aom_dsp/blend.h"
#include "aom_dsp/x86/synonyms.h"
+static const uint8_t g_blend_a64_mask_shuffle[32] = { // byte-index table; presumably a shuffle control (per its name) - confirm at the use sites
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, // even byte indices first, then odd byte indices
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, // same 16-entry pattern repeated
// Common kernels
static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i *v_m0_w, const __m128i *v_m1_w) {
const __m128i v_s0_b = xx_loadl_32(src0);
const __m128i v_s1_b = xx_loadl_32(src1);
const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
- const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
- const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
return v_res_w;
static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i *v_m0_w, const __m128i *v_m1_w) {
const __m128i v_s0_b = xx_loadl_64(src0);
const __m128i v_s1_b = xx_loadl_64(src1);
const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
- const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
- const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
@@ -53,6 +55,51 @@ static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
return v_res_w;
+static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1, // Byte-domain blend; returns the packed 8-bit result in a register (nothing is stored here).
+ const __m128i *v_m0_b, const __m128i *v_m1_b,
+ const __m128i *rounding) {
+ const __m128i v_s0_b = xx_loadl_32(src0); // presumably the low 32 bits, i.e. 4 pixels - TODO confirm helper
+ const __m128i v_s1_b = xx_loadl_32(src1);
+ const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), // interleaved (s0, s1) bytes, treated as unsigned
+ _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); // interleaved (m0, m1) bytes, treated as signed -> s0 * m0 + s1 * m1 per 16-bit lane
+ const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); // rounded high multiply: (p * rounding + 0x4000) >> 15; the constant comes from the caller
+ const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); // unsigned-saturate to bytes
+ return v_res;
+static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1, // Byte-domain blend on a 64-bit load per source; returns the packed 8-bit result in a register.
+ const __m128i *v_m0_b, const __m128i *v_m1_b,
+ const __m128i *rounding) {
+ const __m128i v_s0_b = xx_loadl_64(src0); // presumably the low 64 bits, i.e. 8 pixels - TODO confirm helper
+ const __m128i v_s1_b = xx_loadl_64(src1);
+ const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), // interleaved (s0, s1) bytes, treated as unsigned
+ _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); // interleaved (m0, m1) bytes, treated as signed -> s0 * m0 + s1 * m1 per 16-bit lane
+ const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); // rounded high multiply: (p * rounding + 0x4000) >> 15; the constant comes from the caller
+ const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); // unsigned-saturate to bytes
+ return v_res;
+static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1, // Byte-domain blend on a full 128-bit load per source; returns 16 packed 8-bit results.
+ const __m128i *v_m0_b, const __m128i *v_m1_b,
+ const __m128i *rounding) {
+ const __m128i v_s0_b = xx_loadu_128(src0); // presumably an unaligned 128-bit load (16 pixels) - TODO confirm helper
+ const __m128i v_s1_b = xx_loadu_128(src1);
+ const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), // low 8 pixels: s0 * m0 + s1 * m1 per 16-bit lane
+ _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
+ const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b), // high 8 pixels, same formula
+ _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
+ const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding); // rounded high multiply: (p * rounding + 0x4000) >> 15; the constant comes from the caller
+ const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
+ const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w); // unsigned-saturate both halves into 16 bytes
+ return v_res;
typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w);
@@ -141,4 +188,4 @@ static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
return v_res_w;
-#endif // AOM_DSP_X86_BLEND_SSE4_H_
+#endif // AOM_AOM_DSP_X86_BLEND_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h
index 3f46420dd..96fe4ebb6 100644
--- a/third_party/aom/aom_dsp/x86/common_avx2.h
+++ b/third_party/aom/aom_dsp/x86/common_avx2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-#ifndef AOM_DSP_X86_COMMON_AVX2_H
-#define AOM_DSP_X86_COMMON_AVX2_H
#include <immintrin.h>
@@ -144,4 +144,4 @@ static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
+#endif // AOM_AOM_DSP_X86_COMMON_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h
index 36fb1963a..3e19682cd 100644
--- a/third_party/aom/aom_dsp/x86/convolve.h
+++ b/third_party/aom/aom_dsp/x86/convolve.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-#ifndef AOM_DSP_X86_CONVOLVE_H_
-#define AOM_DSP_X86_CONVOLVE_H_
#include <assert.h>
@@ -17,7 +17,6 @@
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
-#include "aom_dsp/aom_convolve.h"
typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
uint8_t *output_ptr, ptrdiff_t out_pitch,
@@ -34,7 +33,30 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
(void)y_step_q4; \
assert((-128 <= filter[3]) && (filter[3] <= 127)); \
assert(step_q4 == 16); \
- if (filter[0] | filter[1] | filter[2]) { \
+ if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \
+ (filter[2] | filter[5])) { \
+ while (w >= 16) { \
+ aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else if (filter[0] | filter[1] | filter[2]) { \
while (w >= 16) { \
aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter); \
@@ -153,4 +175,4 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
} \
-#endif // AOM_DSP_X86_CONVOLVE_H_
+#endif // AOM_AOM_DSP_X86_CONVOLVE_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h
index 72fabd236..30253f65c 100644
--- a/third_party/aom/aom_dsp/x86/convolve_avx2.h
+++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
// filters for 16
DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = {
@@ -29,6 +29,11 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = {
7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = { // byte-index table; presumably a shuffle control for a 4-tap path (per its name) - confirm at the use sites
+ 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, // four overlapping 4-byte windows starting at offsets 2, 3, 4 and 5
+ 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, // same 16-entry pattern repeated
static INLINE void prepare_coeffs_lowbd(
const InterpFilterParams *const filter_params, const int subpel_q4,
__m256i *const coeffs /* [4] */) {
@@ -191,4 +196,4 @@ static INLINE __m256i highbd_convolve_rounding(
return res_round;
diff --git a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
index e80c5872f..707bd2d78 100644
--- a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
+++ b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
// Note:
// This header file should be put below any x86 intrinsics head file
@@ -28,4 +28,4 @@ static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res,
_mm_store_si128((__m128i *)dst, d);
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h
index 399df5d6d..445d04b10 100644
--- a/third_party/aom/aom_dsp/x86/convolve_sse2.h
+++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
// Note:
// This header file should be put below any x86 intrinsics head file
@@ -118,4 +118,4 @@ static INLINE __m128i highbd_convolve_rounding_sse2(
return res_round;
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
index d48c25667..6b8388d84 100644
--- a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
+++ b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
// Note:
// This header file should be put below any x86 intrinsics head file
@@ -50,4 +50,4 @@ static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0,
return res;
+#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
index 12ccf7f26..260d8dd58 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-#ifndef AOM_DSP_X86_FWD_TXFM_SSE2_H_
-#define AOM_DSP_X86_FWD_TXFM_SSE2_H_
#ifdef __cplusplus
extern "C" {
@@ -152,4 +152,4 @@ static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
} // extern "C"
-#endif // AOM_DSP_X86_FWD_TXFM_SSE2_H_
+#endif // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm
deleted file mode 100644
index 99f17ebdf..000000000
--- a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm
+++ /dev/null
@@ -1,351 +0,0 @@
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-%include "aom_ports/x86_abi_support.asm"
-SECTION .text
-;void aom_half_horiz_vert_variance16x_h_sse2(unsigned char *ref,
-; int ref_stride,
-; unsigned char *src,
-; int src_stride,
-; unsigned int height,
-; int *sum,
-; unsigned int *sumsquared)
-global sym(aom_half_horiz_vert_variance16x_h_sse2) PRIVATE
- push rbp
- mov rbp, rsp
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
- pxor xmm6, xmm6 ; error accumulator
- pxor xmm7, xmm7 ; sse eaccumulator
- mov rsi, arg(0) ;ref
- mov rdi, arg(2) ;src
- movsxd rcx, dword ptr arg(4) ;height
- movsxd rax, dword ptr arg(1) ;ref_stride
- movsxd rdx, dword ptr arg(3) ;src_stride
- pxor xmm0, xmm0 ;
- movdqu xmm5, XMMWORD PTR [rsi]
- movdqu xmm3, XMMWORD PTR [rsi+1]
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1
- lea rsi, [rsi + rax]
- movdqu xmm1, XMMWORD PTR [rsi] ;
- movdqu xmm2, XMMWORD PTR [rsi+1] ;
- pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1
- pavgb xmm5, xmm1 ; xmm = vertical average of the above
- movdqa xmm4, xmm5
- punpcklbw xmm5, xmm0 ; xmm5 = words of above
- punpckhbw xmm4, xmm0
- movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
- punpcklbw xmm3, xmm0 ; xmm3 = words of above
- psubw xmm5, xmm3 ; xmm5 -= xmm3
- movq xmm3, QWORD PTR [rdi+8]
- punpcklbw xmm3, xmm0
- psubw xmm4, xmm3
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- paddw xmm6, xmm4
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- pmaddwd xmm4, xmm4
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
- paddd xmm7, xmm4
- movdqa xmm5, xmm1 ; save xmm1 for use on the next row
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- sub rcx, 1 ;
- jnz aom_half_horiz_vert_variance16x_h_1 ;
- pxor xmm1, xmm1
- pxor xmm5, xmm5
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
- psrldq xmm7, 8
- psrldq xmm1, 8
- paddd xmm6, xmm7
- paddd xmm0, xmm1
- mov rsi, arg(5) ;[Sum]
- mov rdi, arg(6) ;[SSE]
- movd [rsi], xmm0
- movd [rdi], xmm6
- ; begin epilog
- pop rdi
- pop rsi
- pop rbp
- ret
-;void aom_half_vert_variance16x_h_sse2(unsigned char *ref,
-; int ref_stride,
-; unsigned char *src,
-; int src_stride,
-; unsigned int height,
-; int *sum,
-; unsigned int *sumsquared)
-global sym(aom_half_vert_variance16x_h_sse2) PRIVATE
- push rbp
- mov rbp, rsp
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
- pxor xmm6, xmm6 ; error accumulator
- pxor xmm7, xmm7 ; sse eaccumulator
- mov rsi, arg(0) ;ref
- mov rdi, arg(2) ;src
- movsxd rcx, dword ptr arg(4) ;height
- movsxd rax, dword ptr arg(1) ;ref_stride
- movsxd rdx, dword ptr arg(3) ;src_stride
- movdqu xmm5, XMMWORD PTR [rsi]
- lea rsi, [rsi + rax ]
- pxor xmm0, xmm0
- movdqu xmm3, XMMWORD PTR [rsi]
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
- movdqa xmm4, xmm5
- punpcklbw xmm5, xmm0
- punpckhbw xmm4, xmm0
- movq xmm2, QWORD PTR [rdi]
- punpcklbw xmm2, xmm0
- psubw xmm5, xmm2
- movq xmm2, QWORD PTR [rdi+8]
- punpcklbw xmm2, xmm0
- psubw xmm4, xmm2
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- paddw xmm6, xmm4
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- pmaddwd xmm4, xmm4
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
- paddd xmm7, xmm4
- movdqa xmm5, xmm3
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- sub rcx, 1
- jnz aom_half_vert_variance16x_h_1
- pxor xmm1, xmm1
- pxor xmm5, xmm5
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
- psrldq xmm7, 8
- psrldq xmm1, 8
- paddd xmm6, xmm7
- paddd xmm0, xmm1
- mov rsi, arg(5) ;[Sum]
- mov rdi, arg(6) ;[SSE]
- movd [rsi], xmm0
- movd [rdi], xmm6
- ; begin epilog
- pop rdi
- pop rsi
- pop rbp
- ret
-;void aom_half_horiz_variance16x_h_sse2(unsigned char *ref,
-; int ref_stride
-; unsigned char *src,
-; int src_stride,
-; unsigned int height,
-; int *sum,
-; unsigned int *sumsquared)
-global sym(aom_half_horiz_variance16x_h_sse2) PRIVATE
- push rbp
- mov rbp, rsp
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
- pxor xmm6, xmm6 ; error accumulator
- pxor xmm7, xmm7 ; sse eaccumulator
- mov rsi, arg(0) ;ref
- mov rdi, arg(2) ;src
- movsxd rcx, dword ptr arg(4) ;height
- movsxd rax, dword ptr arg(1) ;ref_stride
- movsxd rdx, dword ptr arg(3) ;src_stride
- pxor xmm0, xmm0 ;
- movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
- movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
- movdqa xmm1, xmm5
- punpcklbw xmm5, xmm0 ; xmm5 = words of above
- punpckhbw xmm1, xmm0
- movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
- punpcklbw xmm3, xmm0 ; xmm3 = words of above
- movq xmm2, QWORD PTR [rdi+8]
- punpcklbw xmm2, xmm0
- psubw xmm5, xmm3 ; xmm5 -= xmm3
- psubw xmm1, xmm2
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- paddw xmm6, xmm1
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- pmaddwd xmm1, xmm1
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
- paddd xmm7, xmm1
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- sub rcx, 1 ;
- jnz aom_half_horiz_variance16x_h_1 ;
- pxor xmm1, xmm1
- pxor xmm5, xmm5
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
- psrldq xmm7, 8
- psrldq xmm1, 8
- paddd xmm6, xmm7
- paddd xmm0, xmm1
- mov rsi, arg(5) ;[Sum]
- mov rdi, arg(6) ;[SSE]
- movd [rsi], xmm0
- movd [rdi], xmm6
- ; begin epilog
- pop rdi
- pop rsi
- pop rbp
- ret
-; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
-align 16
- times 8 dw 64
-align 16
- dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
- dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
- dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
- dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
- dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
- dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
- dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c b/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c
deleted file mode 100644
index 2a018c1cf..000000000
--- a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c
+++ /dev/null
@@ -1,78 +0,0 @@
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <assert.h>
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-#include "aom/aom_integer.h"
-void aom_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref,
- int ref_stride,
- const unsigned char *src,
- int src_stride, unsigned int height,
- int *sum, unsigned int *sumsquared);
-void aom_half_horiz_variance16x_h_sse2(const unsigned char *ref, int ref_stride,
- const unsigned char *src, int src_stride,
- unsigned int height, int *sum,
- unsigned int *sumsquared);
-void aom_half_vert_variance16x_h_sse2(const unsigned char *ref, int ref_stride,
- const unsigned char *src, int src_stride,
- unsigned int height, int *sum,
- unsigned int *sumsquared);
-uint32_t aom_variance_halfpixvar16x16_h_sse2(const unsigned char *src,
- int src_stride,
- const unsigned char *dst,
- int dst_stride, uint32_t *sse) {
- int xsum0;
- unsigned int xxsum0;
- aom_half_horiz_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
- &xsum0, &xxsum0);
- *sse = xxsum0;
- assert(xsum0 <= 255 * 16 * 16);
- assert(xsum0 >= -255 * 16 * 16);
- return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
-uint32_t aom_variance_halfpixvar16x16_v_sse2(const unsigned char *src,
- int src_stride,
- const unsigned char *dst,
- int dst_stride, uint32_t *sse) {
- int xsum0;
- unsigned int xxsum0;
- aom_half_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, &xsum0,
- &xxsum0);
- *sse = xxsum0;
- assert(xsum0 <= 255 * 16 * 16);
- assert(xsum0 >= -255 * 16 * 16);
- return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
-uint32_t aom_variance_halfpixvar16x16_hv_sse2(const unsigned char *src,
- int src_stride,
- const unsigned char *dst,
- int dst_stride, uint32_t *sse) {
- int xsum0;
- unsigned int xxsum0;
- aom_half_horiz_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
- &xsum0, &xxsum0);
- *sse = xxsum0;
- assert(xsum0 <= 255 * 16 * 16);
- assert(xsum0 >= -255 * 16 * 16);
- return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
index 83e0098ba..097e0778f 100644
--- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -327,6 +327,7 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
__m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt,
const unsigned char *lt, const unsigned char *thr, int bd) {
int i;
+ const __m128i zero = _mm_setzero_si128();
__m128i blimit, limit, thresh;
__m128i t80;
get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80);
@@ -355,13 +356,18 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
flat2 = _mm_unpacklo_epi64(flat2, flat2);
// flat and wide flat calculations
- __m128i flat_p[3], flat_q[3], flat_pq[3];
- __m128i flat2_p[6], flat2_q[6];
- __m128i flat2_pq[6];
- {
- __m128i work0;
+ // if flat ==0 then flat2 is zero as well and we don't need any calc below
+ // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i flat_p[3], flat_q[3], flat_pq[3];
+ __m128i flat2_p[6], flat2_q[6];
+ __m128i flat2_pq[6];
+ __m128i sum_p6, sum_p3;
const __m128i eight = _mm_set1_epi16(8);
const __m128i four = _mm_set1_epi16(4);
+ __m128i work0, work0_0, work0_1, sum_p_0;
__m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3]));
__m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1]));
sum_p = _mm_add_epi16(sum_p, sum_lp);
@@ -369,30 +375,23 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
__m128i sum_lq = _mm_srli_si128(sum_lp, 8);
__m128i sum_q = _mm_srli_si128(sum_p, 8);
- sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+ sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
- work0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
- flat2_p[0] = _mm_add_epi16(sum_p, _mm_add_epi16(work0, q[0]));
- flat2_q[0] =
- _mm_add_epi16(sum_p, _mm_add_epi16(_mm_srli_si128(work0, 8), p[0]));
- flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0]));
+ flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0]));
flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0]));
- __m128i sum_p6, sum_p3;
sum_p6 = _mm_add_epi16(pq[6], pq[6]);
sum_p3 = _mm_add_epi16(pq[3], pq[3]);
- sum_q = _mm_sub_epi16(sum_p, p[5]);
- sum_p = _mm_sub_epi16(sum_p, q[5]);
+ sum_q = _mm_sub_epi16(sum_p_0, pq[5]);
+ sum_p = _mm_sub_epi16(sum_p_0, q[5]);
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
- flat2_p[1] = _mm_add_epi16(sum_p, work0);
- flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
+ work0_1 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
- sum_lq = _mm_sub_epi16(sum_lp, p[2]);
+ sum_lq = _mm_sub_epi16(sum_lp, pq[2]);
sum_lp = _mm_sub_epi16(sum_lp, q[2]);
work0 = _mm_add_epi16(sum_p3, pq[1]);
@@ -402,21 +401,8 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
- flat2_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
- flat2_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
- sum_p = _mm_sub_epi16(sum_p, q[4]);
- sum_q = _mm_sub_epi16(sum_q, p[4]);
- sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
- flat2_p[2] = _mm_add_epi16(sum_p, work0);
- flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
sum_lp = _mm_sub_epi16(sum_lp, q[1]);
- sum_lq = _mm_sub_epi16(sum_lq, p[1]);
+ sum_lq = _mm_sub_epi16(sum_lq, pq[1]);
sum_p3 = _mm_add_epi16(sum_p3, pq[3]);
work0 = _mm_add_epi16(sum_p3, pq[2]);
@@ -425,54 +411,88 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
- sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
- sum_p = _mm_sub_epi16(sum_p, q[3]);
- sum_q = _mm_sub_epi16(sum_q, p[3]);
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
- flat2_p[3] = _mm_add_epi16(sum_p, work0);
- flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[3] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
- sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
- sum_p = _mm_sub_epi16(sum_p, q[2]);
- sum_q = _mm_sub_epi16(sum_q, p[2]);
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
- flat2_p[4] = _mm_add_epi16(sum_p, work0);
- flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[4] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
- sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
- sum_p = _mm_sub_epi16(sum_p, q[1]);
- sum_q = _mm_sub_epi16(sum_q, p[1]);
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
- flat2_p[5] = _mm_add_epi16(sum_p, work0);
- flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[5] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
- }
- // highbd_filter8
- pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
- pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
- for (i = 0; i < 3; i++) {
- pq[i] = _mm_andnot_si128(flat, pq[i]);
- flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
- pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
- }
- // highbd_filter16
- for (i = 5; i >= 0; i--) {
- // p[i] remains unchanged if !(flat2 && flat && mask)
- pq[i] = _mm_andnot_si128(flat2, pq[i]);
- flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
- // get values for when (flat2 && flat && mask)
- pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values
+ int flat2_mask =
+ (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
+ if (flat2_mask) {
+ flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0]));
+ flat2_q[0] = _mm_add_epi16(
+ sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0]));
+ flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
+ flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
+ flat2_pq[0] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
+ flat2_pq[1] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
+ sum_p = _mm_sub_epi16(sum_p, q[4]);
+ sum_q = _mm_sub_epi16(sum_q, pq[4]);
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
+ flat2_p[2] = _mm_add_epi16(sum_p, work0);
+ flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[2] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[3]);
+ sum_q = _mm_sub_epi16(sum_q, pq[3]);
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
+ flat2_p[3] = _mm_add_epi16(sum_p, work0);
+ flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[3] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[2]);
+ sum_q = _mm_sub_epi16(sum_q, pq[2]);
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
+ flat2_p[4] = _mm_add_epi16(sum_p, work0);
+ flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[4] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[1]);
+ sum_q = _mm_sub_epi16(sum_q, pq[1]);
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
+ flat2_p[5] = _mm_add_epi16(sum_p, work0);
+ flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[5] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
+ } // flat2
+ // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // highbd_filter8
+ pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
+ pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
+ for (i = 0; i < 3; i++) {
+ pq[i] = _mm_andnot_si128(flat, pq[i]);
+ flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
+ pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
+ }
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ if (flat2_mask) {
+ for (i = 0; i < 6; i++) {
+ pq[i] = _mm_andnot_si128(flat2, pq[i]);
+ flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
+ pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values
+ }
+ }
+ } else {
+ pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
+ pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
@@ -500,6 +520,8 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1,
const uint8_t *thr1, int bd) {
__m128i blimit, limit, thresh, t80;
+ const __m128i zero = _mm_setzero_si128();
get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh,
__m128i mask;
@@ -512,27 +534,22 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
__m128i ps[2], qs[2];
highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80);
// flat and wide flat calculations
- __m128i flat_p[3], flat_q[3];
- __m128i flat2_p[6], flat2_q[6];
- {
+ // if flat ==0 then flat2 is zero as well and we don't need any calc below
+ // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i flat_p[3], flat_q[3];
+ __m128i flat2_p[6], flat2_q[6];
const __m128i eight = _mm_set1_epi16(8);
const __m128i four = _mm_set1_epi16(4);
- __m128i sum_p = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
+ __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
__m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3]));
__m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1]));
- sum_p = _mm_add_epi16(sum_p, sum_lp);
+ sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp);
__m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1]));
sum_q = _mm_add_epi16(sum_q, sum_lq);
- sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+ sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q));
sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
- flat2_p[0] = _mm_srli_epi16(
- _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
- _mm_add_epi16(p[1], q[0]))),
- 4);
- flat2_q[0] = _mm_srli_epi16(
- _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
- _mm_add_epi16(p[0], q[1]))),
- 4);
flat_p[0] =
_mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3);
flat_q[0] =
@@ -541,117 +558,160 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
__m128i sum_q6 = _mm_add_epi16(q[6], q[6]);
__m128i sum_p3 = _mm_add_epi16(p[3], p[3]);
__m128i sum_q3 = _mm_add_epi16(q[3], q[3]);
- sum_q = _mm_sub_epi16(sum_p, p[5]);
- sum_p = _mm_sub_epi16(sum_p, q[5]);
- flat2_p[1] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p, _mm_add_epi16(
- sum_p6, _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
- 4);
- flat2_q[1] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q, _mm_add_epi16(
- sum_q6, _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
- 4);
+ sum_q = _mm_sub_epi16(sum_p_0, p[5]);
+ __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]);
sum_lq = _mm_sub_epi16(sum_lp, p[2]);
sum_lp = _mm_sub_epi16(sum_lp, q[2]);
flat_p[1] =
_mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3);
flat_q[1] =
_mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
- sum_p3 = _mm_add_epi16(sum_p3, p[3]);
- sum_q3 = _mm_add_epi16(sum_q3, q[3]);
- sum_p = _mm_sub_epi16(sum_p, q[4]);
- sum_q = _mm_sub_epi16(sum_q, p[4]);
- flat2_p[2] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p, _mm_add_epi16(
- sum_p6, _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
- 4);
- flat2_q[2] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q, _mm_add_epi16(
- sum_q6, _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
- 4);
sum_lp = _mm_sub_epi16(sum_lp, q[1]);
sum_lq = _mm_sub_epi16(sum_lq, p[1]);
+ sum_p3 = _mm_add_epi16(sum_p3, p[3]);
+ sum_q3 = _mm_add_epi16(sum_q3, q[3]);
flat_p[2] =
_mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3);
flat_q[2] =
_mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
- sum_p = _mm_sub_epi16(sum_p, q[3]);
- sum_q = _mm_sub_epi16(sum_q, p[3]);
- flat2_p[3] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p, _mm_add_epi16(
- sum_p6, _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
- 4);
- flat2_q[3] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q, _mm_add_epi16(
- sum_q6, _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
- 4);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
- sum_p = _mm_sub_epi16(sum_p, q[2]);
- sum_q = _mm_sub_epi16(sum_q, p[2]);
- flat2_p[4] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p, _mm_add_epi16(
- sum_p6, _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
- 4);
- flat2_q[4] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q, _mm_add_epi16(
- sum_q6, _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
- 4);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
- sum_p = _mm_sub_epi16(sum_p, q[1]);
- sum_q = _mm_sub_epi16(sum_q, p[1]);
- flat2_p[5] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p, _mm_add_epi16(
- sum_p6, _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
- 4);
- flat2_q[5] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q, _mm_add_epi16(
- sum_q6, _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
- 4);
- }
- // highbd_filter8
- p[2] = _mm_andnot_si128(flat, p[2]);
- // p2 remains unchanged if !(flat && mask)
- flat_p[2] = _mm_and_si128(flat, flat_p[2]);
- // when (flat && mask)
- p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values
- q[2] = _mm_andnot_si128(flat, q[2]);
- flat_q[2] = _mm_and_si128(flat, flat_q[2]);
- q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values
- int i;
- for (i = 1; i >= 0; i--) {
- ps[i] = _mm_andnot_si128(flat, ps[i]);
- flat_p[i] = _mm_and_si128(flat, flat_p[i]);
- p[i] = _mm_or_si128(ps[i], flat_p[i]);
- qs[i] = _mm_andnot_si128(flat, qs[i]);
- flat_q[i] = _mm_and_si128(flat, flat_q[i]);
- q[i] = _mm_or_si128(qs[i], flat_q[i]);
- }
- // highbd_filter16
- for (i = 5; i >= 0; i--) {
- // p[i] remains unchanged if !(flat2 && flat && mask)
- p[i] = _mm_andnot_si128(flat2, p[i]);
- flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
- // get values for when (flat2 && flat && mask)
- p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values
- q[i] = _mm_andnot_si128(flat2, q[i]);
- flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
- q[i] = _mm_or_si128(q[i], flat2_q[i]);
+ int flat2_mask =
+ (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
+ if (flat2_mask) {
+ flat2_p[0] = _mm_srli_epi16(
+ _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
+ _mm_add_epi16(p[1], q[0]))),
+ 4);
+ flat2_q[0] = _mm_srli_epi16(
+ _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
+ _mm_add_epi16(p[0], q[1]))),
+ 4);
+ flat2_p[1] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
+ 4);
+ flat2_q[1] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[4]);
+ sum_q = _mm_sub_epi16(sum_q, p[4]);
+ flat2_p[2] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
+ 4);
+ flat2_q[2] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[3]);
+ sum_q = _mm_sub_epi16(sum_q, p[3]);
+ flat2_p[3] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
+ 4);
+ flat2_q[3] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[2]);
+ sum_q = _mm_sub_epi16(sum_q, p[2]);
+ flat2_p[4] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
+ 4);
+ flat2_q[4] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[1]);
+ sum_q = _mm_sub_epi16(sum_q, p[1]);
+ flat2_p[5] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
+ 4);
+ flat2_q[5] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
+ 4);
+ }
+ // highbd_filter8
+ int i;
+ for (i = 0; i < 2; i++) {
+ ps[i] = _mm_andnot_si128(flat, ps[i]);
+ flat_p[i] = _mm_and_si128(flat, flat_p[i]);
+ p[i] = _mm_or_si128(ps[i], flat_p[i]);
+ qs[i] = _mm_andnot_si128(flat, qs[i]);
+ flat_q[i] = _mm_and_si128(flat, flat_q[i]);
+ q[i] = _mm_or_si128(qs[i], flat_q[i]);
+ }
+ p[2] = _mm_andnot_si128(flat, p[2]);
+ // p2 remains unchanged if !(flat && mask)
+ flat_p[2] = _mm_and_si128(flat, flat_p[2]);
+ // when (flat && mask)
+ p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values
+ q[2] = _mm_andnot_si128(flat, q[2]);
+ flat_q[2] = _mm_and_si128(flat, flat_q[2]);
+ q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values
+ for (i = 0; i < 2; i++) {
+ ps[i] = _mm_andnot_si128(flat, ps[i]);
+ flat_p[i] = _mm_and_si128(flat, flat_p[i]);
+ p[i] = _mm_or_si128(ps[i], flat_p[i]);
+ qs[i] = _mm_andnot_si128(flat, qs[i]);
+ flat_q[i] = _mm_and_si128(flat, flat_q[i]);
+ q[i] = _mm_or_si128(qs[i], flat_q[i]);
+ }
+ // highbd_filter16
+ if (flat2_mask) {
+ for (i = 0; i < 6; i++) {
+ // p[i] remains unchanged if !(flat2 && flat && mask)
+ p[i] = _mm_andnot_si128(flat2, p[i]);
+ flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
+ // get values for when (flat2 && flat && mask)
+ p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values
+ q[i] = _mm_andnot_si128(flat2, q[i]);
+ flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
+ q[i] = _mm_or_si128(q[i], flat2_q[i]);
+ }
+ }
+ } else {
+ p[0] = ps[0];
+ q[0] = qs[0];
+ p[1] = ps[1];
+ q[1] = qs[1];
@@ -696,6 +756,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
&thresh, &hev, &mask);
+ // lp filter
+ highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
// flat_mask
flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0);
flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
@@ -707,53 +770,56 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
// replicate for the further "merged variables" usage
flat = _mm_unpacklo_epi64(flat, flat);
- {
- __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+ // 5 tap filter
+ // need it only if flat !=0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b, workp_c;
+ __m128i pq0x2_pq1, pq1_pq2;
// op1
- workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0),
- _mm_add_epi16(*p1, *p1)); // *p0 *2 + *p1 * 2
- workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
- *p2); // *p2 + *p0 * 2 + *p1 * 2 + 4
+ pq0x2_pq1 =
+ _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]); // p0 *2 + p1
+ pq1_pq2 = _mm_add_epi16(pq[1], pq[2]); // p1 + p2
+ workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
+ pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4
- workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0);
- workp_shft0 = _mm_add_epi16(
- workp_a, workp_b); // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4
+ workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0);
+ workp_b =
+ _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
// op0
- workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1); // *q0 * 2 + *q1
- workp_a =
- _mm_add_epi16(workp_a,
- workp_b); // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4
- flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_a, workp_shft0), 3);
+ workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1
+ workp_a = _mm_add_epi16(workp_a,
+ workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+ workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
+ flat_p1p0 = _mm_srli_epi16(workp_b, 3);
// oq0
- workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2),
- *p1); // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4
- workp_b = _mm_add_epi16(*q1, *q2);
- workp_shft0 = _mm_add_epi16(
- workp_a, workp_b); // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]),
+ pq[1]); // p0 * 2 + p1 + q0 * 2 + q1 + 4
+ workp_b = _mm_srli_si128(pq1_pq2, 8);
+ workp_a = _mm_add_epi16(
+ workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
+ // workp_shft0 = _mm_srli_epi16(workp_a, 3);
// oq1
- workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1),
- *p0); // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4
+ workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]),
+ pq[0]); // p0 + q0 * 2 + q1 * 2 + q2 + 4
workp_b = _mm_add_epi16(*q2, *q2);
- workp_shft1 = _mm_add_epi16(
- workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
+ workp_b =
+ _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
- flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
- }
- // lp filter
- highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd);
+ workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
+ flat_q0q1 = _mm_srli_epi16(workp_a, 3);
- qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
- q1q0 = _mm_and_si128(flat, flat_q0q1);
- *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
- ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
- p1p0 = _mm_and_si128(flat, flat_p1p0);
- *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ }
static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
@@ -797,6 +863,17 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
mask = _mm_subs_epu16(mask, limit0);
mask = _mm_cmpeq_epi16(mask, zero);
+ // lp filter
+ __m128i ps[2], qs[2], p[2], q[2];
+ {
+ p[0] = *p0;
+ p[1] = *p1;
+ q[0] = *q0;
+ q[1] = *q1;
+ // filter_mask and hev_mask
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+ }
// flat_mask
flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0));
flat = _mm_max_epi16(flat, work);
@@ -806,7 +883,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
flat = _mm_cmpeq_epi16(flat, zero);
flat = _mm_and_si128(flat, mask); // flat & mask
- {
+ // 5 tap filter
+ // need it only if flat !=0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
__m128i workp_a, workp_b, workp_shft0, workp_shft1;
// op1
@@ -842,33 +921,28 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
workp_shft1 = _mm_add_epi16(
workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
oq1 = _mm_srli_epi16(workp_shft1, 3);
- }
- // lp filter
- __m128i ps[2], qs[2], p[2], q[2];
- {
- p[0] = *p0;
- p[1] = *p1;
- q[0] = *q0;
- q[1] = *q1;
- // filter_mask and hev_mask
- highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
- }
- qs[0] = _mm_andnot_si128(flat, qs[0]);
- oq0 = _mm_and_si128(flat, oq0);
- *q0 = _mm_or_si128(qs[0], oq0);
- qs[1] = _mm_andnot_si128(flat, qs[1]);
- oq1 = _mm_and_si128(flat, oq1);
- *q1 = _mm_or_si128(qs[1], oq1);
- ps[0] = _mm_andnot_si128(flat, ps[0]);
- op0 = _mm_and_si128(flat, op0);
- *p0 = _mm_or_si128(ps[0], op0);
- ps[1] = _mm_andnot_si128(flat, ps[1]);
- op1 = _mm_and_si128(flat, op1);
- *p1 = _mm_or_si128(ps[1], op1);
+ qs[0] = _mm_andnot_si128(flat, qs[0]);
+ oq0 = _mm_and_si128(flat, oq0);
+ *q0 = _mm_or_si128(qs[0], oq0);
+ qs[1] = _mm_andnot_si128(flat, qs[1]);
+ oq1 = _mm_and_si128(flat, oq1);
+ *q1 = _mm_or_si128(qs[1], oq1);
+ ps[0] = _mm_andnot_si128(flat, ps[0]);
+ op0 = _mm_and_si128(flat, op0);
+ *p0 = _mm_or_si128(ps[0], op0);
+ ps[1] = _mm_andnot_si128(flat, ps[1]);
+ op1 = _mm_and_si128(flat, op1);
+ *p1 = _mm_or_si128(ps[1], op1);
+ } else {
+ *q0 = qs[0];
+ *q1 = qs[1];
+ *p0 = ps[0];
+ *p1 = ps[1];
+ }
void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p,
@@ -926,7 +1000,7 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
__m128i mask, hev, flat;
__m128i pq[4];
__m128i p1p0, q1q0, ps1ps0, qs1qs0;
- __m128i work_a, op2, oq2, flat_p1p0, flat_q0q1;
+ __m128i work_a, opq2, flat_p1p0, flat_q0q1;
pq[0] = _mm_unpacklo_epi64(*p0, *q0);
pq[1] = _mm_unpacklo_epi64(*p1, *q1);
@@ -944,6 +1018,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
&thresh, &hev, &mask);
+ // lp filter
+ highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
// flat_mask4
flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0]));
flat = _mm_max_epi16(abs_p1p0, flat);
@@ -956,15 +1033,15 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
// replicate for the further "merged variables" usage
flat = _mm_unpacklo_epi64(flat, flat);
- {
- __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1;
// Added before shift for rounding part of ROUND_POWER_OF_TWO
// o*p2
workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
- workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
- op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
+ workp_c = _mm_add_epi16(workp_a, workp_c);
// o*p1
workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
@@ -992,27 +1069,22 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
// oq2
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
- oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- }
+ workp_a = _mm_add_epi16(workp_a, workp_b);
+ opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3);
- // lp filter
- highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd);
- qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
- q1q0 = _mm_and_si128(flat, flat_q0q1);
- *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
- ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
- p1p0 = _mm_and_si128(flat, flat_p1p0);
- *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
- work_a = _mm_andnot_si128(flat, *q2);
- *q2 = _mm_and_si128(flat, oq2);
- *q2 = _mm_or_si128(work_a, *q2);
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
- work_a = _mm_andnot_si128(flat, *p2);
- *p2 = _mm_and_si128(flat, op2);
- *p2 = _mm_or_si128(work_a, *p2);
+ work_a = _mm_andnot_si128(flat, pq[2]);
+ *p2 = _mm_and_si128(flat, opq2);
+ *p2 = _mm_or_si128(work_a, *p2);
+ *q2 = _mm_srli_si128(*p2, 8);
+ }
static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
@@ -1058,17 +1130,28 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
mask = _mm_subs_epu16(mask, limit0);
mask = _mm_cmpeq_epi16(mask, zero);
+ // lp filter
+ __m128i ps[2], qs[2], p[2], q[2];
+ {
+ p[0] = *p0;
+ p[1] = *p1;
+ q[0] = *q0;
+ q[1] = *q1;
+ // filter_mask and hev_mask
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+ }
flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0));
flat = _mm_max_epi16(work1, flat);
work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0));
flat = _mm_max_epi16(work0, flat);
flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
flat = _mm_cmpeq_epi16(flat, zero);
flat = _mm_and_si128(flat, mask); // flat & mask
- {
+ // filter8 need it only if flat !=0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
__m128i workp_a, workp_b;
// Added before shift for rounding part of ROUND_POWER_OF_TWO
@@ -1101,42 +1184,36 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- }
- // lp filter
- __m128i ps[2], qs[2], p[2], q[2];
- {
- p[0] = *p0;
- p[1] = *p1;
- q[0] = *q0;
- q[1] = *q1;
- // filter_mask and hev_mask
- highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+ qs[0] = _mm_andnot_si128(flat, qs[0]);
+ oq0 = _mm_and_si128(flat, oq0);
+ *q0 = _mm_or_si128(qs[0], oq0);
+ qs[1] = _mm_andnot_si128(flat, qs[1]);
+ oq1 = _mm_and_si128(flat, oq1);
+ *q1 = _mm_or_si128(qs[1], oq1);
+ ps[0] = _mm_andnot_si128(flat, ps[0]);
+ op0 = _mm_and_si128(flat, op0);
+ *p0 = _mm_or_si128(ps[0], op0);
+ ps[1] = _mm_andnot_si128(flat, ps[1]);
+ op1 = _mm_and_si128(flat, op1);
+ *p1 = _mm_or_si128(ps[1], op1);
+ work_a = _mm_andnot_si128(flat, *q2);
+ *q2 = _mm_and_si128(flat, oq2);
+ *q2 = _mm_or_si128(work_a, *q2);
+ work_a = _mm_andnot_si128(flat, *p2);
+ *p2 = _mm_and_si128(flat, op2);
+ *p2 = _mm_or_si128(work_a, *p2);
+ } else {
+ *q0 = qs[0];
+ *q1 = qs[1];
+ *p0 = ps[0];
+ *p1 = ps[1];
- qs[0] = _mm_andnot_si128(flat, qs[0]);
- oq0 = _mm_and_si128(flat, oq0);
- *q0 = _mm_or_si128(qs[0], oq0);
- qs[1] = _mm_andnot_si128(flat, qs[1]);
- oq1 = _mm_and_si128(flat, oq1);
- *q1 = _mm_or_si128(qs[1], oq1);
- ps[0] = _mm_andnot_si128(flat, ps[0]);
- op0 = _mm_and_si128(flat, op0);
- *p0 = _mm_or_si128(ps[0], op0);
- ps[1] = _mm_andnot_si128(flat, ps[1]);
- op1 = _mm_and_si128(flat, op1);
- *p1 = _mm_or_si128(ps[1], op1);
- work_a = _mm_andnot_si128(flat, *q2);
- *q2 = _mm_and_si128(flat, oq2);
- *q2 = _mm_or_si128(work_a, *q2);
- work_a = _mm_andnot_si128(flat, *p2);
- *p2 = _mm_and_si128(flat, op2);
- *p2 = _mm_or_si128(work_a, *p2);
void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
index dea113a29..b9689202a 100644
--- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -110,7 +110,7 @@ static INLINE void quantize(const __m256i *qp, __m256i *c,
void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
+ const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
@@ -120,12 +120,23 @@ void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const unsigned int step = 8;
- if (LIKELY(!skip_block)) {
- __m256i qp[5], coeff;
- init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp);
- coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ __m256i qp[5], coeff;
+ init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp);
+ coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ __m256i eob = _mm256_setzero_si256();
+ quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ update_qp(qp);
- __m256i eob = _mm256_setzero_si256();
+ while (n_coeffs > 0) {
+ coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
coeff_ptr += step;
@@ -133,40 +144,17 @@ void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dqcoeff_ptr += step;
iscan += step;
n_coeffs -= step;
- update_qp(qp);
- while (n_coeffs > 0) {
- coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
- quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
- coeff_ptr += step;
- qcoeff_ptr += step;
- dqcoeff_ptr += step;
- iscan += step;
- n_coeffs -= step;
- }
- {
- __m256i eob_s;
- eob_s = _mm256_shuffle_epi32(eob, 0xe);
- eob = _mm256_max_epi16(eob, eob_s);
- eob_s = _mm256_shufflelo_epi16(eob, 0xe);
- eob = _mm256_max_epi16(eob, eob_s);
- eob_s = _mm256_shufflelo_epi16(eob, 1);
- eob = _mm256_max_epi16(eob, eob_s);
- const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
- _mm256_extractf128_si256(eob, 1));
- *eob_ptr = _mm_extract_epi16(final_eob, 0);
- }
- } else {
- do {
- const __m256i zero = _mm256_setzero_si256();
- _mm256_storeu_si256((__m256i *)qcoeff_ptr, zero);
- _mm256_storeu_si256((__m256i *)dqcoeff_ptr, zero);
- qcoeff_ptr += step;
- dqcoeff_ptr += step;
- n_coeffs -= step;
- } while (n_coeffs > 0);
- *eob_ptr = 0;
+ }
+ {
+ __m256i eob_s;
+ eob_s = _mm256_shuffle_epi32(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 1);
+ eob = _mm256_max_epi16(eob, eob_s);
+ const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
+ _mm256_extractf128_si256(eob, 1));
+ *eob_ptr = _mm_extract_epi16(final_eob, 0);
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
index 5570ca5b7..58e5f98e5 100644
--- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -16,7 +16,7 @@
#include "aom_ports/mem.h"
void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
- int skip_block, const int16_t *zbin_ptr,
+ const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
@@ -41,50 +41,48 @@ void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
- if (!skip_block) {
- // Pre-scan pass
- for (i = ((int)count / 4) - 1; i >= 0; i--) {
- __m128i coeffs, cmp1, cmp2;
- int test;
- coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
- cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
- cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
- cmp1 = _mm_and_si128(cmp1, cmp2);
- test = _mm_movemask_epi8(cmp1);
- if (test == 0xffff)
- non_zero_regs--;
- else
- break;
- }
+ // Pre-scan pass
+ for (i = ((int)count / 4) - 1; i >= 0; i--) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (test == 0xffff)
+ non_zero_regs--;
+ else
+ break;
+ }
- // Quantization pass:
- for (i = 0; i < non_zero_regs; i++) {
- __m128i coeffs, coeffs_sign, tmp1, tmp2;
- int test;
- int abs_coeff[4];
- int coeff_sign[4];
- coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
- coeffs_sign = _mm_srai_epi32(coeffs, 31);
- coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
- tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
- tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
- tmp1 = _mm_or_si128(tmp1, tmp2);
- test = _mm_movemask_epi8(tmp1);
- _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
- _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
- for (j = 0; j < 4; j++) {
- if (test & (1 << (4 * j))) {
- int k = 4 * i + j;
- const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
- const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
- const uint32_t abs_qcoeff =
- (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
- qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
- dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
- if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
- }
+ // Quantization pass:
+ for (i = 0; i < non_zero_regs; i++) {
+ __m128i coeffs, coeffs_sign, tmp1, tmp2;
+ int test;
+ int abs_coeff[4];
+ int coeff_sign[4];
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ coeffs_sign = _mm_srai_epi32(coeffs, 31);
+ coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+ tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+ tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+ tmp1 = _mm_or_si128(tmp1, tmp2);
+ test = _mm_movemask_epi8(tmp1);
+ _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
+ _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
+ for (j = 0; j < 4; j++) {
+ if (test & (1 << (4 * j))) {
+ int k = 4 * i + j;
+ const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
+ const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
+ qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
+ dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
+ if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
@@ -92,8 +90,8 @@ void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
void aom_highbd_quantize_b_32x32_sse2(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
- const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
@@ -116,38 +114,35 @@ void aom_highbd_quantize_b_32x32_sse2(
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
- if (!skip_block) {
- // Pre-scan pass
- for (i = 0; i < n_coeffs / 4; i++) {
- __m128i coeffs, cmp1, cmp2;
- int test;
- coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
- cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
- cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
- cmp1 = _mm_and_si128(cmp1, cmp2);
- test = _mm_movemask_epi8(cmp1);
- if (!(test & 0xf)) idx_arr[idx++] = i * 4;
- if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
- if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
- if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
- }
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs / 4; i++) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (!(test & 0xf)) idx_arr[idx++] = i * 4;
+ if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
+ if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
+ if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
+ }
- // Quantization pass: only process the coefficients selected in
- // pre-scan pass. Note: idx can be zero.
- for (i = 0; i < idx; i++) {
- const int rc = idx_arr[i];
- const int coeff = coeff_ptr[rc];
- const int coeff_sign = (coeff >> 31);
- const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- const int64_t tmp1 =
- abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
- const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
- const uint32_t abs_qcoeff =
- (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
- qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
- dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
- if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
- }
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = idx_arr[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+ qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+ if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
*eob_ptr = eob + 1;
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
new file mode 100644
index 000000000..9b1b4c9de
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
@@ -0,0 +1,140 @@
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <immintrin.h> // AVX2
+#include "config/aom_dsp_rtcd.h"
+typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);  // Per-tile kernel type passed to highbd_10_variance_avx2; the kernel fills *sse and *sum for one tile.
+void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,  // 8x8 tile: *sum = sum of (src - ref), *sse = sum of (src - ref)^2.
+ const uint16_t *ref, int ref_stride,  // Strides are counted in uint16_t elements (they are added to the typed pointers).
+ uint32_t *sse, int *sum) {
+ __m256i v_sum_d = _mm256_setzero_si256();  // running sums of differences, kept in 16-bit lanes
+ __m256i v_sse_d = _mm256_setzero_si256();  // running sums of squared differences, kept in 32-bit lanes
+ for (int i = 0; i < 8; i += 2) {  // two rows per iteration, four iterations
+ const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src);  // 8 src samples, row i
+ const __m128i v_p_a1 = _mm_loadu_si128((const __m128i *)(src + src_stride));  // 8 src samples, row i + 1
+ const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref);  // 8 ref samples, row i
+ const __m128i v_p_b1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride));  // 8 ref samples, row i + 1
+ __m256i v_p_a = _mm256_castsi128_si256(v_p_a0);
+ __m256i v_p_b = _mm256_castsi128_si256(v_p_b0);
+ v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1);  // row i in the low half, row i + 1 in the high half
+ v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1);
+ const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);  // NOTE(review): 16-bit subtract; assumes every difference fits in int16 - confirm against callers' bit depth
+ const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);  // squares each difference and adds adjacent pairs into 32-bit lanes
+ v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);  // NOTE(review): each 16-bit lane accumulates 4 differences; presumably cannot overflow for 10-bit samples - confirm
+ v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
+ src += src_stride * 2;
+ ref += ref_stride * 2;
+ }
+ __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d));  // sign-extend the low 8 sums to 32 bits
+ __m256i v_sum01 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1));  // sign-extend the high 8 sums to 32 bits
+ __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01);  // 8 partial sums, now 32-bit
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);  // interleave: even 32-bit elements hold sum, odd elements hold sse
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);  // fold the two 128-bit halves together
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));  // fold the upper 64 bits onto the lower: element 0 = total sum, element 1 = total sse
+ *sum = _mm_extract_epi32(v_d, 0);
+ *sse = _mm_extract_epi32(v_d, 1);
+void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride,  // 16x16 tile: *sum = sum of (src - ref), *sse = sum of (src - ref)^2.
+ const uint16_t *ref, int ref_stride,  // Strides are counted in uint16_t elements (they are added to the typed pointers).
+ uint32_t *sse, int *sum) {
+ __m256i v_sum_d = _mm256_setzero_si256();  // running sums of differences, kept in 16-bit lanes
+ __m256i v_sse_d = _mm256_setzero_si256();  // running sums of squared differences, kept in 32-bit lanes
+ const __m256i one = _mm256_set1_epi16(1);  // multiplier that turns madd into a pairwise 16-bit -> 32-bit add
+ for (int i = 0; i < 16; ++i) {  // one full 16-sample row per iteration; always reads 16 rows
+ const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);  // NOTE(review): 16-bit subtract; assumes every difference fits in int16 - confirm against callers' bit depth
+ const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);  // squares each difference and adds adjacent pairs into 32-bit lanes
+ v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);  // NOTE(review): each 16-bit lane accumulates 16 differences; presumably cannot overflow for 10-bit samples - confirm
+ v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
+ src += src_stride;
+ ref += ref_stride;
+ }
+ __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one);  // add adjacent 16-bit sums into 8 signed 32-bit partial sums
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);  // interleave: even 32-bit elements hold sum, odd elements hold sse
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);  // fold the two 128-bit halves together
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));  // fold the upper 64 bits onto the lower: element 0 = total sum, element 1 = total sse
+ *sum = _mm_extract_epi32(v_d, 0);
+ *sse = _mm_extract_epi32(v_d, 1);
+static void highbd_10_variance_avx2(const uint16_t *src, int src_stride,  // Walks a w x h area in block_size steps, calls var_fn per tile and accumulates its sse/sum results.
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;  // 64-bit accumulator for the per-tile 32-bit sse values
+ int32_t sum_long = 0;  // NOTE(review): 32-bit despite the "_long" name
+ for (i = 0; i < h; i += block_size) {  // NOTE(review): var_fn still runs once when h or w is smaller than block_size; presumably each kernel covers exactly block_size x block_size samples - confirm
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,  // top-left corner of tile (row i, column j)
+ ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
+ *sum = ROUND_POWER_OF_TWO(sum_long, 2);  // ROUND_POWER_OF_TWO is defined outside this chunk; presumably a rounded right shift, here scaling 10-bit sums to an 8-bit scale - TODO confirm
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);  // squared terms are scaled by 4 bits, twice the shift applied to the sum
+#define VAR_FN(w, h, block_size, shift) /* Defines aom_highbd_10_variance<w>x<h>_avx2, built on the <block_size>x<block_size> kernel. */ \
+ uint32_t aom_highbd_10_variance##w##x##h##_avx2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); /* CONVERT_TO_SHORTPTR is defined outside this chunk; presumably recovers the uint16_t sample pointer */ \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_10_variance_avx2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); /* sse - sum^2 / 2^shift; 64-bit so the square of sum cannot overflow */ \
+ return (var >= 0) ? (uint32_t)var : 0; /* negative results are clamped to 0; *sse keeps the value written by highbd_10_variance_avx2 */ \
+ }
+VAR_FN(128, 128, 16, 14);  // arguments: width, height, kernel tile size, shift; shift equals log2(width * height) on every line
+VAR_FN(128, 64, 16, 13);
+VAR_FN(64, 128, 16, 13);
+VAR_FN(64, 64, 16, 12);
+VAR_FN(64, 32, 16, 11);
+VAR_FN(32, 64, 16, 11);
+VAR_FN(32, 32, 16, 10);
+VAR_FN(32, 16, 16, 9);
+VAR_FN(16, 32, 16, 9);
+VAR_FN(16, 16, 16, 8);
+VAR_FN(16, 8, 8, 7);
+VAR_FN(8, 16, 8, 7);
+VAR_FN(8, 8, 8, 6);
+VAR_FN(16, 4, 16, 6);  // NOTE(review): height 4 with the 16x16 kernel, which loops over 16 rows, so one kernel call reads 12 rows past this block - confirm this variant is unused or intended
+VAR_FN(8, 32, 8, 8);
+VAR_FN(32, 8, 8, 8);
+VAR_FN(16, 64, 16, 10);
+VAR_FN(64, 16, 16, 10);
+#undef VAR_FN
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
index 131c16aa9..47b052abc 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
@@ -179,6 +179,9 @@ HIGH_GET_VAR(8);
return (var >= 0) ? (uint32_t)var : 0; \
+VAR_FN(128, 128, 16, 14);
+VAR_FN(128, 64, 16, 13);
+VAR_FN(64, 128, 16, 13);
VAR_FN(64, 64, 16, 12);
VAR_FN(64, 32, 16, 11);
VAR_FN(32, 64, 16, 11);
@@ -590,10 +593,10 @@ FNS(sse2);
void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
const struct AV1Common *const cm,
int mi_row, int mi_col, const MV *const mv,
- uint16_t *comp_pred, int width, int height,
+ uint8_t *comp_pred8, int width, int height,
int subpel_x_q3, int subpel_y_q3,
- const uint8_t *ref8, int ref_stride,
- int bd) {
+ const uint8_t *ref8, int ref_stride, int bd,
+ int subpel_search) {
// expect xd == NULL only in tests
if (xd != NULL) {
const MB_MODE_INFO *mi = xd->mi[0];
@@ -606,8 +609,6 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
if (is_scaled) {
// Note: This is mostly a copy from the >=8X8 case in
// build_inter_predictors() function, with some small tweaks.
- uint8_t *comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred);
// Some assumptions.
const int plane = 0;
@@ -661,7 +662,7 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
// Get convolve parameters.
- ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd);
+ ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
const InterpFilters filters =
@@ -677,10 +678,13 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
const InterpFilterParams *filter =
- av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+ (subpel_search == 1)
+ ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
+ : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
if (!subpel_x_q3 && !subpel_y_q3) {
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
if (width >= 8) {
int i;
assert(!(width & 7));
@@ -711,13 +715,13 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
} else if (!subpel_y_q3) {
const int16_t *const kernel =
av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
- aom_highbd_convolve8_horiz(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
- width, kernel, 16, NULL, -1, width, height, bd);
+ aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16,
+ NULL, -1, width, height, bd);
} else if (!subpel_x_q3) {
const int16_t *const kernel =
av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
- aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
- width, NULL, -1, kernel, 16, width, height, bd);
+ aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
+ kernel, 16, width, height, bd);
} else {
DECLARE_ALIGNED(16, uint16_t,
temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
@@ -734,30 +738,29 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
intermediate_height, bd);
CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
- MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
- 16, width, height, bd);
+ MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
+ bd);
void aom_highbd_comp_avg_upsampled_pred_sse2(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
- int ref_stride, int bd) {
- uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
- int n;
- int i;
- aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
+ int ref_stride, int bd, int subpel_search) {
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
- bd);
+ bd, subpel_search);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
/*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/
assert(!(width * height & 7));
- n = width * height >> 3;
- for (i = 0; i < n; i++) {
- __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred);
+ int n = width * height >> 3;
+ for (int i = 0; i < n; i++) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16);
__m128i p0 = _mm_loadu_si128((const __m128i *)pred);
- _mm_storeu_si128((__m128i *)comp_pred, _mm_avg_epu16(s0, p0));
- comp_pred += 8;
+ _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0));
+ comp_pred16 += 8;
pred += 8;
@@ -777,7 +780,7 @@ static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
xx_storeu_128(result, shift);
-void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred,
+void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8,
const uint8_t *pred8, int width,
int height, const uint8_t *ref8,
int ref_stride,
@@ -792,6 +795,7 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred,
_mm_set_epi16(round, round, round, round, round, round, round, round);
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
if (width >= 8) {
// Read 8 pixels one row at a time
@@ -830,15 +834,16 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred,
void aom_highbd_jnt_comp_avg_upsampled_pred_sse2(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
- int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param) {
+ int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
+ int subpel_search) {
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
int n;
int i;
- aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
- bd);
+ bd, subpel_search);
assert(!(width * height & 7));
n = width * height >> 3;
@@ -850,13 +855,14 @@ void aom_highbd_jnt_comp_avg_upsampled_pred_sse2(
const __m128i r =
_mm_set_epi16(round, round, round, round, round, round, round, round);
+ uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
for (i = 0; i < n; i++) {
- __m128i p0 = xx_loadu_128(comp_pred);
+ __m128i p0 = xx_loadu_128(comp_pred16);
__m128i p1 = xx_loadu_128(pred);
- highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+ highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16);
- comp_pred += 8;
+ comp_pred16 += 8;
pred += 8;
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
index 6c247a91b..df5449a9d 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
@@ -168,8 +168,8 @@ uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
- aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
- 4);
+ aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
+ CONVERT_TO_BYTEPTR(temp2), 4);
return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride,
@@ -188,8 +188,8 @@ uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
- aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
- 4);
+ aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
+ CONVERT_TO_BYTEPTR(temp2), 4);
return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
dst_stride, sse);
@@ -208,8 +208,8 @@ uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
- aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
- 4);
+ aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
+ CONVERT_TO_BYTEPTR(temp2), 4);
return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
dst_stride, sse);
diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
index eaf1f347b..f9a41a210 100644
--- a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
@@ -120,11 +120,11 @@ void aom_jnt_comp_avg_upsampled_pred_ssse3(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride, const JNT_COMP_PARAMS *jcp_param) {
+ int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) {
int n;
int i;
aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
- subpel_x_q3, subpel_y_q3, ref, ref_stride);
+ subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
/*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
assert(!(width * height & 15));
n = width * height >> 4;
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
deleted file mode 100644
index 18862dd3e..000000000
--- a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
+++ /dev/null
@@ -1,916 +0,0 @@
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <immintrin.h> /* AVX2 */
-#include "config/aom_dsp_rtcd.h"
-#include "aom_ports/mem.h"
-void aom_lpf_horizontal_16_avx2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
- __m128i mask, hev, flat, flat2;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi8(1);
- __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
- __m128i abs_p1p0;
- const __m128i thresh =
- _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
- const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
- const __m128i blimit =
- _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));
- q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
- q4p4 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
- q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
- q3p3 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
- q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
- q2p2 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
- q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
- q1p1 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
- p1q1 = _mm_shuffle_epi32(q1p1, 78);
- q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
- q0p0 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
- p0q0 = _mm_shuffle_epi32(q0p0, 78);
- {
- __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
- abs_p1p0 =
- _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1));
- abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
- fe = _mm_set1_epi8(0xfe);
- ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
- abs_p0q0 =
- _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0));
- abs_p1q1 =
- _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
- flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
- abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(abs_p1p0, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)),
- _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
- mask = _mm_max_epu8(work, mask);
- mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
- mask = _mm_subs_epu8(mask, limit);
- mask = _mm_cmpeq_epi8(mask, zero);
- }
- // lp filter
- {
- const __m128i t4 = _mm_set1_epi8(4);
- const __m128i t3 = _mm_set1_epi8(3);
- const __m128i t80 = _mm_set1_epi8(0x80);
- const __m128i t1 = _mm_set1_epi16(0x1);
- __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
- __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
- __m128i qs0 = _mm_xor_si128(p0q0, t80);
- __m128i qs1 = _mm_xor_si128(p1q1, t80);
- __m128i filt;
- __m128i work_a;
- __m128i filter1, filter2;
- __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
- __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
- filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
- work_a = _mm_subs_epi8(qs0, qs0ps0);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- /* (aom_filter + 3 * (qs0 - ps0)) & mask */
- filt = _mm_and_si128(filt, mask);
- filter1 = _mm_adds_epi8(filt, t4);
- filter2 = _mm_adds_epi8(filt, t3);
- filter1 = _mm_unpacklo_epi8(zero, filter1);
- filter1 = _mm_srai_epi16(filter1, 0xB);
- filter2 = _mm_unpacklo_epi8(zero, filter2);
- filter2 = _mm_srai_epi16(filter2, 0xB);
- /* Filter1 >> 3 */
- filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
- qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
- /* filt >> 1 */
- filt = _mm_adds_epi16(filter1, t1);
- filt = _mm_srai_epi16(filt, 1);
- filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
- filt);
- filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
- qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
- // loopfilter done
- {
- __m128i work;
- flat = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)),
- _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3)));
- flat = _mm_max_epu8(abs_p1p0, flat);
- flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
- q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
- q5p5 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));
- q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
- q6p6 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
- flat2 = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
- _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));
- q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
- q7p7 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
- _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7)));
- flat2 = _mm_max_epu8(work, flat2);
- flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
- flat2 = _mm_subs_epu8(flat2, one);
- flat2 = _mm_cmpeq_epi8(flat2, zero);
- flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
- }
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // flat and wide flat calculations
- {
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i four = _mm_set1_epi16(4);
- __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
- __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
- __m128i pixelFilter_p, pixelFilter_q;
- __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
- __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
- p7_16 = _mm_unpacklo_epi8(q7p7, zero);
- p6_16 = _mm_unpacklo_epi8(q6p6, zero);
- p5_16 = _mm_unpacklo_epi8(q5p5, zero);
- p4_16 = _mm_unpacklo_epi8(q4p4, zero);
- p3_16 = _mm_unpacklo_epi8(q3p3, zero);
- p2_16 = _mm_unpacklo_epi8(q2p2, zero);
- p1_16 = _mm_unpacklo_epi8(q1p1, zero);
- p0_16 = _mm_unpacklo_epi8(q0p0, zero);
- q0_16 = _mm_unpackhi_epi8(q0p0, zero);
- q1_16 = _mm_unpackhi_epi8(q1p1, zero);
- q2_16 = _mm_unpackhi_epi8(q2p2, zero);
- q3_16 = _mm_unpackhi_epi8(q3p3, zero);
- q4_16 = _mm_unpackhi_epi8(q4p4, zero);
- q5_16 = _mm_unpackhi_epi8(q5p5, zero);
- q6_16 = _mm_unpackhi_epi8(q6p6, zero);
- q7_16 = _mm_unpackhi_epi8(q7p7, zero);
- pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
- _mm_add_epi16(p4_16, p3_16));
- pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
- _mm_add_epi16(q4_16, q3_16));
- pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
- pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
- pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
- pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
- pixelFilter_p =
- _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
- pixetFilter_p2p1p0 = _mm_add_epi16(
- four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
- flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
- flat_q0p0 = _mm_packus_epi16(res_p, res_q);
- sum_p7 = _mm_add_epi16(p7_16, p7_16);
- sum_q7 = _mm_add_epi16(q7_16, q7_16);
- sum_p3 = _mm_add_epi16(p3_16, p3_16);
- sum_q3 = _mm_add_epi16(q3_16, q3_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
- flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
- pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
- pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
- flat_q1p1 = _mm_packus_epi16(res_p, res_q);
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- sum_p3 = _mm_add_epi16(sum_p3, p3_16);
- sum_q3 = _mm_add_epi16(sum_q3, q3_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
- flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
- pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
- pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
- flat_q2p2 = _mm_packus_epi16(res_p, res_q);
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
- flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
- flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
- flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
- flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
- }
- // wide flat
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- flat = _mm_shuffle_epi32(flat, 68);
- flat2 = _mm_shuffle_epi32(flat2, 68);
- q2p2 = _mm_andnot_si128(flat, q2p2);
- flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
- q2p2 = _mm_or_si128(q2p2, flat_q2p2);
- qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
- flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
- q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
- qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
- flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
- q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
- q6p6 = _mm_andnot_si128(flat2, q6p6);
- flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
- q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
- _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
- _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
- q5p5 = _mm_andnot_si128(flat2, q5p5);
- flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
- q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
- _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
- _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
- q4p4 = _mm_andnot_si128(flat2, q4p4);
- flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
- q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
- _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
- _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
- q3p3 = _mm_andnot_si128(flat2, q3p3);
- flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
- q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
- _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
- _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
- q2p2 = _mm_andnot_si128(flat2, q2p2);
- flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
- q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
- _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
- _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
- q1p1 = _mm_andnot_si128(flat2, q1p1);
- flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
- q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
- _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
- _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
- q0p0 = _mm_andnot_si128(flat2, q0p0);
- flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
- q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
- _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
- _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
- }
-DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
- 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
- 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
-void aom_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
- __m128i mask, hev, flat, flat2;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi8(1);
- __m128i p7, p6, p5;
- __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
- __m128i q5, q6, q7;
- __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4,
- p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
- const __m128i thresh =
- _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
- const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
- const __m128i blimit =
- _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));
- p256_4 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
- p256_3 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
- p256_2 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
- p256_1 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
- p256_0 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
- q256_0 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
- q256_1 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
- q256_2 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
- q256_3 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
- q256_4 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p)));
- p4 = _mm256_castsi256_si128(p256_4);
- p3 = _mm256_castsi256_si128(p256_3);
- p2 = _mm256_castsi256_si128(p256_2);
- p1 = _mm256_castsi256_si128(p256_1);
- p0 = _mm256_castsi256_si128(p256_0);
- q0 = _mm256_castsi256_si128(q256_0);
- q1 = _mm256_castsi256_si128(q256_1);
- q2 = _mm256_castsi256_si128(q256_2);
- q3 = _mm256_castsi256_si128(q256_3);
- q4 = _mm256_castsi256_si128(q256_4);
- {
- const __m128i abs_p1p0 =
- _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
- const __m128i abs_q1q0 =
- _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
- const __m128i fe = _mm_set1_epi8(0xfe);
- const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
- __m128i abs_p0q0 =
- _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
- __m128i abs_p1q1 =
- _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
- __m128i work;
- flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
- abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(flat, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
- _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
- mask = _mm_max_epu8(work, mask);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
- _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
- mask = _mm_max_epu8(work, mask);
- mask = _mm_subs_epu8(mask, limit);
- mask = _mm_cmpeq_epi8(mask, zero);
- }
- // lp filter
- {
- const __m128i t4 = _mm_set1_epi8(4);
- const __m128i t3 = _mm_set1_epi8(3);
- const __m128i t80 = _mm_set1_epi8(0x80);
- const __m128i te0 = _mm_set1_epi8(0xe0);
- const __m128i t1f = _mm_set1_epi8(0x1f);
- const __m128i t1 = _mm_set1_epi8(0x1);
- const __m128i t7f = _mm_set1_epi8(0x7f);
- __m128i ps1 = _mm_xor_si128(p1, t80);
- __m128i ps0 = _mm_xor_si128(p0, t80);
- __m128i qs0 = _mm_xor_si128(q0, t80);
- __m128i qs1 = _mm_xor_si128(q1, t80);
- __m128i filt;
- __m128i work_a;
- __m128i filter1, filter2;
- __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
- flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5,
- flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;
- filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
- work_a = _mm_subs_epi8(qs0, ps0);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- /* (aom_filter + 3 * (qs0 - ps0)) & mask */
- filt = _mm_and_si128(filt, mask);
- filter1 = _mm_adds_epi8(filt, t4);
- filter2 = _mm_adds_epi8(filt, t3);
- /* Filter1 >> 3 */
- work_a = _mm_cmpgt_epi8(zero, filter1);
- filter1 = _mm_srli_epi16(filter1, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter1 = _mm_and_si128(filter1, t1f);
- filter1 = _mm_or_si128(filter1, work_a);
- qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
- /* Filter2 >> 3 */
- work_a = _mm_cmpgt_epi8(zero, filter2);
- filter2 = _mm_srli_epi16(filter2, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter2 = _mm_and_si128(filter2, t1f);
- filter2 = _mm_or_si128(filter2, work_a);
- ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
- /* filt >> 1 */
- filt = _mm_adds_epi8(filter1, t1);
- work_a = _mm_cmpgt_epi8(zero, filt);
- filt = _mm_srli_epi16(filt, 1);
- work_a = _mm_and_si128(work_a, t80);
- filt = _mm_and_si128(filt, t7f);
- filt = _mm_or_si128(filt, work_a);
- filt = _mm_andnot_si128(hev, filt);
- ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
- qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
- // loopfilter done
- {
- __m128i work;
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
- _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
- flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
- _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
- flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
- _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
- p256_5 = _mm256_castpd_si256(
- _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
- q256_5 = _mm256_castpd_si256(
- _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
- p5 = _mm256_castsi256_si128(p256_5);
- q5 = _mm256_castsi256_si128(q256_5);
- flat2 = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
- _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
- flat2 = _mm_max_epu8(work, flat2);
- p256_6 = _mm256_castpd_si256(
- _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
- q256_6 = _mm256_castpd_si256(
- _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));
- p6 = _mm256_castsi256_si128(p256_6);
- q6 = _mm256_castsi256_si128(q256_6);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
- _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
- flat2 = _mm_max_epu8(work, flat2);
- p256_7 = _mm256_castpd_si256(
- _mm256_broadcast_pd((__m128d const *)(s - 8 * p)));
- q256_7 = _mm256_castpd_si256(
- _mm256_broadcast_pd((__m128d const *)(s + 7 * p)));
- p7 = _mm256_castsi256_si128(p256_7);
- q7 = _mm256_castsi256_si128(q256_7);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
- _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
- flat2 = _mm_max_epu8(work, flat2);
- flat2 = _mm_subs_epu8(flat2, one);
- flat2 = _mm_cmpeq_epi8(flat2, zero);
- flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
- }
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // flat and wide flat calculations
- {
- const __m256i eight = _mm256_set1_epi16(8);
- const __m256i four = _mm256_set1_epi16(4);
- __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
- pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
- const __m256i filter =
- _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
- p256_7 = _mm256_shuffle_epi8(p256_7, filter);
- p256_6 = _mm256_shuffle_epi8(p256_6, filter);
- p256_5 = _mm256_shuffle_epi8(p256_5, filter);
- p256_4 = _mm256_shuffle_epi8(p256_4, filter);
- p256_3 = _mm256_shuffle_epi8(p256_3, filter);
- p256_2 = _mm256_shuffle_epi8(p256_2, filter);
- p256_1 = _mm256_shuffle_epi8(p256_1, filter);
- p256_0 = _mm256_shuffle_epi8(p256_0, filter);
- q256_0 = _mm256_shuffle_epi8(q256_0, filter);
- q256_1 = _mm256_shuffle_epi8(q256_1, filter);
- q256_2 = _mm256_shuffle_epi8(q256_2, filter);
- q256_3 = _mm256_shuffle_epi8(q256_3, filter);
- q256_4 = _mm256_shuffle_epi8(q256_4, filter);
- q256_5 = _mm256_shuffle_epi8(q256_5, filter);
- q256_6 = _mm256_shuffle_epi8(q256_6, filter);
- q256_7 = _mm256_shuffle_epi8(q256_7, filter);
- pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
- _mm256_add_epi16(p256_4, p256_3));
- pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
- _mm256_add_epi16(q256_4, q256_3));
- pixetFilter_p2p1p0 =
- _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
- pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
- pixetFilter_q2q1q0 =
- _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
- pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
- pixelFilter_p = _mm256_add_epi16(
- eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
- pixetFilter_p2p1p0 = _mm256_add_epi16(
- four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4);
- flat2_p0 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4);
- flat2_q0 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- res_p =
- _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
- _mm256_add_epi16(p256_3, p256_0)),
- 3);
- flat_p0 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- res_q =
- _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
- _mm256_add_epi16(q256_3, q256_0)),
- 3);
- flat_q0 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- sum_p7 = _mm256_add_epi16(p256_7, p256_7);
- sum_q7 = _mm256_add_epi16(q256_7, q256_7);
- sum_p3 = _mm256_add_epi16(p256_3, p256_3);
- sum_q3 = _mm256_add_epi16(q256_3, q256_3);
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4);
- flat2_p1 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4);
- flat2_q1 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
- pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);
- res_p =
- _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
- _mm256_add_epi16(sum_p3, p256_1)),
- 3);
- flat_p1 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- res_q =
- _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
- _mm256_add_epi16(sum_q3, q256_1)),
- 3);
- flat_q1 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
- sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
- sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
- sum_q3 = _mm256_add_epi16(sum_q3, q256_3);
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4);
- flat2_p2 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4);
- flat2_q2 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
- pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);
- res_p =
- _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
- _mm256_add_epi16(sum_p3, p256_2)),
- 3);
- flat_p2 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- res_q =
- _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
- _mm256_add_epi16(sum_q3, q256_2)),
- 3);
- flat_q2 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
- sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4);
- flat2_p3 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4);
- flat2_q3 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
- sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4);
- flat2_p4 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4);
- flat2_q4 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
- sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4);
- flat2_p5 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4);
- flat2_q5 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
- sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4);
- flat2_p6 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4);
- flat2_q6 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- }
- // wide flat
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- p2 = _mm_andnot_si128(flat, p2);
- flat_p2 = _mm_and_si128(flat, flat_p2);
- p2 = _mm_or_si128(flat_p2, p2);
- p1 = _mm_andnot_si128(flat, ps1);
- flat_p1 = _mm_and_si128(flat, flat_p1);
- p1 = _mm_or_si128(flat_p1, p1);
- p0 = _mm_andnot_si128(flat, ps0);
- flat_p0 = _mm_and_si128(flat, flat_p0);
- p0 = _mm_or_si128(flat_p0, p0);
- q0 = _mm_andnot_si128(flat, qs0);
- flat_q0 = _mm_and_si128(flat, flat_q0);
- q0 = _mm_or_si128(flat_q0, q0);
- q1 = _mm_andnot_si128(flat, qs1);
- flat_q1 = _mm_and_si128(flat, flat_q1);
- q1 = _mm_or_si128(flat_q1, q1);
- q2 = _mm_andnot_si128(flat, q2);
- flat_q2 = _mm_and_si128(flat, flat_q2);
- q2 = _mm_or_si128(flat_q2, q2);
- p6 = _mm_andnot_si128(flat2, p6);
- flat2_p6 = _mm_and_si128(flat2, flat2_p6);
- p6 = _mm_or_si128(flat2_p6, p6);
- _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
- p5 = _mm_andnot_si128(flat2, p5);
- flat2_p5 = _mm_and_si128(flat2, flat2_p5);
- p5 = _mm_or_si128(flat2_p5, p5);
- _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
- p4 = _mm_andnot_si128(flat2, p4);
- flat2_p4 = _mm_and_si128(flat2, flat2_p4);
- p4 = _mm_or_si128(flat2_p4, p4);
- _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
- p3 = _mm_andnot_si128(flat2, p3);
- flat2_p3 = _mm_and_si128(flat2, flat2_p3);
- p3 = _mm_or_si128(flat2_p3, p3);
- _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
- p2 = _mm_andnot_si128(flat2, p2);
- flat2_p2 = _mm_and_si128(flat2, flat2_p2);
- p2 = _mm_or_si128(flat2_p2, p2);
- _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
- p1 = _mm_andnot_si128(flat2, p1);
- flat2_p1 = _mm_and_si128(flat2, flat2_p1);
- p1 = _mm_or_si128(flat2_p1, p1);
- _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
- p0 = _mm_andnot_si128(flat2, p0);
- flat2_p0 = _mm_and_si128(flat2, flat2_p0);
- p0 = _mm_or_si128(flat2_p0, p0);
- _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
- q0 = _mm_andnot_si128(flat2, q0);
- flat2_q0 = _mm_and_si128(flat2, flat2_q0);
- q0 = _mm_or_si128(flat2_q0, q0);
- _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
- q1 = _mm_andnot_si128(flat2, q1);
- flat2_q1 = _mm_and_si128(flat2, flat2_q1);
- q1 = _mm_or_si128(flat2_q1, q1);
- _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
- q2 = _mm_andnot_si128(flat2, q2);
- flat2_q2 = _mm_and_si128(flat2, flat2_q2);
- q2 = _mm_or_si128(flat2_q2, q2);
- _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
- q3 = _mm_andnot_si128(flat2, q3);
- flat2_q3 = _mm_and_si128(flat2, flat2_q3);
- q3 = _mm_or_si128(flat2_q3, q3);
- _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
- q4 = _mm_andnot_si128(flat2, q4);
- flat2_q4 = _mm_and_si128(flat2, flat2_q4);
- q4 = _mm_or_si128(flat2_q4, q4);
- _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
- q5 = _mm_andnot_si128(flat2, q5);
- flat2_q5 = _mm_and_si128(flat2, flat2_q5);
- q5 = _mm_or_si128(flat2_q5, q5);
- _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
- q6 = _mm_andnot_si128(flat2, q6);
- flat2_q6 = _mm_and_si128(flat2, flat2_q6);
- q6 = _mm_or_si128(flat2_q6, q6);
- _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
- }
- _mm256_zeroupper();
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
index f1eac233b..9d88b5e49 100644
--- a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
+++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
@@ -249,6 +249,63 @@ static INLINE void transpose16x8_8x16_sse2(
*d7 = _mm_unpackhi_epi64(w7, w15);
+// this function treats its input as 2 parallel 8x4 matrices, transposes each of
+// them independently while flipping the second matrix horizontally. Used for
+// 14 taps filter pq pairs inverse
+static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7,
+ __m128i *pq0, __m128i *pq1,
+ __m128i *pq2, __m128i *pq3) {
+ __m128i w10, w11, w12, w13;
+ __m128i w0, w1, w2, w3, w4, w5;
+ __m128i d0, d1, d2, d3;
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ w2 = _mm_unpacklo_epi8(
+ *x4, *x5); // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ w3 = _mm_unpacklo_epi8(
+ *x6, *x7); // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ w4 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpacklo_epi16(
+ w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ d0 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ d2 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ w10 = _mm_unpacklo_epi8(
+ *x7, *x6); // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13
+ w11 = _mm_unpacklo_epi8(
+ *x5, *x4); // q xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33
+ w12 = _mm_unpacklo_epi8(
+ *x3, *x2); // q xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53
+ w13 = _mm_unpacklo_epi8(
+ *x1, *x0); // q xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73
+ w4 = _mm_unpackhi_epi16(
+ w10, w11); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpackhi_epi16(
+ w12, w13); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ d1 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ d3 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ *pq0 = _mm_unpacklo_epi64(d0, d1); // pq
+ *pq1 = _mm_unpackhi_epi64(d0, d1); // pq
+ *pq2 = _mm_unpacklo_epi64(d2, d3); // pq
+ *pq3 = _mm_unpackhi_epi64(d2, d3); // pq
static INLINE void transpose8x16_16x8_sse2(
__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
__m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
@@ -300,9 +357,120 @@ static INLINE void transpose8x16_16x8_sse2(
*d14d15 = _mm_unpackhi_epi64(w7, w15);
+// this function treats its input as 2 parallel 8x4 matrices, transposes each of
+// them to 4x8 independently while flipping the second matrix horizontally.
+// Used for 14 taps pq pairs creation
+static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *q0p0,
+ __m128i *q1p1, __m128i *q2p2,
+ __m128i *q3p3, __m128i *q4p4,
+ __m128i *q5p5, __m128i *q6p6,
+ __m128i *q7p7) {
+ __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3;
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ w2 = _mm_unpackhi_epi8(
+ *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115
+ w3 = _mm_unpackhi_epi8(
+ *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315
+ ww0 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ww1 = _mm_unpackhi_epi16(
+ w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ww2 = _mm_unpacklo_epi16(
+ w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311
+ ww3 = _mm_unpackhi_epi16(
+ w2,
+ w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315
+ *q7p7 = _mm_unpacklo_epi32(
+ ww0,
+ _mm_srli_si128(
+ ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx
+ *q6p6 = _mm_unpackhi_epi32(
+ _mm_slli_si128(ww0, 4),
+ ww3); // 01 11 21 31 014 114 214 314 xx xx xx xxxx xx xx xx
+ *q5p5 = _mm_unpackhi_epi32(
+ ww0,
+ _mm_slli_si128(
+ ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx x xx xx xx xxx
+ *q4p4 = _mm_unpacklo_epi32(
+ _mm_srli_si128(ww0, 12),
+ ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx
+ *q3p3 = _mm_unpacklo_epi32(
+ ww1,
+ _mm_srli_si128(
+ ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx
+ *q2p2 = _mm_unpackhi_epi32(
+ _mm_slli_si128(ww1, 4),
+ ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx
+ *q1p1 = _mm_unpackhi_epi32(
+ ww1,
+ _mm_slli_si128(
+ ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx
+ *q0p0 = _mm_unpacklo_epi32(
+ _mm_srli_si128(ww1, 12),
+ ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx
static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0,
__m128i *hev, __m128i *mask,
__m128i *qs1qs0, __m128i *ps1ps0) {
+ __m128i filter, filter2filter1, work;
+ __m128i ps1ps0_work, qs1qs0_work;
+ __m128i hev1;
+ const __m128i t3t4 =
+ _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i ff = _mm_cmpeq_epi8(t80, t80);
+ ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
+ qs1qs0_work = _mm_xor_si128(*q1q0, t80);
+ /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
+ work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
+ filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev);
+ /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
+ filter = _mm_subs_epi8(filter, work);
+ filter = _mm_subs_epi8(filter, work);
+ filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */
+ filter = _mm_and_si128(filter, *mask); /* & mask */
+ filter = _mm_unpacklo_epi32(filter, filter);
+ /* filter1 = signed_char_clamp(filter + 4) >> 3; */
+ /* filter2 = signed_char_clamp(filter + 3) >> 3; */
+ filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
+ filter2filter1 =
+ _mm_unpacklo_epi8(filter2filter1, filter2filter1); // goto 16 bit
+ filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */
+ filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1);
+ /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
+ filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */
+ filter = _mm_unpacklo_epi8(filter, filter); // goto 16 bit
+ filter = _mm_srai_epi16(filter, 9); /* round */
+ filter = _mm_packs_epi16(filter, filter);
+ filter = _mm_andnot_si128(*hev, filter);
+ filter = _mm_unpacklo_epi32(filter, filter);
+ filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter);
+ hev1 = _mm_srli_si128(filter2filter1, 8);
+ /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
+ qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
+ /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
+ ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
+ *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
+ *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
+static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0,
+ __m128i *hev, __m128i *mask,
+ __m128i *qs1qs0,
+ __m128i *ps1ps0) {
const __m128i t3t4 =
_mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);
const __m128i t80 = _mm_set1_epi8(0x80);
@@ -356,6 +524,49 @@ static AOM_FORCE_INLINE void lpf_internal_4_sse2(
__m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
__m128i q1p1, q0p0, p1p0, q1q0;
__m128i abs_p0q0, abs_p1q1;
+ __m128i mask, flat, hev;
+ const __m128i zero = _mm_setzero_si128();
+ q1p1 = _mm_unpacklo_epi32(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi32(*p0, *q0);
+ p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);
+ q1q0 = _mm_srli_si128(p1p0, 8);
+ /* abs(q1 - q0), abs(p1 - p0) */
+ flat = abs_diff(q1p1, q0p0);
+ /* abs(p1 - q1), abs(p0 - q0) */
+ __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
+ /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+ hev = _mm_unpacklo_epi8(flat, zero);
+ hev = _mm_cmpgt_epi16(hev, *thresh);
+ hev = _mm_packs_epi16(hev, hev);
+ hev = _mm_unpacklo_epi32(hev, hev);
+ abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
+ abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4); /* abs(p1 - q1) */
+ abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
+ abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
+ mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
+ mask = _mm_unpacklo_epi32(mask, flat);
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4));
+ filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2(
+ __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
+ __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
+ __m128i q1p1, q0p0, p1p0, q1q0;
+ __m128i abs_p0q0, abs_p1q1;
__m128i mask, hev;
const __m128i zero = _mm_setzero_si128();
@@ -390,14 +601,14 @@ static AOM_FORCE_INLINE void lpf_internal_4_sse2(
mask = _mm_cmpeq_epi8(mask, zero);
mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));
- filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+ filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
const uint8_t *_blimit, const uint8_t *_limit,
const uint8_t *_thresh) {
const __m128i zero = _mm_setzero_si128();
- __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+ __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
_mm_loadl_epi64((const __m128i *)_limit));
__m128i thresh =
_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
@@ -413,9 +624,9 @@ void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0);
xx_storel_32(s - 1 * p, ps1ps0);
- xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 8));
+ xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4));
xx_storel_32(s + 0 * p, qs1qs0);
- xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 8));
+ xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4));
void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
@@ -425,7 +636,7 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
__m128i p1, p0, q0, q1;
const __m128i zero = _mm_setzero_si128();
- __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+ __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
_mm_loadl_epi64((const __m128i *)_limit));
__m128i thresh =
_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
@@ -442,8 +653,8 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0);
// Transpose 8x4 to 4x8
- p1 = _mm_srli_si128(p1p0, 8);
- q1 = _mm_srli_si128(q1q0, 8);
+ p1 = _mm_srli_si128(p1p0, 4);
+ q1 = _mm_srli_si128(q1q0, 4);
transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
@@ -455,10 +666,10 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) {
xx_storel_32(s - (num + 1) * p, x);
- xx_storel_32(s + num * p, _mm_srli_si128(x, 8));
+ xx_storel_32(s + num * p, _mm_srli_si128(x, 4));
-static AOM_FORCE_INLINE void lpf_internal_14_sse2(
+static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2(
__m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
__m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
__m128i *thresh) {
@@ -503,38 +714,31 @@ static AOM_FORCE_INLINE void lpf_internal_14_sse2(
mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
mask = _mm_subs_epu8(mask, *limit);
mask = _mm_cmpeq_epi8(mask, zero);
- // replicate for the further "merged variables" usage
- mask = _mm_unpacklo_epi64(mask, mask);
// lp filter - the same for 6, 8 and 14 versions
- filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+ filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0);
qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0);
// loopfilter done
__m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
__m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
- {
- __m128i work;
- flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
- flat = _mm_max_epu8(abs_p1p0, flat);
- flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
- flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
- work = abs_diff(*q6p6, *q0p0);
- flat2 = _mm_max_epu8(work, flat2);
- flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
- flat2 = _mm_subs_epu8(flat2, one);
- flat2 = _mm_cmpeq_epi8(flat2, zero);
- flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
- }
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // flat and wide flat calculations
- {
+ __m128i work;
+ flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ // If flat == 0 then flat2 is zero as well, so none of the calculations
+ // below are needed. With SSE4.1: if (0 == _mm_test_all_zeros(flat, ff))
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
const __m128i eight = _mm_set1_epi16(8);
const __m128i four = _mm_set1_epi16(4);
__m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
@@ -619,137 +823,413 @@ static AOM_FORCE_INLINE void lpf_internal_14_sse2(
_mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
flat_q1p1 = _mm_packus_epi16(res_p, res_q);
- sum_p6 = _mm_add_epi16(sum_p6, p6_16);
- sum_q6 = _mm_add_epi16(sum_q6, q6_16);
- sum_p3 = _mm_add_epi16(sum_p3, p3_16);
- sum_q3 = _mm_add_epi16(sum_q3, q3_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))),
- 4);
- flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+ sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+ sum_q3 = _mm_add_epi16(sum_q3, q3_16);
res_p = _mm_srli_epi16(
_mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
res_q = _mm_srli_epi16(
_mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
flat_q2p2 = _mm_packus_epi16(res_p, res_q);
- sum_p6 = _mm_add_epi16(sum_p6, p6_16);
- sum_q6 = _mm_add_epi16(sum_q6, q6_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))),
- 4);
- flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
- sum_p6 = _mm_add_epi16(sum_p6, p6_16);
- sum_q6 = _mm_add_epi16(sum_q6, q6_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))),
- 4);
- flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
- sum_p6 = _mm_add_epi16(sum_p6, p6_16);
- sum_q6 = _mm_add_epi16(sum_q6, q6_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+ // work with flat2
+ flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
+ work = abs_diff(*q6p6, *q0p0);
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
- res_p = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))),
- 4);
- flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+ // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ flat = _mm_unpacklo_epi64(flat, flat);
+ *q2p2 = _mm_andnot_si128(flat, *q2p2);
+ flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+ *q2p2 = _mm_or_si128(*q2p2, flat_q2p2);
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+ *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+ *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))),
+ 4);
+ flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))),
+ 4);
+ flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))),
+ 4);
+ flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))),
+ 4);
+ flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ flat2 = _mm_unpacklo_epi64(flat2, flat2);
+ *q5p5 = _mm_andnot_si128(flat2, *q5p5);
+ flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+ *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5);
+ *q4p4 = _mm_andnot_si128(flat2, *q4p4);
+ flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+ *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4);
+ *q3p3 = _mm_andnot_si128(flat2, *q3p3);
+ flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+ *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3);
+ *q2p2 = _mm_andnot_si128(flat2, *q2p2);
+ flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+ *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2);
+ *q1p1 = _mm_andnot_si128(flat2, *q1p1);
+ flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+ *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1);
+ *q0p0 = _mm_andnot_si128(flat2, *q0p0);
+ flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+ *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0);
+ }
+ } else {
+ *q0p0 = qs0ps0;
+ *q1p1 = qs1ps1;
- // wide flat
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- flat = _mm_shuffle_epi32(flat, 68);
- flat2 = _mm_shuffle_epi32(flat2, 68);
+static AOM_FORCE_INLINE void lpf_internal_14_sse2(
+ __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
+ __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
+ __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i mask, hev, flat, flat2;
+ __m128i flat2_pq[6], flat_pq[3];
+ __m128i qs0ps0, qs1ps1;
+ __m128i p1p0, q1q0, qs1qs0, ps1ps0;
+ __m128i abs_p1p0;
- *q2p2 = _mm_andnot_si128(flat, *q2p2);
- flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
- *q2p2 = _mm_or_si128(*q2p2, flat_q2p2);
+ p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1);
+ q1q0 = _mm_srli_si128(p1p0, 8);
- qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
- flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
- *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+ __m128i fe, ff, work;
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0;
+ abs_p1p0 = abs_diff(*q1p1, *q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
+ fe = _mm_set1_epi8(0xfe);
+ ff = _mm_cmpeq_epi8(fe, fe);
+ abs_p0q0 = abs_diff(p1p0, q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
- qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
- flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
- *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- *q5p5 = _mm_andnot_si128(flat2, *q5p5);
- flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
- *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ // replicate for the further "merged variables" usage
+ hev = _mm_unpacklo_epi32(hev, hev);
- *q4p4 = _mm_andnot_si128(flat2, *q4p4);
- flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
- *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4);
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_unpacklo_epi32(mask, zero);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
- *q3p3 = _mm_andnot_si128(flat2, *q3p3);
- flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
- *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3);
+ work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
- *q2p2 = _mm_andnot_si128(flat2, *q2p2);
- flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
- *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2);
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+ qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0);
+ qs1ps1 = _mm_srli_si128(qs0ps0, 8);
+ // loopfilter done
- *q1p1 = _mm_andnot_si128(flat2, *q1p1);
- flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
- *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1);
+ flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ flat = _mm_unpacklo_epi32(flat, flat);
+ flat = _mm_unpacklo_epi64(flat, flat);
+ // If flat == 0 then flat2 is zero as well, so none of the calculations
+ // below are needed. With SSE4.1: if (0 == _mm_test_all_zeros(flat, ff))
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pq_16[7];
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i sum_p6;
+ __m128i sum_p3;
+ pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero);
+ pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero);
+ pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero);
+ pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero);
+ pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero);
+ pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero);
+ pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero);
+ q0_16 = _mm_srli_si128(pq_16[0], 8);
+ q1_16 = _mm_srli_si128(pq_16[1], 8);
+ q2_16 = _mm_srli_si128(pq_16[2], 8);
+ q3_16 = _mm_srli_si128(pq_16[3], 8);
+ q4_16 = _mm_srli_si128(pq_16[4], 8);
+ q5_16 = _mm_srli_si128(pq_16[5], 8);
+ __m128i flat_p[3], flat_q[3];
+ __m128i flat2_p[6], flat2_q[6];
+ __m128i work0, work0_0, work0_1, sum_p_0;
+ __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3]));
+ __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1]));
+ sum_p = _mm_add_epi16(sum_p, sum_lp);
+ __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
+ __m128i sum_q = _mm_srli_si128(sum_p, 8);
+ sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+ sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
+ flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0]));
+ flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16));
+ sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]);
+ sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]);
+ sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]);
+ sum_p = _mm_sub_epi16(sum_p_0, q5_16);
+ work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]);
+ work0_1 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0])));
+ sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]);
+ sum_lp = _mm_sub_epi16(sum_lp, q2_16);
+ work0 = _mm_add_epi16(sum_p3, pq_16[1]);
+ flat_p[1] = _mm_add_epi16(sum_lp, work0);
+ flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+ flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
+ flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
+ flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]);
+ flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]);
+ sum_lp = _mm_sub_epi16(sum_lp, q1_16);
+ sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]);
+ sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]);
+ work0 = _mm_add_epi16(sum_p3, pq_16[2]);
+ flat_p[2] = _mm_add_epi16(sum_lp, work0);
+ flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+ flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
+ flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]);
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
- *q0p0 = _mm_andnot_si128(flat2, *q0p0);
- flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
- *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0);
+ work = abs_diff(*q6p6, *q0p0);
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ flat2 = _mm_unpacklo_epi32(flat2, flat2);
+ // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_pq[0] = _mm_and_si128(flat, flat_pq[0]);
+ *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]);
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_pq[1] = _mm_and_si128(flat, flat_pq[1]);
+ *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]);
+ *q2p2 = _mm_andnot_si128(flat, *q2p2);
+ flat_pq[2] = _mm_and_si128(flat, flat_pq[2]);
+ *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]);
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
+ flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16));
+ flat2_q[0] = _mm_add_epi16(
+ sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0]));
+ flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
+ flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
+ flat2_pq[0] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
+ flat2_pq[1] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
+ flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]);
+ flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]);
+ sum_p = _mm_sub_epi16(sum_p, q4_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[4]);
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1])));
+ flat2_p[2] = _mm_add_epi16(sum_p, work0);
+ flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[2] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
+ flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]);
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ sum_p = _mm_sub_epi16(sum_p, q3_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[3]);
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2])));
+ flat2_p[3] = _mm_add_epi16(sum_p, work0);
+ flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[3] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
+ flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]);
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ sum_p = _mm_sub_epi16(sum_p, q2_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[2]);
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3])));
+ flat2_p[4] = _mm_add_epi16(sum_p, work0);
+ flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[4] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
+ flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]);
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ sum_p = _mm_sub_epi16(sum_p, q1_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[1]);
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4])));
+ flat2_p[5] = _mm_add_epi16(sum_p, work0);
+ flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[5] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
+ flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]);
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *q0p0 = _mm_andnot_si128(flat2, *q0p0);
+ flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]);
+ *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]);
+ *q1p1 = _mm_andnot_si128(flat2, *q1p1);
+ flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]);
+ *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]);
+ *q2p2 = _mm_andnot_si128(flat2, *q2p2);
+ flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]);
+ *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]);
+ *q3p3 = _mm_andnot_si128(flat2, *q3p3);
+ flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]);
+ *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]);
+ *q4p4 = _mm_andnot_si128(flat2, *q4p4);
+ flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]);
+ *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]);
+ *q5p5 = _mm_andnot_si128(flat2, *q5p5);
+ flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]);
+ *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]);
+ }
+ } else {
+ *q0p0 = qs0ps0;
+ *q1p1 = qs1ps1;
+ }
void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,
@@ -761,22 +1241,22 @@ void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,
__m128i limit = _mm_load_si128((const __m128i *)_limit);
__m128i thresh = _mm_load_si128((const __m128i *)_thresh);
- q4p4 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 5 * p)),
+ q4p4 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 5 * p)),
_mm_cvtsi32_si128(*(int *)(s + 4 * p)));
- q3p3 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 4 * p)),
+ q3p3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 4 * p)),
_mm_cvtsi32_si128(*(int *)(s + 3 * p)));
- q2p2 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 3 * p)),
+ q2p2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 3 * p)),
_mm_cvtsi32_si128(*(int *)(s + 2 * p)));
- q1p1 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 2 * p)),
+ q1p1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 2 * p)),
_mm_cvtsi32_si128(*(int *)(s + 1 * p)));
- q0p0 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 1 * p)),
+ q0p0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 1 * p)),
_mm_cvtsi32_si128(*(int *)(s - 0 * p)));
- q5p5 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 6 * p)),
+ q5p5 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 6 * p)),
_mm_cvtsi32_si128(*(int *)(s + 5 * p)));
- q6p6 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 7 * p)),
+ q6p6 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 7 * p)),
_mm_cvtsi32_si128(*(int *)(s + 6 * p)));
lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
@@ -790,7 +1270,7 @@ void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,
store_buffer_horz_8(q5p5, p, 5, s);
-static AOM_FORCE_INLINE void lpf_internal_6_sse2(
+static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2(
__m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
__m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
__m128i *thresh) {
@@ -810,6 +1290,7 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2(
const __m128i one = _mm_set1_epi8(1);
const __m128i fe = _mm_set1_epi8(0xfe);
const __m128i ff = _mm_cmpeq_epi8(fe, fe);
// filter_mask and hev_mask
__m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
@@ -847,8 +1328,9 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2(
mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
mask = _mm_subs_epu8(mask, *limit);
mask = _mm_cmpeq_epi8(mask, zero);
- // replicate for the further "merged variables" usage
- mask = _mm_unpacklo_epi64(mask, mask);
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
// flat_mask
flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
@@ -861,9 +1343,9 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2(
// 5 tap filter
- {
+ // only needed if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
const __m128i four = _mm_set1_epi16(4);
__m128i workp_a, workp_b, workp_shft0, workp_shft1;
p2_16 = _mm_unpacklo_epi8(*p2, zero);
p1_16 = _mm_unpacklo_epi8(*p1, zero);
@@ -906,18 +1388,149 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2(
3); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0);
+ *q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0);
+ *p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
- // lp filter - the same for 6, 8 and 14 versions
- filter4_sse2(p1p0, q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+static AOM_FORCE_INLINE void lpf_internal_6_sse2(
+ __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
+ __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
+ __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, hev, flat;
+ __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
+ __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16;
+ __m128i ps1ps0, qs1qs0;
+ q2p2 = _mm_unpacklo_epi32(*p2, *q2);
+ q1p1 = _mm_unpacklo_epi32(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi32(*p0, *q0);
+ *p1p0 = _mm_unpacklo_epi32(*p0, *p1);
+ *q1q0 = _mm_unpacklo_epi32(*q0, *q1);
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+ {
+ // filter_mask and hev_mask
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
+ abs_p0q0 = abs_diff(*p1p0, *q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
+ // SSE has no unsigned element comparison, so the idea is to find at
+ // least one case where X > limit, which means the corresponding mask
+ // bit is set.
+ // To achieve that we find the global max over all inputs of the form
+ // abs(x - y) or (abs(p0 - q0) * 2 + abs(p1 - q1) / 2). If it is > limit
+ // the mask is set, otherwise it is not.
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ // replicate for the further "merged variables" usage
+ hev = _mm_unpacklo_epi32(hev, hev);
- qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
- *q1q0 = _mm_and_si128(flat, flat_q0q1);
- *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_unpacklo_epi32(mask, zero);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
- ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
- *p1p0 = _mm_and_si128(flat, flat_p1p0);
- *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
+ work = abs_diff(q2p2, q1p1);
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
+ // flat_mask
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ // replicate for the further "merged variables" usage
+ flat = _mm_unpacklo_epi32(flat, flat);
+ flat = _mm_unpacklo_epi64(flat, flat);
+ }
+ // 5 tap filter
+ // only needed if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i workp_a, workp_b, workp_c;
+ __m128i pq0x2_pq1, pq1_pq2;
+ pq2_16 = _mm_unpacklo_epi8(q2p2, zero);
+ pq1_16 = _mm_unpacklo_epi8(q1p1, zero);
+ pq0_16 = _mm_unpacklo_epi8(q0p0, zero);
+ q0_16 = _mm_srli_si128(pq0_16, 8);
+ q2_16 = _mm_srli_si128(pq2_16, 8);
+ // op1
+ pq0x2_pq1 =
+ _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16); // p0 *2 + p1
+ pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16); // p1 + p2
+ workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
+ pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4
+ workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16);
+ workp_b =
+ _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
+ // op0
+ workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1
+ workp_a = _mm_add_epi16(workp_a,
+ workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+ workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
+ workp_b = _mm_srli_epi16(workp_b, 3);
+ flat_p1p0 = _mm_packus_epi16(workp_b, workp_b);
+ // oq0
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16),
+ pq1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4
+ workp_b = _mm_srli_si128(pq1_pq2, 8);
+ workp_a = _mm_add_epi16(
+ workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
+ // workp_shft0 = _mm_srli_epi16(workp_a, 3);
+ // oq1
+ workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16),
+ pq0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4
+ workp_b = _mm_add_epi16(q2_16, q2_16);
+ workp_b =
+ _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
+ workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
+ workp_a = _mm_srli_epi16(workp_a, 3);
+ flat_q0q1 = _mm_packus_epi16(workp_a, workp_a);
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0);
+ *q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0);
+ *p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
+ }
void aom_lpf_horizontal_6_sse2(unsigned char *s, int p,
@@ -941,9 +1554,9 @@ void aom_lpf_horizontal_6_sse2(unsigned char *s, int p,
&limit, &thresh);
xx_storel_32(s - 1 * p, p1p0);
- xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8));
+ xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
xx_storel_32(s + 0 * p, q1q0);
- xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8));
+ xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
@@ -970,8 +1583,8 @@ void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
- lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
- &limit, &thresh);
+ lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
+ &limit, &thresh);
_mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
_mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
@@ -982,15 +1595,168 @@ void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
static AOM_FORCE_INLINE void lpf_internal_8_sse2(
__m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
__m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
- __m128i *p2_out, __m128i *q2_out, __m128i *blimit, __m128i *limit,
- __m128i *thresh) {
+ __m128i *blimit, __m128i *limit, __m128i *thresh) {
const __m128i zero = _mm_setzero_si128();
__m128i mask, hev, flat;
__m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
flat_p1p0, flat_q0q1;
__m128i q2p2, q1p1, q0p0;
__m128i q1q0, p1p0, ps1ps0, qs1qs0;
- __m128i work_a, op2, oq2;
+ __m128i work_pq, opq2, pq2;
+ q3p3 = _mm_unpacklo_epi32(*p3, *q3);
+ q2p2 = _mm_unpacklo_epi32(*p2, *q2);
+ q1p1 = _mm_unpacklo_epi32(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi32(*p0, *q0);
+ p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); // p1p0 q1q0
+ q1q0 = _mm_srli_si128(p1p0, 8);
+ // filter_mask and hev_mask
+ // SSE has no unsigned element comparison, so the idea is to find at least
+ // one case where X > limit, which means the corresponding mask bit is
+ // set.
+ // To achieve that we find the global max value of all inputs of abs(x-y)
+ // or (abs(p0 - q0) * 2 + abs(p1 - q1) / 2). If it is > limit the mask is
+ // set, otherwise it is not.
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
+ abs_p0q0 = abs_diff(p1p0, q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ // replicate for the further "merged variables" usage
+ hev = _mm_unpacklo_epi32(hev, hev);
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_unpacklo_epi32(mask, zero);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+ // flat_mask4
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ // replicate for the further "merged variables" usage
+ flat = _mm_unpacklo_epi32(flat, flat);
+ flat = _mm_unpacklo_epi64(flat, flat);
+ // filter8 is needed only if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2;
+ p2_16 = _mm_unpacklo_epi8(*p2, zero);
+ p1_16 = _mm_unpacklo_epi8(*p1, zero);
+ p0_16 = _mm_unpacklo_epi8(*p0, zero);
+ q0_16 = _mm_unpacklo_epi8(*q0, zero);
+ q1_16 = _mm_unpacklo_epi8(*q1, zero);
+ q2_16 = _mm_unpacklo_epi8(*q2, zero);
+ p3_16 = _mm_unpacklo_epi8(*p3, zero);
+ q3_16 = _mm_unpacklo_epi8(*q3, zero);
+ // op2
+ workp_a =
+ _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
+ workp_shft2 = _mm_add_epi16(workp_a, workp_b);
+ // op1
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
+ workp_c = _mm_add_epi16(workp_a, workp_b);
+ // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ // op0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
+ workp_d = _mm_add_epi16(workp_a, workp_b);
+ // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ workp_c = _mm_unpacklo_epi64(workp_d, workp_c);
+ workp_c = _mm_srli_epi16(workp_c, 3);
+ flat_p1p0 = _mm_packus_epi16(workp_c, workp_c);
+ // oq0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
+ // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ workp_c = _mm_add_epi16(workp_a, workp_b);
+ // oq1
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
+ workp_d = _mm_add_epi16(workp_a, workp_b);
+ // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ workp_c = _mm_unpacklo_epi64(workp_c, workp_d);
+ workp_c = _mm_srli_epi16(workp_c, 3);
+ flat_q0q1 = _mm_packus_epi16(workp_c, workp_c);
+ // oq2
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
+ workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+ workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1);
+ workp_c = _mm_srli_epi16(workp_c, 3);
+ opq2 = _mm_packus_epi16(workp_c, workp_c);
+ work_pq = _mm_andnot_si128(flat, q2p2);
+ pq2 = _mm_and_si128(flat, opq2);
+ *p2 = _mm_or_si128(work_pq, pq2);
+ *q2 = _mm_srli_si128(*p2, 4);
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ }
+static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2(
+ __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+ __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+ __m128i *blimit, __m128i *limit, __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, hev, flat;
+ __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
+ flat_p1p0, flat_q0q1;
+ __m128i q2p2, q1p1, q0p0;
+ __m128i q1q0, p1p0, ps1ps0, qs1qs0;
+ __m128i work_pq, opq2, pq2;
q3p3 = _mm_unpacklo_epi64(*p3, *q3);
q2p2 = _mm_unpacklo_epi64(*p2, *q2);
@@ -1043,11 +1809,11 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2(
mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
mask = _mm_subs_epu8(mask, *limit);
mask = _mm_cmpeq_epi8(mask, zero);
- // replicate for the further "merged variables" usage
- mask = _mm_unpacklo_epi64(mask, mask);
- // flat_mask4
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+ // flat_mask4
flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
flat = _mm_max_epu8(abs_p1p0, flat);
@@ -1059,11 +1825,11 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2(
flat = _mm_unpacklo_epi64(flat, flat);
- // filter8
- {
+ // filter8 is needed only if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
const __m128i four = _mm_set1_epi16(4);
- __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+ __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2;
p2_16 = _mm_unpacklo_epi8(*p2, zero);
p1_16 = _mm_unpacklo_epi8(*p1, zero);
p0_16 = _mm_unpacklo_epi8(*p0, zero);
@@ -1078,8 +1844,7 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2(
_mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
- workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- op2 = _mm_packus_epi16(workp_shft0, workp_shft0);
+ workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
// op1
workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
@@ -1108,27 +1873,22 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2(
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- oq2 = _mm_packus_epi16(workp_shft1, workp_shft1);
- }
- // lp filter - the same for 6, 8 and 14 versions
- filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+ opq2 = _mm_packus_epi16(workp_shft2, workp_shft1);
- qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
- q1q0 = _mm_and_si128(flat, flat_q0q1);
- *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+ work_pq = _mm_andnot_si128(flat, q2p2);
+ pq2 = _mm_and_si128(flat, opq2);
+ *p2 = _mm_or_si128(work_pq, pq2);
+ *q2 = _mm_srli_si128(*p2, 8);
- ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
- p1p0 = _mm_and_si128(flat, flat_p1p0);
- *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
- work_a = _mm_andnot_si128(flat, *q2);
- q2_16 = _mm_and_si128(flat, oq2);
- *q2_out = _mm_or_si128(work_a, q2_16);
- work_a = _mm_andnot_si128(flat, *p2);
- p2_16 = _mm_and_si128(flat, op2);
- *p2_out = _mm_or_si128(work_a, p2_16);
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ }
void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
@@ -1136,7 +1896,7 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
const unsigned char *_limit,
const unsigned char *_thresh) {
__m128i p2, p1, p0, q0, q1, q2, p3, q3;
- __m128i q1q0, p1p0, p2_out, q2_out;
+ __m128i q1q0, p1p0;
__m128i blimit = _mm_load_si128((const __m128i *)_blimit);
__m128i limit = _mm_load_si128((const __m128i *)_limit);
__m128i thresh = _mm_load_si128((const __m128i *)_thresh);
@@ -1151,14 +1911,14 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
q3 = _mm_cvtsi32_si128(*(int *)(s + 3 * p));
lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
- &p2_out, &q2_out, &blimit, &limit, &thresh);
+ &blimit, &limit, &thresh);
xx_storel_32(s - 1 * p, p1p0);
- xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8));
+ xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
xx_storel_32(s + 0 * p, q1q0);
- xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8));
- xx_storel_32(s - 3 * p, p2_out);
- xx_storel_32(s + 2 * p, q2_out);
+ xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
+ xx_storel_32(s - 3 * p, p2);
+ xx_storel_32(s + 2 * p, q2);
void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
@@ -1196,8 +1956,8 @@ void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)),
_mm_loadl_epi64((__m128i *)(s + 6 * p)));
- lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
- &limit, &thresh);
+ lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
+ &blimit, &limit, &thresh);
_mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
_mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8));
@@ -1227,7 +1987,7 @@ void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
_mm_load_si128((__m128i *)_thresh1));
__m128i p2, p1, p0, q0, q1, q2, p3, q3;
- __m128i q1q0, p1p0, p2_out, q2_out;
+ __m128i q1q0, p1p0;
p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
@@ -1238,15 +1998,15 @@ void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
- lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
- &p2_out, &q2_out, &blimit, &limit, &thresh);
+ lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
+ &blimit, &limit, &thresh);
_mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
_mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
_mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
_mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
- _mm_storel_epi64((__m128i *)(s - 3 * p), p2_out);
- _mm_storel_epi64((__m128i *)(s + 2 * p), q2_out);
+ _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+ _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
@@ -1282,7 +2042,7 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
__m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
- lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
+ lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
_mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);
_mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8));
@@ -1331,7 +2091,7 @@ void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0,
- lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
+ lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
p1 = _mm_srli_si128(ps1ps0, 8);
q1 = _mm_srli_si128(qs1qs0, 8);
@@ -1372,8 +2132,8 @@ void aom_lpf_vertical_6_sse2(unsigned char *s, int p,
lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit,
&limit, &thresh);
- p0 = _mm_srli_si128(p1p0, 8);
- q0 = _mm_srli_si128(q1q0, 8);
+ p0 = _mm_srli_si128(p1p0, 4);
+ q0 = _mm_srli_si128(q1q0, 4);
transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
@@ -1419,8 +2179,8 @@ void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
d5 = _mm_srli_si128(d4d5, 8);
d7 = _mm_srli_si128(d6d7, 8);
- lpf_internal_6_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0, &blimit,
- &limit, &thresh);
+ lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0,
+ &blimit, &limit, &thresh);
p0 = _mm_srli_si128(p1p0, 8);
q0 = _mm_srli_si128(q1q0, 8);
@@ -1444,7 +2204,7 @@ void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
const unsigned char *_thresh) {
__m128i d0, d1, d2, d3, d4, d5, d6, d7;
- __m128i p2, p0, q0, q2;
+ __m128i p0, q0;
__m128i x2, x1, x0, x3;
__m128i q1q0, p1p0;
__m128i blimit = _mm_load_si128((const __m128i *)_blimit);
@@ -1459,13 +2219,13 @@ void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
// Loop filtering
- lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, &p2,
- &q2, &blimit, &limit, &thresh);
+ lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0,
+ &blimit, &limit, &thresh);
- p0 = _mm_srli_si128(p1p0, 8);
- q0 = _mm_srli_si128(q1q0, 8);
+ p0 = _mm_srli_si128(p1p0, 4);
+ q0 = _mm_srli_si128(q1q0, 4);
- transpose8x8_low_sse2(&d0, &p2, &p0, &p1p0, &q1q0, &q0, &q2, &d7, &d0, &d1,
+ transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1,
&d2, &d3);
_mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0);
@@ -1490,7 +2250,7 @@ void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
__m128i x0, x1, x2, x3, x4, x5, x6, x7;
__m128i d1, d3, d5, d7;
__m128i q1q0, p1p0;
- __m128i p2, p1, q1, q2;
+ __m128i p1, q1;
__m128i d0d1, d2d3, d4d5, d6d7;
x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p));
@@ -1510,14 +2270,14 @@ void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
d5 = _mm_srli_si128(d4d5, 8);
d7 = _mm_srli_si128(d6d7, 8);
- lpf_internal_8_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5, &q1q0,
- &p1p0, &p2, &q2, &blimit, &limit, &thresh);
+ lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5,
+ &q1q0, &p1p0, &blimit, &limit, &thresh);
p1 = _mm_srli_si128(p1p0, 8);
q1 = _mm_srli_si128(q1q0, 8);
- transpose8x8_sse2(&d0d1, &p2, &p1, &p1p0, &q1q0, &q1, &q2, &d7, &d0d1, &d2d3,
- &d4d5, &d6d7);
+ transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1,
+ &d2d3, &d4d5, &d6d7);
_mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1);
_mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8));
@@ -1533,65 +2293,30 @@ void aom_lpf_vertical_14_sse2(unsigned char *s, int p,
const unsigned char *_blimit,
const unsigned char *_limit,
const unsigned char *_thresh) {
- __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
- __m128i x6, x5, x4, x3, x2, x1, x0;
- __m128i p0, p1, p2, p3, p4, p5, p6, p7;
- __m128i q0, q1, q2, q3, q4, q5, q6, q7;
- __m128i p0_out, p1_out, p2_out, p3_out;
+ __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+ __m128i x6, x5, x4, x3;
+ __m128i pq0, pq1, pq2, pq3;
__m128i blimit = _mm_load_si128((__m128i *)_blimit);
__m128i limit = _mm_load_si128((__m128i *)_limit);
__m128i thresh = _mm_load_si128((__m128i *)_thresh);
- x6 = _mm_loadl_epi64((__m128i *)((s - 8) + 0 * p));
- x5 = _mm_loadl_epi64((__m128i *)((s - 8) + 1 * p));
- x4 = _mm_loadl_epi64((__m128i *)((s - 8) + 2 * p));
- x3 = _mm_loadl_epi64((__m128i *)((s - 8) + 3 * p));
- transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &p0, &p1, &p2, &p3, &p4, &p5, &p6,
- &p7);
+ x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
+ x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
+ x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
+ x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
- x6 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
- x5 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
- x4 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
- x3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
- transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &q0, &q1, &q2, &q3, &q4, &q5, &q6,
- &q7);
- q6p6 = _mm_unpacklo_epi64(p1, q6);
- q5p5 = _mm_unpacklo_epi64(p2, q5);
- q4p4 = _mm_unpacklo_epi64(p3, q4);
- q3p3 = _mm_unpacklo_epi64(p4, q3);
- q2p2 = _mm_unpacklo_epi64(p5, q2);
- q1p1 = _mm_unpacklo_epi64(p6, q1);
- q0p0 = _mm_unpacklo_epi64(p7, q0);
+ transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4,
+ &q5p5, &q6p6, &q7p7);
lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
&limit, &thresh);
- transpose8x8_low_sse2(&p0, &p1, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
- &p0_out, &p1_out, &p2_out, &p3_out);
- x0 = _mm_srli_si128(q0p0, 8);
- x1 = _mm_srli_si128(q1p1, 8);
- x2 = _mm_srli_si128(q2p2, 8);
- x3 = _mm_srli_si128(q3p3, 8);
- x4 = _mm_srli_si128(q4p4, 8);
- x5 = _mm_srli_si128(q5p5, 8);
- x6 = _mm_srli_si128(q6p6, 8);
- transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &q0, &q1, &q2,
- &q3);
- _mm_storel_epi64((__m128i *)(s - 8 + 0 * p), p0_out);
- _mm_storel_epi64((__m128i *)(s - 8 + 1 * p), p1_out);
- _mm_storel_epi64((__m128i *)(s - 8 + 2 * p), p2_out);
- _mm_storel_epi64((__m128i *)(s - 8 + 3 * p), p3_out);
- _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
- _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
- _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
- _mm_storel_epi64((__m128i *)(s + 3 * p), q3);
+ transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
+ &q0p0, &pq0, &pq1, &pq2, &pq3);
+ _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0);
+ _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1);
+ _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2);
+ _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3);
void aom_lpf_vertical_14_dual_sse2(
@@ -1634,8 +2359,8 @@ void aom_lpf_vertical_14_dual_sse2(
q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8));
q7 = _mm_srli_si128(d14d15, 8);
- lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
- &limit, &thresh);
+ lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
+ &blimit, &limit, &thresh);
x0 = _mm_srli_si128(q0p0, 8);
x1 = _mm_srli_si128(q1p1, 8);
diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
index c6b6469b4..8970fe7dd 100644
--- a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
+++ b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at
-#ifndef _AOM_DSP_X86_LPF_COMMON_X86_H
-#define _AOM_DSP_X86_LPF_COMMON_X86_H
#include <emmintrin.h> // SSE2
@@ -212,4 +212,4 @@ static INLINE void highbd_transpose8x16_sse2(
d4 + 1, d5 + 1, d6 + 1, d7 + 1);
-#endif // _AOM_DSP_X86_LPF_COMMON_X86_H
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
index 6538e4d5e..584b5e7e3 100644
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
@@ -9,7 +9,6 @@
* PATENTS file, you can obtain it at
-#include <stdio.h>
#include <tmmintrin.h>
#include "config/aom_config.h"
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
index 19b429d91..cffbd9672 100644
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at
unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
const uint8_t *a_ptr, int a_stride,
@@ -30,4 +30,4 @@ unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
const uint8_t *m_ptr, int m_stride,
int height);
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
index dc41a8342..4faa098ac 100644
--- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
+++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at
#include <stdlib.h>
#include <string.h>
@@ -89,4 +89,4 @@ static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height,
} while (i < height);
diff --git a/third_party/aom/aom_dsp/x86/mem_sse2.h b/third_party/aom/aom_dsp/x86/mem_sse2.h
index 8b69606dd..6c821673e 100644
--- a/third_party/aom/aom_dsp/x86/mem_sse2.h
+++ b/third_party/aom/aom_dsp/x86/mem_sse2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at
-#ifndef AOM_DSP_X86_MEM_SSE2_H_
-#define AOM_DSP_X86_MEM_SSE2_H_
+#ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_
+#define AOM_AOM_DSP_X86_MEM_SSE2_H_
#include <emmintrin.h> // SSE2
@@ -39,4 +39,4 @@ static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
return dst;
-#endif // AOM_DSP_X86_MEM_SSE2_H_
+#endif // AOM_AOM_DSP_X86_MEM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
new file mode 100644
index 000000000..5181e444c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
@@ -0,0 +1,58 @@
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <smmintrin.h>
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int h) {
+ const int pre_step = pre_stride - 4;
+ int n = 0;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+ assert(IS_POWER_OF_TWO(h));
+ do {
+ const __m128i v_p_b = _mm_cvtsi32_si128(*(const uint32_t *)(pre + n));
+ const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n));
+ const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n));
+ const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
+ const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+ n += 4;
+ if (n % 4 == 0) pre += pre_step;
+ } while (n < 4 * h);
+ *sum = xx_hsum_epi32_si32(v_sum_d);
+ *sse = xx_hsum_epi32_si32(v_sse_d);
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
index a3535f985..48486c6c4 100644
--- a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
+++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at
#include <immintrin.h>
@@ -42,4 +42,13 @@ static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) {
return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
+// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
+static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
+ const __m128i v_tmp_d =
+ _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
+ return _mm_srai_epi32(v_tmp_d, bits);
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
new file mode 100644
index 000000000..bfec0e8a8
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
@@ -0,0 +1,190 @@
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <immintrin.h>
+#include "config/aom_config.h"
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
+// 8 bit
+static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int w, const int h) {
+ int n = 0, width, height = h;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+ __m128i v_d;
+ const uint8_t *pre_temp;
+ assert(w >= 8);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+ do {
+ width = w;
+ pre_temp = pre;
+ do {
+ const __m128i v_p_b = _mm_loadl_epi64((const __m128i *)pre_temp);
+ const __m256i v_m_d = _mm256_loadu_si256((__m256i const *)(mask + n));
+ const __m256i v_w_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
+ const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p0_d, v_m_d);
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w_d, v_pm_d);
+ const __m256i v_sign_d = _mm256_srai_epi32(v_diff0_d, 31);
+ const __m256i v_tmp_d =
+ _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign_d);
+ const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp_d, 12);
+ const __m128i v_rdiff_d = _mm256_castsi256_si128(v_rdiff0_d);
+ const __m128i v_rdiff1_d = _mm256_extracti128_si256(v_rdiff0_d, 1);
+ const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff_d, v_rdiff1_d);
+ const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+ pre_temp += 8;
+ n += 8;
+ width -= 8;
+ } while (width > 0);
+ pre += pre_stride;
+ height -= 1;
+ } while (height > 0);
+ v_d = _mm_hadd_epi32(v_sum_d, v_sse_d);
+ v_d = _mm_hadd_epi32(v_d, v_d);
+ *sum = _mm_cvtsi128_si32(v_d);
+ *sse = _mm_cvtsi128_si32(_mm_srli_si128(v_d, 4));
+static INLINE void obmc_variance_w16n(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int w, const int h) {
+ int n = 0, width, height = h;
+ __m256i v_d;
+ __m128i res0;
+ const uint8_t *pre_temp;
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+ __m256i v_sum_d = _mm256_setzero_si256();
+ __m256i v_sse_d = _mm256_setzero_si256();
+ assert(w >= 16);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+ do {
+ width = w;
+ pre_temp = pre;
+ do {
+ const __m128i v_p_b = _mm_loadu_si128((__m128i *)pre_temp);
+ const __m256i v_m0_d = _mm256_loadu_si256((__m256i const *)(mask + n));
+ const __m256i v_w0_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
+ const __m256i v_m1_d =
+ _mm256_loadu_si256((__m256i const *)(mask + n + 8));
+ const __m256i v_w1_d =
+ _mm256_loadu_si256((__m256i const *)(wsrc + n + 8));
+ const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
+ const __m256i v_p1_d = _mm256_cvtepu8_epi32(_mm_srli_si128(v_p_b, 8));
+ const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+ const __m256i v_pm1_d = _mm256_madd_epi16(v_p1_d, v_m1_d);
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+ const __m256i v_diff1_d = _mm256_sub_epi32(v_w1_d, v_pm1_d);
+ const __m256i v_sign0_d = _mm256_srai_epi32(v_diff0_d, 31);
+ const __m256i v_sign1_d = _mm256_srai_epi32(v_diff1_d, 31);
+ const __m256i v_tmp0_d =
+ _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign0_d);
+ const __m256i v_tmp1_d =
+ _mm256_add_epi32(_mm256_add_epi32(v_diff1_d, v_bias_d), v_sign1_d);
+ const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp0_d, 12);
+ const __m256i v_rdiff2_d = _mm256_srai_epi32(v_tmp1_d, 12);
+ const __m256i v_rdiff1_d = _mm256_add_epi32(v_rdiff0_d, v_rdiff2_d);
+ const __m256i v_rdiff01_w = _mm256_packs_epi32(v_rdiff0_d, v_rdiff2_d);
+ const __m256i v_sqrdiff_d = _mm256_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+ v_sum_d = _mm256_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff_d);
+ pre_temp += 16;
+ n += 16;
+ width -= 16;
+ } while (width > 0);
+ pre += pre_stride;
+ height -= 1;
+ } while (height > 0);
+ v_d = _mm256_hadd_epi32(v_sum_d, v_sse_d);
+ v_d = _mm256_hadd_epi32(v_d, v_d);
+ res0 = _mm256_castsi256_si128(v_d);
+ res0 = _mm_add_epi32(res0, _mm256_extractf128_si256(v_d, 1));
+ *sum = _mm_cvtsi128_si32(res0);
+ *sse = _mm_cvtsi128_si32(_mm_srli_si128(res0, 4));
+#define OBMCVARWXH(W, H) \
+ unsigned int aom_obmc_variance##W##x##H##_avx2( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ if (W == 4) { \
+ obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \
+ } else if (W == 8) { \
+ obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
+ } else { \
+ obmc_variance_w16n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
+ } \
+ \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ }
+OBMCVARWXH(128, 128)
+OBMCVARWXH(128, 64)
+OBMCVARWXH(64, 128)
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
index 2e2f6e09f..72eda0e57 100644
--- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
@@ -19,7 +19,7 @@
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
#include "aom_dsp/x86/synonyms.h"
@@ -36,45 +36,6 @@ void aom_var_filter_block2d_bil_second_pass_ssse3(
unsigned int pixel_step, unsigned int output_height,
unsigned int output_width, const uint8_t *filter);
-static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
- const int32_t *wsrc, const int32_t *mask,
- unsigned int *const sse, int *const sum,
- const int h) {
- const int pre_step = pre_stride - 4;
- int n = 0;
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_d = _mm_setzero_si128();
- assert(IS_POWER_OF_TWO(h));
- do {
- const __m128i v_p_b = xx_loadl_32(pre + n);
- const __m128i v_m_d = xx_load_128(mask + n);
- const __m128i v_w_d = xx_load_128(wsrc + n);
- const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
- // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
- // boundaries. We use pmaddwd, as it has lower latency on Haswell
- // than pmulld but produces the same result with these inputs.
- const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
- const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
- const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
- const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
- v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
- v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
- n += 4;
- if (n % 4 == 0) pre += pre_step;
- } while (n < 4 * h);
- *sum = xx_hsum_epi32_si32(v_sum_d);
- *sse = xx_hsum_epi32_si32(v_sse_d);
static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
const int32_t *wsrc, const int32_t *mask,
unsigned int *const sse, int *const sum,
diff --git a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
index e6b40262d..216a0bd8f 100644
--- a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
+++ b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
@@ -16,16 +16,12 @@
%macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
shift, qcoeff, dqcoeff, dequant, \
eob, scan, iscan
- ; If we can skip this block, then just zero the output
- cmp skipmp, 0
- jne .blank
%ifnidn %1, b_32x32
; Special case for ncoeff == 16, as it is frequent and we can save on
@@ -83,14 +79,14 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
; Actual quantization of size 16 block - setup pointers, rounders, etc.
- movifnidn r4, roundmp
- movifnidn r5, quantmp
- mov r3, dequantmp
- mov r6, shiftmp
- mova m1, [r4] ; m1 = round
- mova m2, [r5] ; m2 = quant
- mova m3, [r3] ; m3 = dequant
- mova m4, [r6] ; m4 = shift
+ movifnidn r3, roundmp
+ movifnidn r4, quantmp
+ mov r6, dequantmp
+ mov r5, shiftmp
+ mova m1, [r3] ; m1 = round
+ mova m2, [r4] ; m2 = quant
+ mova m3, [r6] ; m3 = dequant
+ mova m4, [r5] ; m4 = shift
mov r3, iscanmp
@@ -174,20 +170,20 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
%endif ; %ifnidn %1, b_32x32
-DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
+DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \
qcoeff, dqcoeff, dequant, eob, scan, iscan
; Actual quantization loop - setup pointers, rounders, etc.
movifnidn coeffq, coeffmp
movifnidn ncoeffq, ncoeffmp
- mov r2, dequantmp
movifnidn zbinq, zbinmp
movifnidn roundq, roundmp
movifnidn quantq, quantmp
+ movifnidn dequantq, dequantmp
mova m0, [zbinq] ; m0 = zbin
mova m1, [roundq] ; m1 = round
mova m2, [quantq] ; m2 = quant
- mova m3, [r2] ; m3 = dequant
+ mova m3, [dequantq] ; m3 = dequant
pcmpeqw m4, m4 ; All lanes -1
%ifidn %1, b_32x32
psubw m0, m4
@@ -199,7 +195,7 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
mov r2, shiftmp
mov r3, qcoeffmp
- mova m4, [r2] ; m4 = shift
+ mova m4, [r2] ; m4 = shift
mov r4, dqcoeffmp
mov r5, iscanmp
%ifidn %1, b_32x32
@@ -207,7 +203,7 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
pxor m5, m5 ; m5 = dedicated zero
- DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
+ DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
lea coeffq, [ coeffq+ncoeffq*4]
@@ -432,39 +428,8 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
mov [r2], ax
- ; Skip-block, i.e. just write all zeroes
-DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
- qcoeff, dqcoeff, dequant, eob, scan, iscan
- mov r0, dqcoeffmp
- movifnidn ncoeffq, ncoeffmp
- mov r2, qcoeffmp
- mov r3, eobmp
-DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
- lea dqcoeffq, [dqcoeffq+ncoeffq*4]
- lea qcoeffq, [ qcoeffq+ncoeffq*4]
- neg ncoeffq
- pxor m7, m7
- mova [dqcoeffq+ncoeffq*4+ 0], ymm7
- mova [dqcoeffq+ncoeffq*4+32], ymm7
- mova [qcoeffq+ncoeffq*4+ 0], ymm7
- mova [qcoeffq+ncoeffq*4+32], ymm7
- add ncoeffq, mmsize
- jl .blank_loop
- mov [eobq], word 0
- vzeroupper
-QUANTIZE_FN b_32x32, 7
+QUANTIZE_FN b_32x32, 9
diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c
index 46b9c7d29..d3de6e24d 100644
--- a/third_party/aom/aom_dsp/x86/quantize_sse2.c
+++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c
@@ -9,242 +9,139 @@
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#include <assert.h>
#include <emmintrin.h>
#include <xmmintrin.h>
#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {  // Packs 8 coefficients into one register.
- if (sizeof(tran_low_t) == 4) {
- return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
- (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
- (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
- (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
- } else {
- return _mm_load_si128((const __m128i *)coeff_ptr);
- }
+ assert(sizeof(tran_low_t) == 4);  // The 16-bit tran_low_t path was dropped; only 4-byte elements are handled.
+ return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],  // Each value is truncated to int16_t.
+ (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
+ (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
+ (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
static INLINE void store_coefficients(__m128i coeff_vals,  // Writes 8 16-bit lanes as 8 sign-extended 32-bit values.
 tran_low_t *coeff_ptr) {
- if (sizeof(tran_low_t) == 4) {
- __m128i one = _mm_set1_epi16(1);
- __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
- __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
- __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
- __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
- _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
- _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
- } else {
- _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals);
- }
+ assert(sizeof(tran_low_t) == 4);  // Only the 4-byte tran_low_t layout is supported.
+ __m128i one = _mm_set1_epi16(1);
+ __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);  // High half of x * 1: the sign-extension word.
+ __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);  // Low half of x * 1: the value itself.
+ __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);  // Lanes 0-3 as (lo, hi) pairs.
+ __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);  // Lanes 4-7 as (lo, hi) pairs.
+ _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);  // Aligned stores: coeff_ptr must be 16-byte aligned.
+ _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,  // skip_block is no longer a parameter.
+ const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan_ptr,
const int16_t *iscan_ptr) {
- __m128i zero;
+ const __m128i zero = _mm_setzero_si128();
+ int index = 16;  // The first 16 coefficients are handled before the loop.
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i eob, eob0;
- coeff_ptr += n_coeffs;
- iscan_ptr += n_coeffs;
- qcoeff_ptr += n_coeffs;
- dqcoeff_ptr += n_coeffs;
- n_coeffs = -n_coeffs;
- zero = _mm_setzero_si128();
- if (!skip_block) {
- __m128i eob;
- __m128i zbin;
- __m128i round, quant, dequant, shift;
- {
- __m128i coeff0, coeff1;
- // Setup global values
- {
- __m128i pw_1;
- zbin = _mm_load_si128((const __m128i *)zbin_ptr);
- round = _mm_load_si128((const __m128i *)round_ptr);
- quant = _mm_load_si128((const __m128i *)quant_ptr);
- pw_1 = _mm_set1_epi16(1);
- zbin = _mm_sub_epi16(zbin, pw_1);
- dequant = _mm_load_si128((const __m128i *)dequant_ptr);
- shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
- }
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
- __m128i cmp_mask0, cmp_mask1;
- // Do DC and first 15 AC
- coeff0 = load_coefficients(coeff_ptr + n_coeffs);
- coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
- zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
- cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- round = _mm_unpackhi_epi64(round, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- quant = _mm_unpackhi_epi64(quant, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
- qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
- qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
- qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
- shift = _mm_unpackhi_epi64(shift, shift);
- qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- // Mask out zbin threshold coeffs
- qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
- qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
- store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
- store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- dequant = _mm_unpackhi_epi64(dequant, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
- store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
- store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
- }
- {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob = _mm_max_epi16(eob, eob1);
- }
- n_coeffs += 8 * 2;
- }
- // AC only loop
- while (n_coeffs < 0) {
- __m128i coeff0, coeff1;
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
- __m128i cmp_mask0, cmp_mask1;
- coeff0 = load_coefficients(coeff_ptr + n_coeffs);
- coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
- cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
- qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
- qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
- qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
- qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- // Mask out zbin threshold coeffs
- qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
- qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
- store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
- store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
- store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
- store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
- }
- {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob0, eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob0 = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob0 = _mm_max_epi16(eob0, eob1);
- eob = _mm_max_epi16(eob, eob0);
- }
- n_coeffs += 8 * 2;
- }
- // Accumulate EOB
- {
- __m128i eob_shuffled;
- eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
- eob = _mm_max_epi16(eob, eob_shuffled);
- *eob_ptr = _mm_extract_epi16(eob, 1);
- }
- } else {
- do {
- store_coefficients(zero, dqcoeff_ptr + n_coeffs);
- store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8);
- store_coefficients(zero, qcoeff_ptr + n_coeffs);
- store_coefficients(zero, qcoeff_ptr + n_coeffs + 8);
- n_coeffs += 8 * 2;
- } while (n_coeffs < 0);
- *eob_ptr = 0;
+ // Setup global values.
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+ // Poor man's abs().
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);  // All-ones for negative lanes, zero otherwise.
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ round = _mm_unpackhi_epi64(round, round);  // Broadcast the upper (AC) half for all later lanes.
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+ // Reinsert signs
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+ store_coefficients(coeff0, dqcoeff_ptr);
+ store_coefficients(coeff1, dqcoeff_ptr + 8);
+ eob =
+ scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
+ // AC only loop.
+ while (index < n_coeffs) {  // Remaining coefficients, 16 per iteration.
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+ store_coefficients(coeff0, dqcoeff_ptr + index);
+ store_coefficients(coeff1, dqcoeff_ptr + index + 8);
+ eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
+ index, zero);
+ eob = _mm_max_epi16(eob, eob0);  // Keep the running per-lane maximum.
+ index += 16;
+ *eob_ptr = accumulate_eob(eob);  // Reduce the eight lanes to a single end-of-block value.
diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
index e2c1ebb71..39d4ca674 100644
--- a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
+++ b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
@@ -18,21 +18,18 @@ pw_1: times 8 dw 1
-; TODO(yunqingwang)fix quantize_b code for skip=1 case.
%macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
shift, qcoeff, dqcoeff, dequant, \
eob, scan, iscan
- cmp dword skipm, 0
- jne .blank
; actual quantize loop - setup pointers, rounders, etc.
movifnidn coeffq, coeffmp
movifnidn ncoeffq, ncoeffmp
- mov r2, dequantmp
movifnidn zbinq, zbinmp
movifnidn roundq, roundmp
movifnidn quantq, quantmp
+ movifnidn dequantq, dequantmp
mova m0, [zbinq] ; m0 = zbin
mova m1, [roundq] ; m1 = round
mova m2, [quantq] ; m2 = quant
@@ -44,18 +41,18 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
psrlw m0, 1 ; m0 = (m0 + 1) / 2
psrlw m1, 1 ; m1 = (m1 + 1) / 2
- mova m3, [r2q] ; m3 = dequant
- psubw m0, [GLOBAL(pw_1)]
+ mova m3, [dequantq] ; m3 = dequant
mov r2, shiftmp
- mov r3, qcoeffmp
+ psubw m0, [GLOBAL(pw_1)]
mova m4, [r2] ; m4 = shift
+ mov r3, qcoeffmp
mov r4, dqcoeffmp
mov r5, iscanmp
%ifidn %1, b_32x32
psllw m4, 1
pxor m5, m5 ; m5 = dedicated zero
- DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
+ DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
lea coeffq, [ coeffq+ncoeffq*4]
lea qcoeffq, [ qcoeffq+ncoeffq*4]
lea dqcoeffq, [dqcoeffq+ncoeffq*4]
@@ -268,33 +265,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
pextrw r6, m8, 0
mov [r2], r6
- ; skip-block, i.e. just write all zeroes
- mov r0, dqcoeffmp
- movifnidn ncoeffq, ncoeffmp
- mov r2, qcoeffmp
- mov r3, eobmp
- DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
- lea dqcoeffq, [dqcoeffq+ncoeffq*4]
- lea qcoeffq, [ qcoeffq+ncoeffq*4]
- neg ncoeffq
- pxor m7, m7
- mova [dqcoeffq+ncoeffq*4+ 0], m7
- mova [dqcoeffq+ncoeffq*4+16], m7
- mova [dqcoeffq+ncoeffq*4+32], m7
- mova [dqcoeffq+ncoeffq*4+48], m7
- mova [qcoeffq+ncoeffq*4+ 0], m7
- mova [qcoeffq+ncoeffq*4+16], m7
- mova [qcoeffq+ncoeffq*4+32], m7
- mova [qcoeffq+ncoeffq*4+48], m7
- add ncoeffq, mmsize
- jl .blank_loop
- mov word [eobq], 0
INIT_XMM ssse3
-QUANTIZE_FN b_32x32, 7
+QUANTIZE_FN b_32x32, 9
diff --git a/third_party/aom/aom_dsp/x86/quantize_x86.h b/third_party/aom/aom_dsp/x86/quantize_x86.h
new file mode 100644
index 000000000..4eed7dd29
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_x86.h
@@ -0,0 +1,77 @@
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <emmintrin.h>
+#include "aom/aom_integer.h"
+static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,  // Loads the five quantizer tables, 8 x int16 each.
+ const int16_t *round_ptr, __m128i *round,
+ const int16_t *quant_ptr, __m128i *quant,
+ const int16_t *dequant_ptr, __m128i *dequant,
+ const int16_t *shift_ptr, __m128i *shift) {
+ *zbin = _mm_load_si128((const __m128i *)zbin_ptr);  // Aligned loads: every table pointer must be 16-byte aligned.
+ *round = _mm_load_si128((const __m128i *)round_ptr);
+ *quant = _mm_load_si128((const __m128i *)quant_ptr);
+ *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1));  // Stores zbin - 1, so a signed "> zbin" compare behaves as ">= original zbin".
+ *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ *shift = _mm_load_si128((const __m128i *)shift_ptr);
+// With ssse3 and later abs() and sign() are preferred. Negates the 16-bit lanes of 'a' where 'sign' is all-ones.
+static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
+ a = _mm_xor_si128(a, sign);  // Bitwise NOT where sign == -1, unchanged where sign == 0.
+ return _mm_sub_epi16(a, sign);  // Subtracting -1 adds 1, completing the two's-complement negation.
+static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round,  // Quantizes eight 16-bit lanes of *coeff in place.
+ const __m128i quant, const __m128i shift) {
+ __m128i tmp, qcoeff;
+ qcoeff = _mm_adds_epi16(*coeff, round);  // Saturating signed add of the rounding term.
+ tmp = _mm_mulhi_epi16(qcoeff, quant);  // High 16 bits of the signed product, i.e. (x * quant) >> 16.
+ qcoeff = _mm_add_epi16(tmp, qcoeff);  // Wrapping add of x back onto the scaled value.
+ *coeff = _mm_mulhi_epi16(qcoeff, shift);  // Final scale: (value * shift) >> 16.
+static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) {  // Lane-wise dequantization of eight 16-bit values.
+ return _mm_mullo_epi16(qcoeff, dequant);  // Keeps only the low 16 bits of each product.
+// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing
+// to zbin to add 1 to the index in 'scan'. Returns eight lane-wise candidates; the caller reduces them.
+static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
+ const __m128i zbin_mask0,
+ const __m128i zbin_mask1,
+ const int16_t *scan_ptr, const int index,
+ const __m128i zero) {
+ const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero);  // All-ones in lanes whose coefficient is zero.
+ const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero);
+ __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index));  // Aligned load: scan_ptr + index must be 16-byte aligned.
+ __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8));
+ __m128i eob0, eob1;
+ // Add one to convert from indices to counts
+ scan0 = _mm_sub_epi16(scan0, zbin_mask0);  // Mask lanes are -1, so subtracting adds 1 there.
+ scan1 = _mm_sub_epi16(scan1, zbin_mask1);
+ eob0 = _mm_andnot_si128(zero_coeff0, scan0);  // Clear lanes whose coefficient is zero.
+ eob1 = _mm_andnot_si128(zero_coeff1, scan1);
+ return _mm_max_epi16(eob0, eob1);  // Signed lane-wise maximum of the two halves.
+static INLINE int16_t accumulate_eob(__m128i eob) {  // Horizontal signed maximum over the eight 16-bit lanes.
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);  // Bring the upper 64 bits down to the lower 64 bits.
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);  // Bring words 2-3 down to words 0-1.
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);  // Swap words 0 and 1.
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ return _mm_extract_epi16(eob, 1);  // Words 0 and 1 both hold the overall maximum at this point.
diff --git a/third_party/aom/aom_dsp/x86/sse_avx2.c b/third_party/aom/aom_dsp/x86/sse_avx2.c
new file mode 100644
index 000000000..305dde5c0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sse_avx2.c
@@ -0,0 +1,250 @@
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <smmintrin.h>
+#include <immintrin.h>
+#include "config/aom_dsp_rtcd.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,  // Adds the squared differences of 32 bytes of a and b into *sum.
+ const uint8_t *b) {
+ const __m256i v_a0 = yy_loadu_256(a);  // Presumably an unaligned 32-byte load (helper defined elsewhere) - confirm.
+ const __m256i v_b0 = yy_loadu_256(b);
+ const __m256i v_a00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_a0));  // Low 16 bytes zero-extended to 16 bits.
+ const __m256i v_a01_w =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_a0, 1));  // High 16 bytes zero-extended to 16 bits.
+ const __m256i v_b00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_b0));
+ const __m256i v_b01_w =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_b0, 1));
+ const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);  // Signed differences in [-255, 255].
+ const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));  // madd squares each lane and adds adjacent pairs into 32-bit lanes.
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));
+static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {  // Sums the eight 32-bit lanes of *sum_all, treated as unsigned, into 64 bits.
+ int64_t sum;
+ const __m256i sum0_4x64 =
+ _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum_all));  // Lanes 0-3 zero-extended to 64 bits.
+ const __m256i sum1_4x64 =
+ _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum_all, 1));  // Lanes 4-7 zero-extended to 64 bits.
+ const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+ const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+ _mm256_extracti128_si256(sum_4x64, 1));
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));  // Add the upper 64-bit half onto the lower.
+ xx_storel_64(&sum, sum_1x64);  // Presumably stores the low 64 bits to &sum (helper defined elsewhere) - confirm.
+ return sum;
+int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,  // Sum of squared differences between two 8-bit blocks.
+ int b_stride, int width, int height) {
+ int32_t y = 0;
+ int64_t sse = 0;  // Returned unchanged (0) when width matches no case below.
+ __m256i sum = _mm256_setzero_si256();  // 32-bit lane accumulators, widened to 64 bits only after each loop.
+ switch (width) {
+ case 4:
+ do {  // Four rows per iteration; do/while always runs once, so this reads 4 rows even if height < 4.
+ const __m128i v_a0 = xx_loadl_32(a);
+ const __m128i v_a1 = xx_loadl_32(a + a_stride);
+ const __m128i v_a2 = xx_loadl_32(a + a_stride * 2);
+ const __m128i v_a3 = xx_loadl_32(a + a_stride * 3);
+ const __m128i v_b0 = xx_loadl_32(b);
+ const __m128i v_b1 = xx_loadl_32(b + b_stride);
+ const __m128i v_b2 = xx_loadl_32(b + b_stride * 2);
+ const __m128i v_b3 = xx_loadl_32(b + b_stride * 3);
+ const __m128i v_a0123 = _mm_unpacklo_epi64(  // Pack the four 4-byte rows into one 16-byte vector.
+ _mm_unpacklo_epi32(v_a0, v_a1), _mm_unpacklo_epi32(v_a2, v_a3));
+ const __m128i v_b0123 = _mm_unpacklo_epi64(
+ _mm_unpacklo_epi32(v_b0, v_b1), _mm_unpacklo_epi32(v_b2, v_b3));
+ const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123);
+ const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 8:
+ do {  // Two rows per iteration.
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_a1 = xx_loadl_64(a + a_stride);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_b1 = xx_loadl_64(b + b_stride);
+ const __m256i v_a_w =
+ _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1));
+ const __m256i v_b_w =
+ _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1));
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 16:
+ do {  // One 16-byte row per iteration.
+ const __m128i v_a0 = xx_loadu_128(a);
+ const __m128i v_b0 = xx_loadu_128(b);
+ const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0);
+ const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 32:
+ do {  // Widths 32/64/128: one row per iteration, 32 bytes per helper call.
+ sse_w32_avx2(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 64:
+ do {
+ sse_w32_avx2(&sum, a, b);
+ sse_w32_avx2(&sum, a + 32, b + 32);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 128:
+ do {
+ sse_w32_avx2(&sum, a, b);
+ sse_w32_avx2(&sum, a + 32, b + 32);
+ sse_w32_avx2(&sum, a + 64, b + 64);
+ sse_w32_avx2(&sum, a + 96, b + 96);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ default: break;  // Unsupported width: falls through to return 0.
+ }
+ return sse;
+static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a,  // Adds the squared differences of 16 16-bit samples into *sum.
+ const uint16_t *b) {
+ const __m256i v_a_w = yy_loadu_256(a);  // Presumably an unaligned 32-byte load (helper defined elsewhere) - confirm.
+ const __m256i v_b_w = yy_loadu_256(b);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);  // Wrapping 16-bit subtract; NOTE(review): assumes samples are narrow enough that the difference fits in int16 - confirm.
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));  // Squares each lane and adds adjacent pairs into 32-bit lanes.
+int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8,  // Sum of squared differences between two 16-bit-sample blocks.
+ int b_stride, int width, int height) {
+ int32_t y = 0;
+ int64_t sse = 0;  // Returned unchanged (0) when width matches no case below.
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);  // Project macro that recovers the uint16_t sample pointer from the uint8_t handle.
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ __m256i sum = _mm256_setzero_si256();  // NOTE(review): 32-bit lanes accumulate over the whole block; with 12-bit samples a 64x64 or larger block could wrap (512 * 4095^2 > 2^32) - confirm input range.
+ switch (width) {
+ case 4:
+ do {  // Four rows per iteration; do/while always runs once, so this reads 4 rows even if height < 4.
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_a1 = xx_loadl_64(a + a_stride);
+ const __m128i v_a2 = xx_loadl_64(a + a_stride * 2);
+ const __m128i v_a3 = xx_loadl_64(a + a_stride * 3);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_b1 = xx_loadl_64(b + b_stride);
+ const __m128i v_b2 = xx_loadl_64(b + b_stride * 2);
+ const __m128i v_b3 = xx_loadl_64(b + b_stride * 3);
+ const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1),  // Combine the four 4-sample rows into one 256-bit vector.
+ _mm_unpacklo_epi64(v_a2, v_a3));
+ const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1),
+ _mm_unpacklo_epi64(v_b2, v_b3));
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 8:
+ do {  // Two 8-sample rows per iteration.
+ const __m256i v_a_w = yy_loadu2_128(a + a_stride, a);
+ const __m256i v_b_w = yy_loadu2_128(b + b_stride, b);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 16:
+ do {  // Widths 16 and up: one row per iteration, 16 samples per helper call.
+ highbd_sse_w16_avx2(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 32:
+ do {
+ highbd_sse_w16_avx2(&sum, a, b);
+ highbd_sse_w16_avx2(&sum, a + 16, b + 16);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 64:
+ do {
+ highbd_sse_w16_avx2(&sum, a, b);
+ highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1);
+ highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2);
+ highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 128:
+ do {
+ highbd_sse_w16_avx2(&sum, a, b);
+ highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1);
+ highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2);
+ highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3);
+ highbd_sse_w16_avx2(&sum, a + 16 * 4, b + 16 * 4);
+ highbd_sse_w16_avx2(&sum, a + 16 * 5, b + 16 * 5);
+ highbd_sse_w16_avx2(&sum, a + 16 * 6, b + 16 * 6);
+ highbd_sse_w16_avx2(&sum, a + 16 * 7, b + 16 * 7);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ default: break;  // Unsupported width: falls through to return 0.
+ }
+ return sse;
diff --git a/third_party/aom/aom_dsp/x86/sse_sse4.c b/third_party/aom/aom_dsp/x86/sse_sse4.c
new file mode 100644
index 000000000..8b5af8469
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sse_sse4.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <smmintrin.h>
+#include "config/aom_config.h"
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+// Horizontally adds the four 32-bit lanes of *sum_all and returns the total
+// as a 64-bit integer. Every lane is zero-extended (read as unsigned) before
+// the addition, so a lane that has grown past INT32_MAX still contributes its
+// full value.
+static INLINE int64_t summary_all_sse4(const __m128i *sum_all) {
+  int64_t sum;
+  // Lanes 0..1 and lanes 2..3, each widened to 2 x 64 bits.
+  const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all);
+  const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8));
+  const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1);
+  // Fold the upper 64-bit lane onto the lower one.
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+// Adds to *sum the squared differences of the 16 consecutive 8-bit samples at
+// a and b. The bytes are zero-extended to 16 bits, subtracted, and squared with
+// _mm_madd_epi16, which also adds adjacent products, so each of the four
+// 32-bit lanes of *sum receives four squared differences per call.
+static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a,
+                                  const uint8_t *b) {
+  const __m128i v_a0 = xx_loadu_128(a);
+  const __m128i v_b0 = xx_loadu_128(b);
+  // Low and high 8 bytes of each source, widened to 16-bit lanes.
+  const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0);
+  const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8));
+  const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0);
+  const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8));
+  const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w);
+  const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w);
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w));
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w));
+}
+// Returns the sum of squared differences between two 8-bit blocks.
+//
+//   a, a_stride   first block and the distance (in samples) between its rows
+//   b, b_stride   second block and the distance (in samples) between its rows
+//   width         block width; only 4, 8, 16, 32, 64 and 128 are handled
+//   height        number of rows
+//
+// Any other width falls through to the default case and yields 0. The width-4
+// path consumes two rows per iteration, so it expects an even height. Every
+// loop is a do/while and therefore processes at least one iteration.
+//
+// Squares are accumulated in four 32-bit lanes and only widened to 64 bits at
+// the end. For 8-bit input a lane grows by at most 2 * 255^2 per madd, which
+// stays below 2^32 for blocks up to 128x128.
+int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b,
+                       int b_stride, int width, int height) {
+  int y = 0;
+  __m128i sum = _mm_setzero_si128();
+  switch (width) {
+    case 4:
+      do {
+        // Two 4-sample rows are packed into one register and handled together.
+        const __m128i v_a0 = xx_loadl_32(a);
+        const __m128i v_a1 = xx_loadl_32(a + a_stride);
+        const __m128i v_b0 = xx_loadl_32(b);
+        const __m128i v_b1 = xx_loadl_32(b + b_stride);
+        const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1));
+        const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1));
+        const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+        sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w));
+        a += a_stride << 1;
+        b += b_stride << 1;
+        y += 2;
+      } while (y < height);
+      break;
+    case 8:
+      do {
+        const __m128i v_a0 = xx_loadl_64(a);
+        const __m128i v_b0 = xx_loadl_64(b);
+        const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0);
+        const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0);
+        const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+        sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w));
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      break;
+    case 16:
+      do {
+        sse_w16_sse4_1(&sum, a, b);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      break;
+    case 32:
+      do {
+        sse_w16_sse4_1(&sum, a, b);
+        sse_w16_sse4_1(&sum, a + 16, b + 16);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      break;
+    case 64:
+      do {
+        sse_w16_sse4_1(&sum, a, b);
+        sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
+        sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
+        sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      break;
+    case 128:
+      do {
+        sse_w16_sse4_1(&sum, a, b);
+        sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
+        sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
+        sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
+        sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4);
+        sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5);
+        sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6);
+        sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      break;
+    default: break;  // unsupported width: sum stays zero
+  }
+  // One reduction for every case; an untouched accumulator reduces to 0.
+  return summary_all_sse4(&sum);
+}
+// Adds to *sum the squared differences of the 8 consecutive 16-bit samples at
+// a and b. The subtraction is done in 16-bit lanes; _mm_madd_epi16 squares the
+// differences and adds adjacent pairs, so each 32-bit lane of *sum receives
+// two squared differences per call.
+static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a,
+                                        const uint16_t *b) {
+  const __m128i v_a_w = xx_loadu_128(a);
+  const __m128i v_b_w = xx_loadu_128(b);
+  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+// Zero-extends the four 32-bit lanes of *row_sum, adds them into the two
+// 64-bit lanes of *acc, and clears *row_sum so it can collect the next row.
+static INLINE void highbd_sse_flush_row_sse4_1(__m128i *acc,
+                                               __m128i *row_sum) {
+  *acc = _mm_add_epi64(*acc, _mm_cvtepu32_epi64(*row_sum));
+  *acc = _mm_add_epi64(*acc, _mm_cvtepu32_epi64(_mm_srli_si128(*row_sum, 8)));
+  *row_sum = _mm_setzero_si128();
+}
+
+// Returns the sum of squared differences between two high-bitdepth blocks.
+//
+//   a8, b8               block pointers; converted to uint16_t sample pointers
+//                        with CONVERT_TO_SHORTPTR
+//   a_stride, b_stride   distance between rows, in 16-bit samples
+//   width                block width; only 4, 8, 16, 32, 64 and 128 are handled
+//   height               number of rows
+//
+// Any other width falls through to the default case and yields 0. The width-4
+// path consumes two rows per iteration, so it expects an even height.
+//
+// The 32-bit lane accumulator is folded into a 64-bit accumulator after every
+// row. Keeping it for the whole block is not safe: with 12-bit samples a lane
+// can gain 2 * 4095^2 per madd, and e.g. a 64x64 block adds 512 of those per
+// lane (about 1.7e10), which does not fit in 32 bits. A single row, even at
+// width 128, stays below 2^32.
+// NOTE(review): differences are formed with 16-bit subtraction, so sample
+// values are presumably limited to the codec bit depth - confirm with callers.
+int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride,
+                              const uint8_t *b8, int b_stride, int width,
+                              int height) {
+  int32_t y = 0;
+  int64_t sse = 0;
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  __m128i sum = _mm_setzero_si128();  // 4 x 32-bit, current row only
+  __m128i acc = _mm_setzero_si128();  // 2 x 64-bit, whole block
+  switch (width) {
+    case 4:
+      do {
+        // Two 4-sample rows share one register.
+        const __m128i v_a0 = xx_loadl_64(a);
+        const __m128i v_a1 = xx_loadl_64(a + a_stride);
+        const __m128i v_b0 = xx_loadl_64(b);
+        const __m128i v_b1 = xx_loadl_64(b + b_stride);
+        const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1);
+        const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1);
+        const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+        sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w));
+        highbd_sse_flush_row_sse4_1(&acc, &sum);
+        a += a_stride << 1;
+        b += b_stride << 1;
+        y += 2;
+      } while (y < height);
+      break;
+    case 8:
+      do {
+        highbd_sse_w8_sse4_1(&sum, a, b);
+        highbd_sse_flush_row_sse4_1(&acc, &sum);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      break;
+    case 16:
+      do {
+        highbd_sse_w8_sse4_1(&sum, a, b);
+        highbd_sse_w8_sse4_1(&sum, a + 8, b + 8);
+        highbd_sse_flush_row_sse4_1(&acc, &sum);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      break;
+    case 32:
+      do {
+        highbd_sse_w8_sse4_1(&sum, a, b);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3);
+        highbd_sse_flush_row_sse4_1(&acc, &sum);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      break;
+    case 64:
+      do {
+        highbd_sse_w8_sse4_1(&sum, a, b);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7);
+        highbd_sse_flush_row_sse4_1(&acc, &sum);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      break;
+    case 128:
+      do {
+        highbd_sse_w8_sse4_1(&sum, a, b);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 8, b + 8 * 8);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 9, b + 8 * 9);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 10, b + 8 * 10);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 11, b + 8 * 11);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 12, b + 8 * 12);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 13, b + 8 * 13);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 14, b + 8 * 14);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 15, b + 8 * 15);
+        highbd_sse_flush_row_sse4_1(&acc, &sum);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      break;
+    default: break;  // unsupported width: acc stays zero
+  }
+  // Combine the two 64-bit partial sums into the final result.
+  xx_storel_64(&sse, _mm_add_epi64(acc, _mm_srli_si128(acc, 8)));
+  return sse;
+}
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_avx2.c b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
new file mode 100644
index 000000000..0af44e3a4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+#include <smmintrin.h>
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/sum_squares_sse2.h"
+#include "config/aom_dsp_rtcd.h"
+// Returns the sum of the squares of every int16_t in a width x height block.
+//
+//   src      top-left sample of the block
+//   stride   distance between rows, in samples
+//   width    processed in steps of 16 samples (one 256-bit load)
+//   height   processed in strips of 4 rows
+//
+// The loops advance by 16 columns and 4 rows and load full vectors, so width
+// is expected to be a multiple of 16 and height a multiple of 4; the caller in
+// this file only dispatches here under exactly that condition.
+//
+// Squares are first collected in 32-bit lanes for one 4-row strip and then
+// widened into a 64-bit accumulator before the next strip starts.
+static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride,
+                                                int width, int height) {
+  uint64_t result;
+  __m256i v_acc_q = _mm256_setzero_si256();
+  const __m256i v_zext_mask_q = yy_set1_64_from_32i(0xffffffff);
+  for (int row = 0; row < height; row += 4) {
+    __m256i v_acc_d = _mm256_setzero_si256();
+    for (int col = 0; col < width; col += 16) {
+      const int16_t *tempsrc = src + col;
+      const __m256i v_val_0_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride));
+      const __m256i v_val_1_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride));
+      const __m256i v_val_2_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride));
+      const __m256i v_val_3_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride));
+      // madd squares each sample and adds adjacent pairs into 32-bit lanes.
+      const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w);
+      const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w);
+      const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w);
+      const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w);
+      const __m256i v_sum_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d);
+      const __m256i v_sum_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d);
+      const __m256i v_sum_0123_d = _mm256_add_epi32(v_sum_01_d, v_sum_23_d);
+      v_acc_d = _mm256_add_epi32(v_acc_d, v_sum_0123_d);
+    }
+    // Widen to 64 bits: the mask keeps the even 32-bit lanes, the shift
+    // brings down the odd ones; both are zero-extended.
+    v_acc_q =
+        _mm256_add_epi64(v_acc_q, _mm256_and_si256(v_acc_d, v_zext_mask_q));
+    v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_srli_epi64(v_acc_d, 32));
+    src += 4 * stride;
+  }
+  // Reduce 4 x 64-bit lanes to a single 64-bit total.
+  __m128i lower_64_2_Value = _mm256_castsi256_si128(v_acc_q);
+  __m128i higher_64_2_Value = _mm256_extracti128_si256(v_acc_q, 1);
+  __m128i result_64_2_int = _mm_add_epi64(lower_64_2_Value, higher_64_2_Value);
+  result_64_2_int = _mm_add_epi64(
+      result_64_2_int, _mm_unpackhi_epi64(result_64_2_int, result_64_2_int));
+  xx_storel_64(&result, result_64_2_int);
+  return result;
+}
+// Returns the sum of squares of a width x height block of int16_t samples,
+// picking an implementation by block shape:
+//   4x4                                   -> SSE2 4x4 kernel
+//   width 4, height multiple of 4         -> SSE2 4xN kernel
+//   width 8, height multiple of 4         -> SSE2 NxN kernel
+//   width multiple of 16, height mult. 4  -> AVX2 kernel in this file
+//   anything else                         -> C implementation
+// stride is the distance between rows in samples.
+uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width,
+                                     int height) {
+  if (LIKELY(width == 4 && height == 4)) {
+    return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
+  } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+    return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
+  } else if (LIKELY(width == 8 && (height & 3) == 0)) {
+    // A width of 8 cannot fill a 256-bit row load, so the SSE2 kernel is used.
+    return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
+  } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) {
+    return aom_sum_squares_2d_i16_nxn_avx2(src, stride, width, height);
+  } else {
+    return aom_sum_squares_2d_i16_c(src, stride, width, height);
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
index a79f22d79..22d7739ec 100644
--- a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
+++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
@@ -14,6 +14,7 @@
#include <stdio.h>
#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/sum_squares_sse2.h"
#include "config/aom_dsp_rtcd.h"
static INLINE __m128i xx_loadh_64(__m128i a, const void *b) {
@@ -44,8 +45,7 @@ static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) {
return _mm_add_epi32(v_sq_01_d, v_sq_23_d);
-static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
- int stride) {
+uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) {
const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride);
__m128i v_sum_d =
_mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));
@@ -53,8 +53,8 @@ static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
-static uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
- int height) {
+uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+ int height) {
int r = 0;
__m128i v_acc_q = _mm_setzero_si128();
do {
@@ -76,7 +76,7 @@ static uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
// maintenance instructions in the common case of 4x4.
-static uint64_t
aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
int height) {
int r = 0;
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.h b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h
new file mode 100644
index 000000000..491e31cc5
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_DSP_X86_SUM_SQUARES_SSE2_H_
+#define AOM_DSP_X86_SUM_SQUARES_SSE2_H_
+
+#include <stdint.h>
+
+// SSE2 sum-of-squares kernels for 2-D blocks of int16_t samples, exported so
+// that other x86 implementations can fall back to them. In every prototype
+// src is the top-left sample and stride the distance between rows in samples.
+
+// General width x height variant.
+uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride,
+                                         int width, int height);
+
+// Variant for blocks that are 4 samples wide and height rows tall.
+uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+                                         int height);
+
+// Variant for a single 4x4 block.
+uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride);
+
+#endif  // AOM_DSP_X86_SUM_SQUARES_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h
index d9a53fcc5..1e9f1e27b 100644
--- a/third_party/aom/aom_dsp/x86/synonyms.h
+++ b/third_party/aom/aom_dsp/x86/synonyms.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at
-#ifndef AOM_DSP_X86_SYNONYMS_H_
-#define AOM_DSP_X86_SYNONYMS_H_
#include <immintrin.h>
@@ -103,15 +103,6 @@ static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) {
return _mm_srai_epi32(v_tmp_d, bits);
-// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
-static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
- const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
- const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
- const __m128i v_tmp_d =
- _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
- return _mm_srai_epi32(v_tmp_d, bits);
static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) {
const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1);
const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15);
@@ -120,4 +111,4 @@ static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) {
return _mm_srai_epi16(v_tmp_d, bits);
-#endif // AOM_DSP_X86_SYNONYMS_H_
+#endif // AOM_AOM_DSP_X86_SYNONYMS_H_
diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
index 39f371fc9..3f69b120e 100644
--- a/third_party/aom/aom_dsp/x86/synonyms_avx2.h
+++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at
#include <immintrin.h>
@@ -61,4 +61,14 @@ static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
-#endif // AOM_DSP_X86_SYNONYMS_AVX2_H_
+// Loads two unaligned 128-bit values and combines them into one 256-bit
+// register: the data at lo becomes the low half, the data at hi the high
+// half. Neither pointer is written through.
+static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
+  // Cast to const __m128i * so the const qualifier of the arguments is kept.
+  const __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
+  const __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
+  return yy_set_m128i(mhi, mlo);
+}
+// Divides every unsigned 16-bit lane of v_val_w by 2^bits, rounding to
+// nearest. The value is shifted right by bits - 1 and then averaged with zero;
+// _mm256_avg_epu16(x, 0) evaluates (x + 1) >> 1, which supplies the final
+// shift together with the rounding bias. bits must be at least 1, because
+// bits - 1 is used as the shift count.
+static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
+  const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);
+  return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256());
+}
diff --git a/third_party/aom/aom_dsp/x86/transpose_sse2.h b/third_party/aom/aom_dsp/x86/transpose_sse2.h
index f88a1527d..d0d1ee684 100644
--- a/third_party/aom/aom_dsp/x86/transpose_sse2.h
+++ b/third_party/aom/aom_dsp/x86/transpose_sse2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at
#include <emmintrin.h> // SSE2
@@ -417,4 +417,4 @@ static INLINE void transpose_32bit_8x4(const __m128i *const in,
out[7] = _mm_unpackhi_epi64(a6, a7);
-#endif // AOM_DSP_X86_TRANSPOSE_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
index bdff64b8f..b1611ba87 100644
--- a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
+++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at
#include <emmintrin.h>
#include "aom/aom_integer.h"
@@ -196,4 +196,4 @@ static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) {
-#endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
index 58a792424..ed82eee96 100644
--- a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
+++ b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at
#include <emmintrin.h>
#include "aom/aom_integer.h"
@@ -26,4 +26,4 @@ static INLINE __m128i mm_reverse_epi16(const __m128i x) {
return _mm_shuffle_epi32(b, 0x4e);
-#endif // AOM_DSP_X86_TXFM_COMMON_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c
index a7ac2c93d..800aef126 100644
--- a/third_party/aom/aom_dsp/x86/variance_avx2.c
+++ b/third_party/aom/aom_dsp/x86/variance_avx2.c
@@ -433,13 +433,14 @@ static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0,
return comp;
-void aom_highbd_comp_mask_pred_avx2(uint16_t *comp_pred, const uint8_t *pred8,
+void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8,
int width, int height, const uint8_t *ref8,
int ref_stride, const uint8_t *mask,
int mask_stride, int invert_mask) {
int i = 0;
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
const uint16_t *src0 = invert_mask ? pred : ref;
const uint16_t *src1 = invert_mask ? ref : pred;
const int stride0 = invert_mask ? width : ref_stride;
diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c
index 7e3c5d5db..3c37e77c0 100644
--- a/third_party/aom/aom_dsp/x86/variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/variance_sse2.c
@@ -16,6 +16,7 @@
#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"
+#include "aom_dsp/blend.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_ports/mem.h"
@@ -485,7 +486,8 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
int mi_row, int mi_col, const MV *const mv,
uint8_t *comp_pred, int width, int height,
int subpel_x_q3, int subpel_y_q3,
- const uint8_t *ref, int ref_stride) {
+ const uint8_t *ref, int ref_stride,
+ int subpel_search) {
// expect xd == NULL only in tests
if (xd != NULL) {
const MB_MODE_INFO *mi = xd->mi[0];
@@ -553,7 +555,7 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
// Get convolve parameters.
- ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd);
+ ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
const InterpFilters filters =
@@ -570,7 +572,10 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
const InterpFilterParams *filter =
- av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+ (subpel_search == 1)
+ ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
+ : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+ int filter_taps = (subpel_search == 1) ? 4 : SUBPEL_TAPS;
if (!subpel_x_q3 && !subpel_y_q3) {
if (width >= 16) {
@@ -632,15 +637,25 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
const int16_t *const kernel_y =
av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
- const int intermediate_height =
- (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+ const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
+ uint8_t *temp_start_horiz =
+ (subpel_search == 1) ? temp + (filter_taps >> 1) * MAX_SB_SIZE : temp;
+ uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
+ int intermediate_height =
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
- aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1),
- ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
- width, intermediate_height);
- aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
- MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
- width, height);
+ // TODO(Deepa): Remove the memset below when we have
+ // 4 tap simd for sse2 and ssse3.
+ if (subpel_search == 1) {
+ memset(temp_start_vert - 3 * MAX_SB_SIZE, 0, width);
+ memset(temp_start_vert - 2 * MAX_SB_SIZE, 0, width);
+ memset(temp_start_vert + (height + 2) * MAX_SB_SIZE, 0, width);
+ memset(temp_start_vert + (height + 3) * MAX_SB_SIZE, 0, width);
+ }
+ aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
+ kernel_x, 16, NULL, -1, width, intermediate_height);
+ aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
+ kernel_y, 16, width, height);
@@ -648,11 +663,11 @@ void aom_comp_avg_upsampled_pred_sse2(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride) {
+ int ref_stride, int subpel_search) {
int n;
int i;
aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
- subpel_x_q3, subpel_y_q3, ref, ref_stride);
+ subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
/*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
assert(!(width * height & 15));
n = width * height >> 4;
@@ -664,3 +679,128 @@ void aom_comp_avg_upsampled_pred_sse2(
pred += 16;
+// Builds a mask-blended compound prediction into comp_pred.
+//
+// If either sub-pixel offset (subpel_x_q3, subpel_y_q3) is non-zero, the
+// reference is first interpolated with aom_upsampled_pred() into comp_pred,
+// and that buffer, whose rows are width samples apart, then replaces ref as
+// the blend input. With both offsets zero, ref is blended directly using
+// ref_stride.
+//
+// The blend itself is done by aom_comp_mask_pred(), which receives pred, the
+// (possibly interpolated) reference, mask / mask_stride and invert_mask
+// unchanged. xd, cm, mi_row, mi_col, mv and subpel_search are only forwarded
+// to aom_upsampled_pred().
+void aom_comp_mask_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+    int subpel_search) {
+  if (subpel_x_q3 | subpel_y_q3) {
+    aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                       subpel_search);
+    // From here on comp_pred is both the blend input and the blend output.
+    ref = comp_pred;
+    ref_stride = width;
+  }
+  aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
+                     mask_stride, invert_mask);
+}
+// Blends eight 16-bit samples of s0 and s1 with the per-sample weights in a
+// (also eight 16-bit lanes). For every lane the result is
+//   (s0 * a + s1 * (alpha_max - a) + round) >> AOM_BLEND_A64_ROUND_BITS
+// where alpha_max = 1 << AOM_BLEND_A64_ROUND_BITS and round is half of that.
+// The 32-bit results are packed back to 16 bits with signed saturation.
+static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0,
+                                                      const __m128i s1,
+                                                      const __m128i a) {
+  const __m128i alpha_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+  const __m128i round_const =
+      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+  const __m128i a_inv = _mm_sub_epi16(alpha_max, a);
+
+  // Interleave (s0, s1) and (a, a_inv) so that one madd yields
+  // s0 * a + s1 * a_inv for each of the four low samples.
+  const __m128i s_lo = _mm_unpacklo_epi16(s0, s1);
+  const __m128i a_lo = _mm_unpacklo_epi16(a, a_inv);
+  const __m128i pred_lo = _mm_madd_epi16(s_lo, a_lo);
+  const __m128i pred_l = _mm_srai_epi32(_mm_add_epi32(pred_lo, round_const),
+                                        AOM_BLEND_A64_ROUND_BITS);
+
+  // Same for the four high samples.
+  const __m128i s_hi = _mm_unpackhi_epi16(s0, s1);
+  const __m128i a_hi = _mm_unpackhi_epi16(a, a_inv);
+  const __m128i pred_hi = _mm_madd_epi16(s_hi, a_hi);
+  const __m128i pred_h = _mm_srai_epi32(_mm_add_epi32(pred_hi, round_const),
+                                        AOM_BLEND_A64_ROUND_BITS);
+
+  const __m128i comp = _mm_packs_epi32(pred_l, pred_h);
+  return comp;
+}
+void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask) {  // Mask-blends pred8 and ref8 into comp_pred8, row by row; only widths 8, 16 and 32 are handled, any other width writes nothing.
+ int i = 0;  // row counter shared by all three width paths
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);  // the three uint8_t* arguments are accessed as 16-bit samples
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ const uint16_t *src0 = invert_mask ? pred : ref;  // src0 is the operand multiplied by the mask value itself
+ const uint16_t *src1 = invert_mask ? ref : pred;  // src1 is multiplied by the complementary weight
+ const int stride0 = invert_mask ? width : ref_stride;  // pred rows are width samples apart, ref rows ref_stride apart
+ const int stride1 = invert_mask ? ref_stride : width;
+ const __m128i zero = _mm_setzero_si128();
+ if (width == 8) {
+ do {
+ const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
+ const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
+ const __m128i m_8 = _mm_loadl_epi64((const __m128i *)mask);  // 8 mask bytes for this row
+ const __m128i m_16 = _mm_unpacklo_epi8(m_8, zero);  // widen mask bytes to 16-bit lanes
+ const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m_16);
+ _mm_storeu_si128((__m128i *)comp_pred, comp);
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;  // output rows are stored back to back
+ i += 1;
+ } while (i < height);
+ } else if (width == 16) {
+ do {
+ const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));  // samples 0..7 of src0
+ const __m128i s2 = _mm_loadu_si128((const __m128i *)(src0 + 8));  // samples 8..15 of src0
+ const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));  // samples 0..7 of src1
+ const __m128i s3 = _mm_loadu_si128((const __m128i *)(src1 + 8));  // samples 8..15 of src1
+ const __m128i m_8 = _mm_loadu_si128((const __m128i *)mask);  // 16 mask bytes for this row
+ const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);  // mask for samples 0..7
+ const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);  // mask for samples 8..15
+ const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+ const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+ _mm_storeu_si128((__m128i *)comp_pred, comp);
+ _mm_storeu_si128((__m128i *)(comp_pred + 8), comp1);
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ } else if (width == 32) {
+ do {
+ for (int j = 0; j < 2; j++) {  // the 32-sample row is handled as two 16-sample halves
+ const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0 + j * 16));
+ const __m128i s2 =
+ _mm_loadu_si128((const __m128i *)(src0 + 8 + j * 16));
+ const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1 + j * 16));
+ const __m128i s3 =
+ _mm_loadu_si128((const __m128i *)(src1 + 8 + j * 16));
+ const __m128i m_8 = _mm_loadu_si128((const __m128i *)(mask + j * 16));  // 16 mask bytes for this half
+ const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
+ const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
+ const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+ const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+ _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp);
+ _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1);
+ }
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ }