diff options
Diffstat (limited to 'third_party/aom/aom_dsp/x86')
51 files changed, 5060 insertions, 2751 deletions
diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c index 401fbdc48..5f5bf5f14 100644 --- a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c +++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c @@ -22,6 +22,13 @@ filter8_1dfunction aom_filter_block1d8_h8_sse2; filter8_1dfunction aom_filter_block1d4_v8_sse2; filter8_1dfunction aom_filter_block1d4_h8_sse2; +#define aom_filter_block1d16_h4_sse2 aom_filter_block1d16_h8_sse2 +#define aom_filter_block1d16_v4_sse2 aom_filter_block1d16_v8_sse2 +#define aom_filter_block1d8_h4_sse2 aom_filter_block1d8_h8_sse2 +#define aom_filter_block1d8_v4_sse2 aom_filter_block1d8_v8_sse2 +#define aom_filter_block1d4_h4_sse2 aom_filter_block1d4_h8_sse2 +#define aom_filter_block1d4_v4_sse2 aom_filter_block1d4_v8_sse2 + filter8_1dfunction aom_filter_block1d16_v2_sse2; filter8_1dfunction aom_filter_block1d16_h2_sse2; filter8_1dfunction aom_filter_block1d8_v2_sse2; diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c index f3fe50372..94b5da171 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c @@ -74,6 +74,87 @@ static INLINE void xx_store2_mi128(const uint8_t *output_ptr, _mm256_extractf128_si256(*a, 1)); } +static void aom_filter_block1d4_h4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + firstFilters = + _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u)); + filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + srcRegFilt32b1_1 = + _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 4 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + + srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 4 bytes + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); + } +} + static void aom_filter_block1d4_h8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { @@ -179,6 +260,100 @@ static void aom_filter_block1d4_h8_avx2( } } +static void aom_filter_block1d8_h4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt2Reg, filt3Reg; + __m256i secondFilters, thirdFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, filtersReg32; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + + // multiply the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 8 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 8 bytes + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1); + } +} + static void aom_filter_block1d8_h8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { @@ -311,6 +486,121 @@ static void aom_filter_block1d8_h8_avx2( } } +static void aom_filter_block1d16_h4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt2Reg, filt3Reg; + __m256i secondFilters, thirdFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, srcReg32b2, filtersReg32; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + + // multiply the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // reading 2 strides of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = + xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 16 bytes + if (i > 0) { + __m256i srcReg1, srcReg12; + __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1; + + srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr)); + srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94); + + // filter the source buffer + srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg); + srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters); + srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters); + + // add and saturate the results together + srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32); + srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1); + srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8); + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, + _mm256_castsi256_si128(srcRegFilt1_1)); + } +} + static void aom_filter_block1d16_h8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { @@ -507,6 +797,92 @@ static void aom_filter_block1d16_h8_avx2( } } +static void aom_filter_block1d8_v4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i filtersReg32, addFilterReg32; + __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; + __m256i srcReg23_34_lo, srcReg45_56_lo; + __m256i resReg23_34_lo, resReg45_56_lo; + __m256i resReglo, resReg; + __m256i secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg4x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4))); + + // have consecutive loads on the same 256 register + srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); + + srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg5x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg45 = + _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); + + srcReg6x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); + srcReg56 = + _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); + + // merge every two consecutive registers + srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters); + resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters); + + // add and saturate the results together + resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo); + + // shift by 6 bit each 16 bit + resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); + resReglo = _mm256_srai_epi16(resReglo, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg = _mm256_packus_epi16(resReglo, resReglo); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, out_pitch, &resReg); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg4x = srcReg6x; + } +} + static void aom_filter_block1d8_v8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { @@ -659,6 +1035,104 @@ static void aom_filter_block1d8_v8_avx2( } } +static void aom_filter_block1d16_v4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i filtersReg32, addFilterReg32; + __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; + __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi; + __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi; + __m256i resReglo, resReghi, resReg; + __m256i secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg4x = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); + + // have consecutive loads on the same 256 register + srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); + + srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); + srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg5x = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg45 = + _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); + + srcReg6x = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); + srcReg56 = + _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); + + // merge every two consecutive registers + srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); + srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters); + resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters); + + // add and saturate the results together + resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters); + resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters); + + // add and saturate the results together + resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi); + + // shift by 6 bit each 16 bit + resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); + resReghi = _mm256_adds_epi16(resReghi, addFilterReg32); + resReglo = _mm256_srai_epi16(resReglo, 6); + resReghi = _mm256_srai_epi16(resReghi, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg = _mm256_packus_epi16(resReglo, resReghi); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, out_pitch, &resReg); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg23_34_hi = srcReg45_56_hi; + srcReg4x = srcReg6x; + } +} + static void aom_filter_block1d16_v8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { @@ -854,6 +1328,88 @@ static void aom_filter_block1d16_v8_avx2( } } +static void aom_filter_block1d4_v4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i filtersReg32, addFilterReg32; + __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; + __m256i srcReg23_34_lo, srcReg45_56_lo; + __m256i srcReg2345_3456_lo; + __m256i resReglo, resReg; + __m256i firstFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + firstFilters = + _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg4x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4))); + + // have consecutive loads on the same 256 register + srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); + + srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg5x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg45 = + _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); + + srcReg6x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); + srcReg56 = + _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); + + // merge every two consecutive registers + srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); + + srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo); + + // multiply 2 adjacent elements with the filter and add the result + resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters); + + resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256()); + + // shift by 6 bit each 16 bit + resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); + resReglo = _mm256_srai_epi16(resReglo, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg = _mm256_packus_epi16(resReglo, resReglo); + + src_ptr += src_stride; + + xx_storeu2_epi32(output_ptr, out_pitch, &resReg); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg4x = srcReg6x; + } +} + #if HAVE_AVX2 && HAVE_SSSE3 filter8_1dfunction aom_filter_block1d4_v8_ssse3; filter8_1dfunction aom_filter_block1d16_v2_ssse3; diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c index 6bcb4a512..325a21b76 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c @@ -287,6 +287,13 @@ filter8_1dfunction aom_filter_block1d8_h8_ssse3; filter8_1dfunction aom_filter_block1d4_v8_ssse3; filter8_1dfunction aom_filter_block1d4_h8_ssse3; +#define aom_filter_block1d16_h4_ssse3 aom_filter_block1d16_h8_ssse3 +#define aom_filter_block1d16_v4_ssse3 aom_filter_block1d16_v8_ssse3 +#define aom_filter_block1d8_h4_ssse3 aom_filter_block1d8_h8_ssse3 +#define aom_filter_block1d8_v4_ssse3 aom_filter_block1d8_v8_ssse3 +#define aom_filter_block1d4_h4_ssse3 aom_filter_block1d4_h8_ssse3 +#define aom_filter_block1d4_v4_ssse3 aom_filter_block1d4_v8_ssse3 + filter8_1dfunction aom_filter_block1d16_v2_ssse3; filter8_1dfunction aom_filter_block1d16_h2_ssse3; filter8_1dfunction aom_filter_block1d8_v2_ssse3; diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c new file mode 100644 index 000000000..67fb4d32b --- /dev/null +++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c @@ -0,0 +1,900 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <smmintrin.h> // SSE4.1 +#include <immintrin.h> // AVX2 + +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "aom_dsp/x86/blend_sse4.h" +#include "aom_dsp/x86/blend_mask_sse4.h" + +#include "config/aom_dsp_rtcd.h" + +static INLINE void blend_a64_d16_mask_w16_avx2( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval, + int shift) { + const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0); + const __m256i s0_0 = yy_loadu_256(src0); + const __m256i s1_0 = yy_loadu_256(src1); + __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0), + _mm256_unpacklo_epi16(*m0, max_minus_m0)); + __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0), + _mm256_unpackhi_epi16(*m0, max_minus_m0)); + res0_lo = + _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift); + res0_hi = + _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift); + const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi); + __m256i res = _mm256_packus_epi16(res0, res0); + res = _mm256_permute4x64_epi64(res, 0xd8); + _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res)); +} + +static INLINE void blend_a64_d16_mask_w32_avx2( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset, + const __m256i *v_maxval, int shift) { + const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0); + const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1); + const __m256i s0_0 = yy_loadu_256(src0); + const __m256i s0_1 = yy_loadu_256(src0 + 16); + const __m256i s1_0 = yy_loadu_256(src1); + const __m256i s1_1 = yy_loadu_256(src1 + 16); + __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0), + _mm256_unpacklo_epi16(*m0, max_minus_m0)); + __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0), + _mm256_unpackhi_epi16(*m0, max_minus_m0)); + __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1), + _mm256_unpacklo_epi16(*m1, max_minus_m1)); + __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1), + _mm256_unpackhi_epi16(*m1, max_minus_m1)); + res0_lo = + _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift); + res0_hi = + _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift); + res1_lo = + _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift); + res1_hi = + _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift); + const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi); + const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi); + __m256i res = _mm256_packus_epi16(res0, res1); + res = _mm256_permute4x64_epi64(res, 0xd8); + _mm256_storeu_si256((__m256i *)(dst), res); +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + const __m128i m = xx_loadu_128(mask); + const __m256i m0 = _mm256_cvtepu8_epi16(m); + + blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m = yy_loadu_256(mask + j); + const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m)); + const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1)); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + for (int i = 0; i < h; ++i) { + const __m256i m_i00 = yy_loadu_256(mask); + const __m256i m_i10 = yy_loadu_256(mask + mask_stride); + + const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10); + const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b); + const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2); + + blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m_i00 = yy_loadu_256(mask + 2 * j); + const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32); + const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j); + const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32); + + const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10); + const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11); + const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b); + const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b); + const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2); + const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i zeros = _mm256_setzero_si256(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m256i m_i00 = yy_loadu_256(mask + 2 * j); + const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b); + const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros); + + blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i zeros = _mm256_setzero_si256(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m_i00 = yy_loadu_256(mask + 2 * j); + const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32); + const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b); + const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b); + const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros); + const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + j); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j); + + const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros); + const __m256i m0 = _mm256_cvtepu8_epi16(m_ac); + + blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i zeros = _mm256_setzero_si256(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m_i00 = yy_loadu_256(mask + j); + const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j); + + const __m256i m_ac = + _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros); + const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac)); + const __m256i m1 = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1)); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +void aom_lowbd_blend_a64_d16_mask_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { + const int bd = 8; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + const int round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + const __m128i v_round_offset = _mm_set1_epi32(round_offset); + const __m256i y_round_offset = _mm256_set1_epi32(round_offset); + + if (subw == 0 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } else if (subw == 1 && subh == 1) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } else if (subw == 1 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } else { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } +} + +static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1, + const __m256i *v_m0_b, + const __m256i *v_m1_b, + const int32_t bits) { + const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0)); + const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1)); + const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8); + const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8); + + const __m256i v_p0_w = + _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b), + _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits); + const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w); + const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8); + return v_res; +} + +static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1, + const __m256i *v_m0_b, + const __m256i *v_m1_b, + const int32_t bits) { + const __m256i v_s0_b = yy_loadu_256(src0); + const __m256i v_s1_b = yy_loadu_256(src1); + + const __m256i v_p0_w = + _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b), + _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b)); + const __m256i v_p1_w = + _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b), + _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b)); + + const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits); + const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits); + const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w); + return v_res; +} + +static INLINE void blend_a64_mask_sx_sy_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m256i v_zmask_b = _mm256_set1_epi16(0xFF); + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + const __m256i v_ral_b = yy_loadu_256(mask); + const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride); + const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b); + const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b); + const __m256i v_rvsbl_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b); + const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w); + + const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2); + const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b, + AOM_BLEND_A64_ROUND_BITS); + + xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b)); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_sy_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m256i v_zmask_b = _mm256_set1_epi16(0xFF); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_ral_b = yy_loadu_256(mask + 2 * c); + const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32); + const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c); + const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32); + const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b); + const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b); + const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b); + const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b); + const __m256i v_rvsbl_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b); + const __m256i v_rvsbh_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b); + const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w); + const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w); + + const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2); + const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2); + const __m256i v_m0_b = + _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_sy_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_ra_b = xx_loadu_128(mask); + const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + break; + } +} + +static INLINE void blend_a64_mask_sx_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m256i v_zmask_b = _mm256_set1_epi16(0xff); + do { + const __m256i v_rl_b = yy_loadu_256(mask); + const __m256i v_al_b = + _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1)); + + const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b); + const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256()); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b, + AOM_BLEND_A64_ROUND_BITS); + + xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b)); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle); + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_r0_b = yy_loadu_256(mask + 2 * c); + const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32); + const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b); + const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b); + const __m256i v_al_b = + _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8)); + const __m256i v_ah_b = + _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8)); + + const __m256i v_m0_b = + _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_r_b = xx_loadl_64(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_r_b = xx_loadu_128(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + break; + } +} + +static INLINE void blend_a64_mask_sy_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + const __m128i v_ra_b = xx_loadu_128(mask); + const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m1_b = _mm_sub_epi16(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst, v_res_b); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sy_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_ra_b = yy_loadu_256(mask + c); + const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride); + const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sy_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + switch (w) { + case 4: + do { + const __m128i v_ra_b = xx_loadl_32(mask); + const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } +} + +static INLINE void blend_a64_mask_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_m0_b = yy_loadu_256(mask + c); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_m0_b = xx_loadl_32(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_m0_b = xx_loadl_64(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 16: + do { + const __m128i v_m0_b = xx_loadu_128(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst, v_res_b); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + default: + blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } +} + +void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, + int h, int subx, int suby) { + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, w, h, subx, suby); + } else { + if (subx & suby) { + blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else if (subx) { + blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else if (suby) { + blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else { + blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, w, h); + } + } +} diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c index 49c20b467..9d6b4c2f7 100644 --- a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c +++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c @@ -20,6 +20,7 @@ #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/blend_sse4.h" +#include "aom_dsp/x86/blend_mask_sse4.h" #include "config/aom_dsp_rtcd.h" @@ -32,19 +33,13 @@ static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; - + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_m0_b = xx_loadl_32(mask); - const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); - + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); dst += dst_stride; @@ -59,19 +54,13 @@ static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; - + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_m0_b = xx_loadl_64(mask); - const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); - + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); dst += dst_stride; @@ -85,23 +74,17 @@ static void blend_a64_mask_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { int c; for (c = 0; c < w; c += 16) { - const __m128i v_m0l_b = xx_loadl_64(mask + c); - const __m128i v_m0h_b = xx_loadl_64(mask + c + 8); - const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b); - const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b); - const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); - const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); - - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); - const __m128i v_resh_w = - blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); + const __m128i v_m0_b = xx_loadu_128(mask + c); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); xx_storeu_128(dst + c, v_res_b); } @@ -120,23 +103,20 @@ static void blend_a64_mask_sx_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_r_b = xx_loadl_64(mask); - const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); - - const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); dst += dst_stride; @@ -150,22 +130,20 @@ static void blend_a64_mask_sx_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_r_b = xx_loadu_128(mask); - const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); - - const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); @@ -180,28 +158,24 @@ static void blend_a64_mask_sx_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { int c; for (c = 0; c < w; c += 16) { - const __m128i v_rl_b = xx_loadu_128(mask + 2 * c); - const __m128i v_rh_b = xx_loadu_128(mask + 2 * c + 16); - const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1)); - const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1)); - - const __m128i v_m0l_w = _mm_and_si128(v_al_b, v_zmask_b); - const __m128i v_m0h_w = _mm_and_si128(v_ah_b, v_zmask_b); - const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); - const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); - - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); - const __m128i v_resh_w = - blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); - - const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + const __m128i v_r0_b = xx_loadu_128(mask + 2 * c); + const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b); + const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); xx_storeu_128(dst + c, v_res_b); } @@ -220,21 +194,18 @@ static void blend_a64_mask_sy_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { const __m128i v_ra_b = xx_loadl_32(mask); const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); - const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); @@ -249,21 +220,16 @@ static void blend_a64_mask_sy_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_ra_b = xx_loadl_64(mask); const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); - const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); - - const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); @@ -278,26 +244,18 @@ static void blend_a64_mask_sy_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zero = _mm_setzero_si128(); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { int c; for (c = 0; c < w; c += 16) { const __m128i v_ra_b = xx_loadu_128(mask + c); const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride); - const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b); - const __m128i v_m0h_w = _mm_unpackhi_epi8(v_a_b, v_zero); - const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); - const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); - - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); - const __m128i v_resh_w = - blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); - - const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); xx_storeu_128(dst + c, v_res_b); } @@ -316,27 +274,24 @@ static void blend_a64_mask_sx_sy_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); (void)w; do { const __m128i v_ra_b = xx_loadl_64(mask); const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); - const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); - const __m128i v_rvsb_w = - _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); - const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); - + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); @@ -351,27 +306,25 @@ static void blend_a64_mask_sx_sy_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); (void)w; do { const __m128i v_ra_b = xx_loadu_128(mask); const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); - const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); - const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); - const __m128i v_rvsb_w = - _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); - const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); @@ -388,8 +341,8 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1( const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { int c; for (c = 0; c < w; c += 16) { @@ -410,14 +363,11 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1( const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2); const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2); - const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); - const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); - - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); - const __m128i v_resh_w = - blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); + const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); xx_storeu_128(dst + c, v_res_b); } @@ -921,24 +871,140 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, } } -static INLINE void blend_a64_d16_mask(uint8_t *dst, const CONV_BUF_TYPE *src0, - const CONV_BUF_TYPE *src1, - const __m128i *m, - const __m128i *v_round_offset, - const __m128i *v_maxval, int round_bits) { - const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); - const __m128i s0 = xx_loadl_64(src0); - const __m128i s1 = xx_loadl_64(src1); - const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1); - const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m); - const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m); - const __m128i res_b = _mm_srli_epi32(res_a, AOM_BLEND_A64_ROUND_BITS); - const __m128i res_c = _mm_sub_epi32(res_b, *v_round_offset); - const __m128i res_d = xx_roundn_epi32(res_c, round_bits); - const __m128i res_e = _mm_packs_epi32(res_d, res_d); - const __m128i res = _mm_packus_epi16(res_e, res_e); - - xx_storel_32(dst, res); +static INLINE void blend_a64_d16_mask_w16_sse41( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset, + const __m128i *v_maxval, int shift) { + const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0); + const __m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1); + const __m128i s0_0 = xx_loadu_128(src0); + const __m128i s0_1 = xx_loadu_128(src0 + 8); + const __m128i s1_0 = xx_loadu_128(src1); + const __m128i s1_1 = xx_loadu_128(src1 + 8); + __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0), + _mm_unpacklo_epi16(*m0, max_minus_m0)); + __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0), + _mm_unpackhi_epi16(*m0, max_minus_m0)); + __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1), + _mm_unpacklo_epi16(*m1, max_minus_m1)); + __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1), + _mm_unpackhi_epi16(*m1, max_minus_m1)); + res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift); + res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift); + res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift); + res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift); + const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi); + const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi); + const __m128i res = _mm_packus_epi16(res0, res1); + + _mm_storeu_si128((__m128i *)(dst), res); +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m = xx_loadu_128(mask + j); + const __m128i m0 = _mm_cvtepu8_epi16(m); + const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8)); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + 2 * j); + const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j); + const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16); + + const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10); + const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11); + const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b); + const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b); + const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2); + const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + 2 * j); + const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); + const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b); + const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b); + const __m128i m0 = _mm_avg_epu16(m0_ac, zeros); + const __m128i m1 = _mm_avg_epu16(m1_ac, zeros); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + j); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j); + + const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros); + const __m128i m0 = _mm_cvtepu8_epi16(m_ac); + const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8)); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } } void aom_lowbd_blend_a64_d16_mask_sse4_1( @@ -947,12 +1013,15 @@ void aom_lowbd_blend_a64_d16_mask_sse4_1( const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params) { const int bd = 8; - const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - const int round_offset = (1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1)); const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); @@ -961,69 +1030,80 @@ void aom_lowbd_blend_a64_d16_mask_sse4_1( assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); - const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - const __m128i v_ro_a = xx_loadl_32(&round_offset); - const __m128i v_round_offset = _mm_shuffle_epi32(v_ro_a, 0); - const __m128i one_w = _mm_set1_epi16(1); - const __m128i one_b = _mm_set1_epi8(1); - const __m128i two_w = _mm_set1_epi16(2); + const __m128i v_round_offset = _mm_set1_epi32(round_offset); if (subw == 0 && subh == 0) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 4) { - const __m128i m0 = xx_loadl_32(&mask[i * mask_stride + j]); - const __m128i m = _mm_cvtepu8_epi16(m0); - - blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j], - &src1[i * src1_stride + j], &m, &v_round_offset, - &v_maxval, round_bits); - } + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; } + } else if (subw == 1 && subh == 1) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 4) { - const __m128i m_i0 = - xx_loadl_64(&mask[(2 * i) * mask_stride + (2 * j)]); - const __m128i m_i1 = - xx_loadl_64(&mask[(2 * i + 1) * mask_stride + (2 * j)]); - const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); - const __m128i m_bd = _mm_maddubs_epi16(m_i1, one_b); - const __m128i m_acbd = _mm_add_epi16(m_ac, m_bd); - const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); - const __m128i m = _mm_srli_epi16(m_acbd_2, 2); - - blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j], - &src1[i * src1_stride + j], &m, &v_round_offset, - &v_maxval, round_bits); - } + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; } } else if (subw == 1 && subh == 0) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 4) { - const __m128i m_i0 = xx_loadl_64(&mask[i * mask_stride + (2 * j)]); - const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); - const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w); - const __m128i m = _mm_srli_epi16(m_ac_1, 1); - - blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j], - &src1[i * src1_stride + j], &m, &v_round_offset, - &v_maxval, round_bits); - } + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; } } else { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 4) { - const __m128i m_i0 = xx_loadl_64(&mask[(2 * i) * mask_stride + j]); - const __m128i m_i1 = xx_loadl_64(&mask[(2 * i + 1) * mask_stride + j]); - const __m128i m_i01 = _mm_unpacklo_epi8(m_i0, m_i1); - const __m128i m_ac = _mm_maddubs_epi16(m_i01, one_b); - const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w); - const __m128i m = _mm_srli_epi16(m_ac_1, 1); - - blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j], - &src1[i * src1_stride + j], &m, &v_round_offset, - &v_maxval, round_bits); - } + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; } } } diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c index 59506bdfe..064910232 100644 --- a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c +++ b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c @@ -39,7 +39,7 @@ static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, const __m128i v_m0_w = _mm_set1_epi16(*mask); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w); const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); @@ -64,7 +64,7 @@ static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, const __m128i v_m0_w = _mm_set1_epi16(*mask); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w); const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); @@ -90,9 +90,9 @@ static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride, const __m128i v_m0_w = _mm_set1_epi16(*mask); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); for (c = 0; c < w; c += 16) { - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0_w, v_m1_w); + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w); const __m128i v_resh_w = - blend_8(src0 + c + 8, src1 + c + 8, v_m0_w, v_m1_w); + blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w); const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); diff --git a/third_party/aom/aom_dsp/x86/blend_mask_sse4.h b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h new file mode 100644 index 000000000..c071fdcfc --- /dev/null +++ b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ +#define AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ +#include <smmintrin.h> // SSE4.1 + +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "aom_dsp/x86/synonyms.h" + +#include "config/aom_dsp_rtcd.h" + +static INLINE void blend_a64_d16_mask_w4_sse41( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, + int shift) { + const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); + const __m128i s0 = xx_loadl_64(src0); + const __m128i s1 = xx_loadl_64(src1); + const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1); + const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m); + const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m); + const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset); + const __m128i res_d = _mm_srai_epi32(res_c, shift); + const __m128i res_e = _mm_packs_epi32(res_d, res_d); + const __m128i res = _mm_packus_epi16(res_e, res_e); + + xx_storel_32(dst, res); +} + +static INLINE void blend_a64_d16_mask_w8_sse41( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, + int shift) { + const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); + const __m128i s0 = xx_loadu_128(src0); + const __m128i s1 = xx_loadu_128(src1); + __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1), + _mm_unpacklo_epi16(*m, max_minus_m)); + __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1), + _mm_unpackhi_epi16(*m, max_minus_m)); + res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift); + res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift); + const __m128i res_e = _mm_packs_epi32(res_lo, res_hi); + const __m128i res = _mm_packus_epi16(res_e, res_e); + + _mm_storel_epi64((__m128i *)(dst), res); +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + const __m128i m0 = xx_loadl_32(mask); + const __m128i m = _mm_cvtepu8_epi16(m0); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + const __m128i m0 = xx_loadl_64(mask); + const __m128i m = _mm_cvtepu8_epi16(m0); + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_i1 = xx_loadl_64(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); + const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); + const __m128i m = _mm_srli_epi16(m_acbd_2, 2); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadu_128(mask); + const __m128i m_i1 = xx_loadu_128(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); + const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); + const __m128i m = _mm_srli_epi16(m_acbd_2, 2); + + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); + const __m128i m = _mm_avg_epu16(m_ac, zeros); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadu_128(mask); + const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); + const __m128i m = _mm_avg_epu16(m_ac, zeros); + + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_i1 = xx_loadl_64(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_i1 = xx_loadl_64(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); + + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} +#endif // AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h index 4880438bc..8d9b32510 100644 --- a/third_party/aom/aom_dsp/x86/blend_sse4.h +++ b/third_party/aom/aom_dsp/x86/blend_sse4.h @@ -9,42 +9,44 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_BLEND_SSE4_H_ -#define AOM_DSP_X86_BLEND_SSE4_H_ +#ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_ +#define AOM_AOM_DSP_X86_BLEND_SSE4_H_ #include "aom_dsp/blend.h" #include "aom_dsp/x86/synonyms.h" +static const uint8_t g_blend_a64_mask_shuffle[32] = { + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, +}; ////////////////////////////////////////////////////////////////////////////// // Common kernels ////////////////////////////////////////////////////////////////////////////// static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1, - const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i *v_m0_w, const __m128i *v_m1_w) { const __m128i v_s0_b = xx_loadl_32(src0); const __m128i v_s1_b = xx_loadl_32(src1); const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); - const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); - const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); - + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); - const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); return v_res_w; } static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1, - const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i *v_m0_w, const __m128i *v_m1_w) { const __m128i v_s0_b = xx_loadl_64(src0); const __m128i v_s1_b = xx_loadl_64(src1); const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); - const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); - const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); @@ -53,6 +55,51 @@ static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1, return v_res_w; } +static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadl_32(src0); + const __m128i v_s1_b = xx_loadl_32(src1); + + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); + return v_res; +} + +static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadl_64(src0); + const __m128i v_s1_b = xx_loadl_64(src1); + + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); + return v_res; +} + +static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadu_128(src0); + const __m128i v_s1_b = xx_loadu_128(src1); + + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b), + _mm_unpackhi_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w); + return v_res; +} + typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1, const __m128i v_m0_w, const __m128i v_m1_w); @@ -141,4 +188,4 @@ static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1, return v_res_w; } -#endif // AOM_DSP_X86_BLEND_SSE4_H_ +#endif // AOM_AOM_DSP_X86_BLEND_SSE4_H_ diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h index 3f46420dd..96fe4ebb6 100644 --- a/third_party/aom/aom_dsp/x86/common_avx2.h +++ b/third_party/aom/aom_dsp/x86/common_avx2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_COMMON_AVX2_H -#define AOM_DSP_X86_COMMON_AVX2_H +#ifndef AOM_AOM_DSP_X86_COMMON_AVX2_H_ +#define AOM_AOM_DSP_X86_COMMON_AVX2_H_ #include <immintrin.h> @@ -144,4 +144,4 @@ static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) { out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20); out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31); } -#endif +#endif // AOM_AOM_DSP_X86_COMMON_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h index 36fb1963a..3e19682cd 100644 --- a/third_party/aom/aom_dsp/x86/convolve.h +++ b/third_party/aom/aom_dsp/x86/convolve.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_CONVOLVE_H_ -#define AOM_DSP_X86_CONVOLVE_H_ +#ifndef AOM_AOM_DSP_X86_CONVOLVE_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_H_ #include <assert.h> @@ -17,7 +17,6 @@ #include "aom/aom_integer.h" #include "aom_ports/mem.h" -#include "aom_dsp/aom_convolve.h" typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, @@ -34,7 +33,30 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, (void)y_step_q4; \ assert((-128 <= filter[3]) && (filter[3] <= 127)); \ assert(step_q4 == 16); \ - if (filter[0] | filter[1] | filter[2]) { \ + if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \ + (filter[2] | filter[5])) { \ + while (w >= 16) { \ + aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else if (filter[0] | filter[1] | filter[2]) { \ while (w >= 16) { \ aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ dst_stride, h, filter); \ @@ -153,4 +175,4 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, } \ } -#endif // AOM_DSP_X86_CONVOLVE_H_ +#endif // AOM_AOM_DSP_X86_CONVOLVE_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h index 72fabd236..30253f65c 100644 --- a/third_party/aom/aom_dsp/x86/convolve_avx2.h +++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_CONVOLVE_AVX2_H_ -#define AOM_DSP_X86_CONVOLVE_AVX2_H_ +#ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ // filters for 16 DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = { @@ -29,6 +29,11 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = { 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, }; +DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = { + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, +}; + static INLINE void prepare_coeffs_lowbd( const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { @@ -191,4 +196,4 @@ static INLINE __m256i highbd_convolve_rounding( return res_round; } -#endif +#endif // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h index e80c5872f..707bd2d78 100644 --- a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h +++ b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ -#define _AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ +#ifndef AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ // Note: // This header file should be put below any x86 intrinsics head file @@ -28,4 +28,4 @@ static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res, _mm_store_si128((__m128i *)dst, d); } -#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ +#endif // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h index 399df5d6d..445d04b10 100644 --- a/third_party/aom/aom_dsp/x86/convolve_sse2.h +++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_CONVOLVE_SSE2_H_ -#define AOM_DSP_X86_CONVOLVE_SSE2_H_ +#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ // Note: // This header file should be put below any x86 intrinsics head file @@ -118,4 +118,4 @@ static INLINE __m128i highbd_convolve_rounding_sse2( return res_round; } -#endif +#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h index d48c25667..6b8388d84 100644 --- a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h +++ b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _AOM_DSP_X86_CONVOLVE_SSE4_1_INTRIN_H_ -#define _AOM_DSP_X86_CONVOLVE_SSE4_1_INTRIN_H_ +#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ // Note: // This header file should be put below any x86 intrinsics head file @@ -50,4 +50,4 @@ static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0, return res; } -#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ +#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h index 12ccf7f26..260d8dd58 100644 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_FWD_TXFM_SSE2_H_ -#define AOM_DSP_X86_FWD_TXFM_SSE2_H_ +#ifndef AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ +#define AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ #ifdef __cplusplus extern "C" { @@ -152,4 +152,4 @@ static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { } // extern "C" #endif -#endif // AOM_DSP_X86_FWD_TXFM_SSE2_H_ +#endif // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm deleted file mode 100644 index 99f17ebdf..000000000 --- a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm +++ /dev/null @@ -1,351 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "aom_ports/x86_abi_support.asm" - -SECTION .text - -;void aom_half_horiz_vert_variance16x_h_sse2(unsigned char *ref, -; int ref_stride, -; unsigned char *src, -; int src_stride, -; unsigned int height, -; int *sum, -; unsigned int *sumsquared) -global sym(aom_half_horiz_vert_variance16x_h_sse2) PRIVATE -sym(aom_half_horiz_vert_variance16x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref - - mov rdi, arg(2) ;src - movsxd rcx, dword ptr arg(4) ;height - movsxd rax, dword ptr arg(1) ;ref_stride - movsxd rdx, dword ptr arg(3) ;src_stride - - pxor xmm0, xmm0 ; - - movdqu xmm5, XMMWORD PTR [rsi] - movdqu xmm3, XMMWORD PTR [rsi+1] - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1 - - lea rsi, [rsi + rax] - -aom_half_horiz_vert_variance16x_h_1: - movdqu xmm1, XMMWORD PTR [rsi] ; - movdqu xmm2, XMMWORD PTR [rsi+1] ; - pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1 - - pavgb xmm5, xmm1 ; xmm = vertical average of the above - - movdqa xmm4, xmm5 - punpcklbw xmm5, xmm0 ; xmm5 = words of above - punpckhbw xmm4, xmm0 - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - psubw xmm5, xmm3 ; xmm5 -= xmm3 - - movq xmm3, QWORD PTR [rdi+8] - punpcklbw xmm3, xmm0 - psubw xmm4, xmm3 - - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - paddw xmm6, xmm4 - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - pmaddwd xmm4, xmm4 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - paddd xmm7, xmm4 - - movdqa xmm5, xmm1 ; save xmm1 for use on the next row - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - - sub rcx, 1 ; - jnz aom_half_horiz_vert_variance16x_h_1 ; - - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(5) ;[Sum] - mov rdi, arg(6) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void aom_half_vert_variance16x_h_sse2(unsigned char *ref, -; int ref_stride, -; unsigned char *src, -; int src_stride, -; unsigned int height, -; int *sum, -; unsigned int *sumsquared) -global sym(aom_half_vert_variance16x_h_sse2) PRIVATE -sym(aom_half_vert_variance16x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref - - mov rdi, arg(2) ;src - movsxd rcx, dword ptr arg(4) ;height - movsxd rax, dword ptr arg(1) ;ref_stride - movsxd rdx, dword ptr arg(3) ;src_stride - - movdqu xmm5, XMMWORD PTR [rsi] - lea rsi, [rsi + rax ] - pxor xmm0, xmm0 - -aom_half_vert_variance16x_h_1: - movdqu xmm3, XMMWORD PTR [rsi] - - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) - movdqa xmm4, xmm5 - punpcklbw xmm5, xmm0 - punpckhbw xmm4, xmm0 - - movq xmm2, QWORD PTR [rdi] - punpcklbw xmm2, xmm0 - psubw xmm5, xmm2 - movq xmm2, QWORD PTR [rdi+8] - punpcklbw xmm2, xmm0 - psubw xmm4, xmm2 - - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - paddw xmm6, xmm4 - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - pmaddwd xmm4, xmm4 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - paddd xmm7, xmm4 - - movdqa xmm5, xmm3 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - - sub rcx, 1 - jnz aom_half_vert_variance16x_h_1 - - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(5) ;[Sum] - mov rdi, arg(6) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void aom_half_horiz_variance16x_h_sse2(unsigned char *ref, -; int ref_stride -; unsigned char *src, -; int src_stride, -; unsigned int height, -; int *sum, -; unsigned int *sumsquared) -global sym(aom_half_horiz_variance16x_h_sse2) PRIVATE -sym(aom_half_horiz_variance16x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref - - mov rdi, arg(2) ;src - movsxd rcx, dword ptr arg(4) ;height - movsxd rax, dword ptr arg(1) ;ref_stride - movsxd rdx, dword ptr arg(3) ;src_stride - - pxor xmm0, xmm0 ; - -aom_half_horiz_variance16x_h_1: - movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15 - movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16 - - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) - movdqa xmm1, xmm5 - punpcklbw xmm5, xmm0 ; xmm5 = words of above - punpckhbw xmm1, xmm0 - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - movq xmm2, QWORD PTR [rdi+8] - punpcklbw xmm2, xmm0 - - psubw xmm5, xmm3 ; xmm5 -= xmm3 - psubw xmm1, xmm2 - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - paddw xmm6, xmm1 - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - pmaddwd xmm1, xmm1 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - paddd xmm7, xmm1 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - - sub rcx, 1 ; - jnz aom_half_horiz_variance16x_h_1 ; - - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(5) ;[Sum] - mov rdi, arg(6) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; -align 16 -xmm_bi_rd: - times 8 dw 64 -align 16 -aom_bilinear_filters_sse2: - dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 - dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 - dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 - dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 - dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 - dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 - dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 - dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c b/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c deleted file mode 100644 index 2a018c1cf..000000000 --- a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" - -void aom_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref, - int ref_stride, - const unsigned char *src, - int src_stride, unsigned int height, - int *sum, unsigned int *sumsquared); -void aom_half_horiz_variance16x_h_sse2(const unsigned char *ref, int ref_stride, - const unsigned char *src, int src_stride, - unsigned int height, int *sum, - unsigned int *sumsquared); -void aom_half_vert_variance16x_h_sse2(const unsigned char *ref, int ref_stride, - const unsigned char *src, int src_stride, - unsigned int height, int *sum, - unsigned int *sumsquared); - -uint32_t aom_variance_halfpixvar16x16_h_sse2(const unsigned char *src, - int src_stride, - const unsigned char *dst, - int dst_stride, uint32_t *sse) { - int xsum0; - unsigned int xxsum0; - - aom_half_horiz_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, - &xsum0, &xxsum0); - - *sse = xxsum0; - assert(xsum0 <= 255 * 16 * 16); - assert(xsum0 >= -255 * 16 * 16); - return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8)); -} - -uint32_t aom_variance_halfpixvar16x16_v_sse2(const unsigned char *src, - int src_stride, - const unsigned char *dst, - int dst_stride, uint32_t *sse) { - int xsum0; - unsigned int xxsum0; - aom_half_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, &xsum0, - &xxsum0); - - *sse = xxsum0; - assert(xsum0 <= 255 * 16 * 16); - assert(xsum0 >= -255 * 16 * 16); - return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8)); -} - -uint32_t aom_variance_halfpixvar16x16_hv_sse2(const unsigned char *src, - int src_stride, - const unsigned char *dst, - int dst_stride, uint32_t *sse) { - int xsum0; - unsigned int xxsum0; - - aom_half_horiz_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, - &xsum0, &xxsum0); - - *sse = xxsum0; - assert(xsum0 <= 255 * 16 * 16); - assert(xsum0 >= -255 * 16 * 16); - return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8)); -} diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c index 83e0098ba..097e0778f 100644 --- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c @@ -327,6 +327,7 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt, const unsigned char *lt, const unsigned char *thr, int bd) { int i; + const __m128i zero = _mm_setzero_si128(); __m128i blimit, limit, thresh; __m128i t80; get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80); @@ -355,13 +356,18 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( flat2 = _mm_unpacklo_epi64(flat2, flat2); // flat and wide flat calculations - __m128i flat_p[3], flat_q[3], flat_pq[3]; - __m128i flat2_p[6], flat2_q[6]; - __m128i flat2_pq[6]; - { - __m128i work0; + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i flat_p[3], flat_q[3], flat_pq[3]; + __m128i flat2_p[6], flat2_q[6]; + __m128i flat2_pq[6]; + __m128i sum_p6, sum_p3; const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); + + __m128i work0, work0_0, work0_1, sum_p_0; __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3])); __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1])); sum_p = _mm_add_epi16(sum_p, sum_lp); @@ -369,30 +375,23 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( __m128i sum_lq = _mm_srli_si128(sum_lp, 8); __m128i sum_q = _mm_srli_si128(sum_p, 8); - sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); - work0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]); - flat2_p[0] = _mm_add_epi16(sum_p, _mm_add_epi16(work0, q[0])); - flat2_q[0] = - _mm_add_epi16(sum_p, _mm_add_epi16(_mm_srli_si128(work0, 8), p[0])); - - flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])); + flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0])); flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])); - __m128i sum_p6, sum_p3; sum_p6 = _mm_add_epi16(pq[6], pq[6]); sum_p3 = _mm_add_epi16(pq[3], pq[3]); - sum_q = _mm_sub_epi16(sum_p, p[5]); - sum_p = _mm_sub_epi16(sum_p, q[5]); + sum_q = _mm_sub_epi16(sum_p_0, pq[5]); + sum_p = _mm_sub_epi16(sum_p_0, q[5]); - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0]))); - flat2_p[1] = _mm_add_epi16(sum_p, work0); - flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]); + work0_1 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0]))); - sum_lq = _mm_sub_epi16(sum_lp, p[2]); + sum_lq = _mm_sub_epi16(sum_lp, pq[2]); sum_lp = _mm_sub_epi16(sum_lp, q[2]); work0 = _mm_add_epi16(sum_p3, pq[1]); @@ -402,21 +401,8 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3); flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3); - flat2_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); - flat2_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); - - sum_p = _mm_sub_epi16(sum_p, q[4]); - sum_q = _mm_sub_epi16(sum_q, p[4]); - - sum_p6 = _mm_add_epi16(sum_p6, pq[6]); - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1]))); - flat2_p[2] = _mm_add_epi16(sum_p, work0); - flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); - sum_lp = _mm_sub_epi16(sum_lp, q[1]); - sum_lq = _mm_sub_epi16(sum_lq, p[1]); + sum_lq = _mm_sub_epi16(sum_lq, pq[1]); sum_p3 = _mm_add_epi16(sum_p3, pq[3]); work0 = _mm_add_epi16(sum_p3, pq[2]); @@ -425,54 +411,88 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3); - sum_p6 = _mm_add_epi16(sum_p6, pq[6]); - sum_p = _mm_sub_epi16(sum_p, q[3]); - sum_q = _mm_sub_epi16(sum_q, p[3]); - - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2]))); - flat2_p[3] = _mm_add_epi16(sum_p, work0); - flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[3] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); - - sum_p6 = _mm_add_epi16(sum_p6, pq[6]); - sum_p = _mm_sub_epi16(sum_p, q[2]); - sum_q = _mm_sub_epi16(sum_q, p[2]); - - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3]))); - flat2_p[4] = _mm_add_epi16(sum_p, work0); - flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[4] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); - - sum_p6 = _mm_add_epi16(sum_p6, pq[6]); - sum_p = _mm_sub_epi16(sum_p, q[1]); - sum_q = _mm_sub_epi16(sum_q, p[1]); - - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4]))); - flat2_p[5] = _mm_add_epi16(sum_p, work0); - flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[5] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); - } - - // highbd_filter8 - pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); - pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); - - for (i = 0; i < 3; i++) { - pq[i] = _mm_andnot_si128(flat, pq[i]); - flat_pq[i] = _mm_and_si128(flat, flat_pq[i]); - pq[i] = _mm_or_si128(pq[i], flat_pq[i]); - } - - // highbd_filter16 - for (i = 5; i >= 0; i--) { - // p[i] remains unchanged if !(flat2 && flat && mask) - pq[i] = _mm_andnot_si128(flat2, pq[i]); - flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]); - // get values for when (flat2 && flat && mask) - pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values + int flat2_mask = + (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero))); + if (flat2_mask) { + flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0])); + flat2_q[0] = _mm_add_epi16( + sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0])); + + flat2_p[1] = _mm_add_epi16(sum_p, work0_1); + flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8)); + + flat2_pq[0] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); + flat2_pq[1] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); + + sum_p = _mm_sub_epi16(sum_p, q[4]); + sum_q = _mm_sub_epi16(sum_q, pq[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1]))); + flat2_p[2] = _mm_add_epi16(sum_p, work0); + flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[2] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[3]); + sum_q = _mm_sub_epi16(sum_q, pq[3]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2]))); + flat2_p[3] = _mm_add_epi16(sum_p, work0); + flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[3] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[2]); + sum_q = _mm_sub_epi16(sum_q, pq[2]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3]))); + flat2_p[4] = _mm_add_epi16(sum_p, work0); + flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[4] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[1]); + sum_q = _mm_sub_epi16(sum_q, pq[1]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4]))); + flat2_p[5] = _mm_add_epi16(sum_p, work0); + flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[5] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); + } // flat2 + // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // highbd_filter8 + pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); + pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); + + for (i = 0; i < 3; i++) { + pq[i] = _mm_andnot_si128(flat, pq[i]); + flat_pq[i] = _mm_and_si128(flat, flat_pq[i]); + pq[i] = _mm_or_si128(pq[i], flat_pq[i]); + } + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + if (flat2_mask) { + for (i = 0; i < 6; i++) { + pq[i] = _mm_andnot_si128(flat2, pq[i]); + flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]); + pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values + } + } + } else { + pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); + pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); } } @@ -500,6 +520,8 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2( const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1, const uint8_t *thr1, int bd) { __m128i blimit, limit, thresh, t80; + const __m128i zero = _mm_setzero_si128(); + get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh, &t80); __m128i mask; @@ -512,27 +534,22 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2( __m128i ps[2], qs[2]; highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80); // flat and wide flat calculations - __m128i flat_p[3], flat_q[3]; - __m128i flat2_p[6], flat2_q[6]; - { + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i flat_p[3], flat_q[3]; + __m128i flat2_p[6], flat2_q[6]; const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); - __m128i sum_p = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3])); + __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3])); __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3])); __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1])); - sum_p = _mm_add_epi16(sum_p, sum_lp); + sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp); __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1])); sum_q = _mm_add_epi16(sum_q, sum_lq); - sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q)); sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); - flat2_p[0] = _mm_srli_epi16( - _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(p[6], p[0]), - _mm_add_epi16(p[1], q[0]))), - 4); - flat2_q[0] = _mm_srli_epi16( - _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(q[6], q[0]), - _mm_add_epi16(p[0], q[1]))), - 4); flat_p[0] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3); flat_q[0] = @@ -541,117 +558,160 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2( __m128i sum_q6 = _mm_add_epi16(q[6], q[6]); __m128i sum_p3 = _mm_add_epi16(p[3], p[3]); __m128i sum_q3 = _mm_add_epi16(q[3], q[3]); - sum_q = _mm_sub_epi16(sum_p, p[5]); - sum_p = _mm_sub_epi16(sum_p, q[5]); - flat2_p[1] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, _mm_add_epi16( - sum_p6, _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))), - 4); - flat2_q[1] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, _mm_add_epi16( - sum_q6, _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))), - 4); + + sum_q = _mm_sub_epi16(sum_p_0, p[5]); + __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]); + sum_lq = _mm_sub_epi16(sum_lp, p[2]); sum_lp = _mm_sub_epi16(sum_lp, q[2]); flat_p[1] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3); flat_q[1] = _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3); - sum_p6 = _mm_add_epi16(sum_p6, p[6]); - sum_q6 = _mm_add_epi16(sum_q6, q[6]); - sum_p3 = _mm_add_epi16(sum_p3, p[3]); - sum_q3 = _mm_add_epi16(sum_q3, q[3]); - sum_p = _mm_sub_epi16(sum_p, q[4]); - sum_q = _mm_sub_epi16(sum_q, p[4]); - flat2_p[2] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, _mm_add_epi16( - sum_p6, _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))), - 4); - flat2_q[2] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, _mm_add_epi16( - sum_q6, _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))), - 4); + sum_lp = _mm_sub_epi16(sum_lp, q[1]); sum_lq = _mm_sub_epi16(sum_lq, p[1]); + sum_p3 = _mm_add_epi16(sum_p3, p[3]); + sum_q3 = _mm_add_epi16(sum_q3, q[3]); flat_p[2] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3); flat_q[2] = _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3); - sum_p6 = _mm_add_epi16(sum_p6, p[6]); - sum_q6 = _mm_add_epi16(sum_q6, q[6]); - sum_p = _mm_sub_epi16(sum_p, q[3]); - sum_q = _mm_sub_epi16(sum_q, p[3]); - flat2_p[3] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, _mm_add_epi16( - sum_p6, _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))), - 4); - flat2_q[3] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, _mm_add_epi16( - sum_q6, _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))), - 4); - sum_p6 = _mm_add_epi16(sum_p6, p[6]); - sum_q6 = _mm_add_epi16(sum_q6, q[6]); - sum_p = _mm_sub_epi16(sum_p, q[2]); - sum_q = _mm_sub_epi16(sum_q, p[2]); - flat2_p[4] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, _mm_add_epi16( - sum_p6, _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))), - 4); - flat2_q[4] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, _mm_add_epi16( - sum_q6, _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))), - 4); - sum_p6 = _mm_add_epi16(sum_p6, p[6]); - sum_q6 = _mm_add_epi16(sum_q6, q[6]); - sum_p = _mm_sub_epi16(sum_p, q[1]); - sum_q = _mm_sub_epi16(sum_q, p[1]); - flat2_p[5] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, _mm_add_epi16( - sum_p6, _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))), - 4); - flat2_q[5] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, _mm_add_epi16( - sum_q6, _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))), - 4); - } - // highbd_filter8 - p[2] = _mm_andnot_si128(flat, p[2]); - // p2 remains unchanged if !(flat && mask) - flat_p[2] = _mm_and_si128(flat, flat_p[2]); - // when (flat && mask) - p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values - q[2] = _mm_andnot_si128(flat, q[2]); - flat_q[2] = _mm_and_si128(flat, flat_q[2]); - q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values - int i; - for (i = 1; i >= 0; i--) { - ps[i] = _mm_andnot_si128(flat, ps[i]); - flat_p[i] = _mm_and_si128(flat, flat_p[i]); - p[i] = _mm_or_si128(ps[i], flat_p[i]); - qs[i] = _mm_andnot_si128(flat, qs[i]); - flat_q[i] = _mm_and_si128(flat, flat_q[i]); - q[i] = _mm_or_si128(qs[i], flat_q[i]); - } - // highbd_filter16 - for (i = 5; i >= 0; i--) { - // p[i] remains unchanged if !(flat2 && flat && mask) - p[i] = _mm_andnot_si128(flat2, p[i]); - flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); - // get values for when (flat2 && flat && mask) - p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values - q[i] = _mm_andnot_si128(flat2, q[i]); - flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); - q[i] = _mm_or_si128(q[i], flat2_q[i]); + + int flat2_mask = + (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero))); + if (flat2_mask) { + flat2_p[0] = _mm_srli_epi16( + _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]), + _mm_add_epi16(p[1], q[0]))), + 4); + flat2_q[0] = _mm_srli_epi16( + _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]), + _mm_add_epi16(p[0], q[1]))), + 4); + + flat2_p[1] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))), + 4); + flat2_q[1] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[4]); + sum_q = _mm_sub_epi16(sum_q, p[4]); + flat2_p[2] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))), + 4); + flat2_q[2] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[3]); + sum_q = _mm_sub_epi16(sum_q, p[3]); + flat2_p[3] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))), + 4); + flat2_q[3] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[2]); + sum_q = _mm_sub_epi16(sum_q, p[2]); + flat2_p[4] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))), + 4); + flat2_q[4] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[1]); + sum_q = _mm_sub_epi16(sum_q, p[1]); + flat2_p[5] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))), + 4); + flat2_q[5] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))), + 4); + } + // highbd_filter8 + int i; + for (i = 0; i < 2; i++) { + ps[i] = _mm_andnot_si128(flat, ps[i]); + flat_p[i] = _mm_and_si128(flat, flat_p[i]); + p[i] = _mm_or_si128(ps[i], flat_p[i]); + qs[i] = _mm_andnot_si128(flat, qs[i]); + flat_q[i] = _mm_and_si128(flat, flat_q[i]); + q[i] = _mm_or_si128(qs[i], flat_q[i]); + } + p[2] = _mm_andnot_si128(flat, p[2]); + // p2 remains unchanged if !(flat && mask) + flat_p[2] = _mm_and_si128(flat, flat_p[2]); + // when (flat && mask) + p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values + q[2] = _mm_andnot_si128(flat, q[2]); + flat_q[2] = _mm_and_si128(flat, flat_q[2]); + q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values + + for (i = 0; i < 2; i++) { + ps[i] = _mm_andnot_si128(flat, ps[i]); + flat_p[i] = _mm_and_si128(flat, flat_p[i]); + p[i] = _mm_or_si128(ps[i], flat_p[i]); + qs[i] = _mm_andnot_si128(flat, qs[i]); + flat_q[i] = _mm_and_si128(flat, flat_q[i]); + q[i] = _mm_or_si128(qs[i], flat_q[i]); + } + // highbd_filter16 + if (flat2_mask) { + for (i = 0; i < 6; i++) { + // p[i] remains unchanged if !(flat2 && flat && mask) + p[i] = _mm_andnot_si128(flat2, p[i]); + flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); + // get values for when (flat2 && flat && mask) + p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values + q[i] = _mm_andnot_si128(flat2, q[i]); + flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); + q[i] = _mm_or_si128(q[i], flat2_q[i]); + } + } + } else { + p[0] = ps[0]; + q[0] = qs[0]; + p[1] = ps[1]; + q[1] = qs[1]; } } @@ -696,6 +756,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2( highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, &thresh, &hev, &mask); + // lp filter + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); + // flat_mask flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0); flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8)); @@ -707,53 +770,56 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2( // replicate for the further "merged variables" usage flat = _mm_unpacklo_epi64(flat, flat); - { - __m128i workp_a, workp_b, workp_shft0, workp_shft1; + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b, workp_c; + __m128i pq0x2_pq1, pq1_pq2; // op1 - workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0), - _mm_add_epi16(*p1, *p1)); // *p0 *2 + *p1 * 2 - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), - *p2); // *p2 + *p0 * 2 + *p1 * 2 + 4 + pq0x2_pq1 = + _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]); // p0 *2 + p1 + pq1_pq2 = _mm_add_epi16(pq[1], pq[2]); // p1 + p2 + workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four), + pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4 - workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0); - workp_shft0 = _mm_add_epi16( - workp_a, workp_b); // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4 + workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0); + workp_b = + _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 // op0 - workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1); // *q0 * 2 + *q1 - workp_a = - _mm_add_epi16(workp_a, - workp_b); // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4 - - flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_a, workp_shft0), 3); + workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1 + workp_a = _mm_add_epi16(workp_a, + workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 + workp_b = _mm_unpacklo_epi64(workp_a, workp_b); + flat_p1p0 = _mm_srli_epi16(workp_b, 3); // oq0 - workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2), - *p1); // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4 - workp_b = _mm_add_epi16(*q1, *q2); - workp_shft0 = _mm_add_epi16( - workp_a, workp_b); // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]), + pq[1]); // p0 * 2 + p1 + q0 * 2 + q1 + 4 + workp_b = _mm_srli_si128(pq1_pq2, 8); + workp_a = _mm_add_epi16( + workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 + // workp_shft0 = _mm_srli_epi16(workp_a, 3); // oq1 - workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1), - *p0); // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4 + workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]), + pq[0]); // p0 + q0 * 2 + q1 * 2 + q2 + 4 workp_b = _mm_add_epi16(*q2, *q2); - workp_shft1 = _mm_add_epi16( - workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4 + workp_b = + _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 - flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3); - } - // lp filter - highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd); + workp_a = _mm_unpacklo_epi64(workp_a, workp_b); + flat_q0q1 = _mm_srli_epi16(workp_a, 3); - qs1qs0 = _mm_andnot_si128(flat, qs1qs0); - q1q0 = _mm_and_si128(flat, flat_q0q1); - *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); - ps1ps0 = _mm_andnot_si128(flat, ps1ps0); - p1p0 = _mm_and_si128(flat, flat_p1p0); - *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } } static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( @@ -797,6 +863,17 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( mask = _mm_subs_epu16(mask, limit0); mask = _mm_cmpeq_epi16(mask, zero); + // lp filter + __m128i ps[2], qs[2], p[2], q[2]; + { + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + // filter_mask and hev_mask + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + } + // flat_mask flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0)); flat = _mm_max_epi16(flat, work); @@ -806,7 +883,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( flat = _mm_cmpeq_epi16(flat, zero); flat = _mm_and_si128(flat, mask); // flat & mask - { + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { __m128i workp_a, workp_b, workp_shft0, workp_shft1; // op1 @@ -842,33 +921,28 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( workp_shft1 = _mm_add_epi16( workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4 oq1 = _mm_srli_epi16(workp_shft1, 3); - } - // lp filter - __m128i ps[2], qs[2], p[2], q[2]; - { - p[0] = *p0; - p[1] = *p1; - q[0] = *q0; - q[1] = *q1; - // filter_mask and hev_mask - highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); - } - - qs[0] = _mm_andnot_si128(flat, qs[0]); - oq0 = _mm_and_si128(flat, oq0); - *q0 = _mm_or_si128(qs[0], oq0); - qs[1] = _mm_andnot_si128(flat, qs[1]); - oq1 = _mm_and_si128(flat, oq1); - *q1 = _mm_or_si128(qs[1], oq1); - - ps[0] = _mm_andnot_si128(flat, ps[0]); - op0 = _mm_and_si128(flat, op0); - *p0 = _mm_or_si128(ps[0], op0); - - ps[1] = _mm_andnot_si128(flat, ps[1]); - op1 = _mm_and_si128(flat, op1); - *p1 = _mm_or_si128(ps[1], op1); + qs[0] = _mm_andnot_si128(flat, qs[0]); + oq0 = _mm_and_si128(flat, oq0); + *q0 = _mm_or_si128(qs[0], oq0); + + qs[1] = _mm_andnot_si128(flat, qs[1]); + oq1 = _mm_and_si128(flat, oq1); + *q1 = _mm_or_si128(qs[1], oq1); + + ps[0] = _mm_andnot_si128(flat, ps[0]); + op0 = _mm_and_si128(flat, op0); + *p0 = _mm_or_si128(ps[0], op0); + + ps[1] = _mm_andnot_si128(flat, ps[1]); + op1 = _mm_and_si128(flat, op1); + *p1 = _mm_or_si128(ps[1], op1); + } else { + *q0 = qs[0]; + *q1 = qs[1]; + *p0 = ps[0]; + *p1 = ps[1]; + } } void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p, @@ -926,7 +1000,7 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( __m128i mask, hev, flat; __m128i pq[4]; __m128i p1p0, q1q0, ps1ps0, qs1qs0; - __m128i work_a, op2, oq2, flat_p1p0, flat_q0q1; + __m128i work_a, opq2, flat_p1p0, flat_q0q1; pq[0] = _mm_unpacklo_epi64(*p0, *q0); pq[1] = _mm_unpacklo_epi64(*p1, *q1); @@ -944,6 +1018,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, &thresh, &hev, &mask); + // lp filter + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); + // flat_mask4 flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0])); flat = _mm_max_epi16(abs_p1p0, flat); @@ -956,15 +1033,15 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( // replicate for the further "merged variables" usage flat = _mm_unpacklo_epi64(flat, flat); - { - __m128i workp_a, workp_b, workp_shft0, workp_shft1; + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1; // Added before shift for rounding part of ROUND_POWER_OF_TWO // o*p2 workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1)); workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0); - workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); - op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); + workp_c = _mm_add_epi16(workp_a, workp_c); // o*p1 workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1); @@ -992,27 +1069,22 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( // oq2 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); - oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - } + workp_a = _mm_add_epi16(workp_a, workp_b); + opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3); - // lp filter - highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd); - - qs1qs0 = _mm_andnot_si128(flat, qs1qs0); - q1q0 = _mm_and_si128(flat, flat_q0q1); - *q1q0_out = _mm_or_si128(qs1qs0, q1q0); - - ps1ps0 = _mm_andnot_si128(flat, ps1ps0); - p1p0 = _mm_and_si128(flat, flat_p1p0); - *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); - work_a = _mm_andnot_si128(flat, *q2); - *q2 = _mm_and_si128(flat, oq2); - *q2 = _mm_or_si128(work_a, *q2); + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); - work_a = _mm_andnot_si128(flat, *p2); - *p2 = _mm_and_si128(flat, op2); - *p2 = _mm_or_si128(work_a, *p2); + work_a = _mm_andnot_si128(flat, pq[2]); + *p2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_a, *p2); + *q2 = _mm_srli_si128(*p2, 8); + } } static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2( @@ -1058,17 +1130,28 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2( mask = _mm_subs_epu16(mask, limit0); mask = _mm_cmpeq_epi16(mask, zero); + // lp filter + __m128i ps[2], qs[2], p[2], q[2]; + { + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + // filter_mask and hev_mask + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + } + flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0)); flat = _mm_max_epi16(work1, flat); work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0)); flat = _mm_max_epi16(work0, flat); flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); - flat = _mm_cmpeq_epi16(flat, zero); flat = _mm_and_si128(flat, mask); // flat & mask - { + // filter8 need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { __m128i workp_a, workp_b; // Added before shift for rounding part of ROUND_POWER_OF_TWO @@ -1101,42 +1184,36 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2( workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - } - // lp filter - __m128i ps[2], qs[2], p[2], q[2]; - { - p[0] = *p0; - p[1] = *p1; - q[0] = *q0; - q[1] = *q1; - // filter_mask and hev_mask - highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + qs[0] = _mm_andnot_si128(flat, qs[0]); + oq0 = _mm_and_si128(flat, oq0); + *q0 = _mm_or_si128(qs[0], oq0); + + qs[1] = _mm_andnot_si128(flat, qs[1]); + oq1 = _mm_and_si128(flat, oq1); + *q1 = _mm_or_si128(qs[1], oq1); + + ps[0] = _mm_andnot_si128(flat, ps[0]); + op0 = _mm_and_si128(flat, op0); + *p0 = _mm_or_si128(ps[0], op0); + + ps[1] = _mm_andnot_si128(flat, ps[1]); + op1 = _mm_and_si128(flat, op1); + *p1 = _mm_or_si128(ps[1], op1); + + work_a = _mm_andnot_si128(flat, *q2); + *q2 = _mm_and_si128(flat, oq2); + *q2 = _mm_or_si128(work_a, *q2); + + work_a = _mm_andnot_si128(flat, *p2); + *p2 = _mm_and_si128(flat, op2); + *p2 = _mm_or_si128(work_a, *p2); + } else { + *q0 = qs[0]; + *q1 = qs[1]; + *p0 = ps[0]; + *p1 = ps[1]; } - - qs[0] = _mm_andnot_si128(flat, qs[0]); - oq0 = _mm_and_si128(flat, oq0); - *q0 = _mm_or_si128(qs[0], oq0); - - qs[1] = _mm_andnot_si128(flat, qs[1]); - oq1 = _mm_and_si128(flat, oq1); - *q1 = _mm_or_si128(qs[1], oq1); - - ps[0] = _mm_andnot_si128(flat, ps[0]); - op0 = _mm_and_si128(flat, op0); - *p0 = _mm_or_si128(ps[0], op0); - - ps[1] = _mm_andnot_si128(flat, ps[1]); - op1 = _mm_and_si128(flat, op1); - *p1 = _mm_or_si128(ps[1], op1); - - work_a = _mm_andnot_si128(flat, *q2); - *q2 = _mm_and_si128(flat, oq2); - *q2 = _mm_or_si128(work_a, *q2); - - work_a = _mm_andnot_si128(flat, *p2); - *p2 = _mm_and_si128(flat, op2); - *p2 = _mm_or_si128(work_a, *p2); } void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c index dea113a29..b9689202a 100644 --- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c @@ -110,7 +110,7 @@ static INLINE void quantize(const __m256i *qp, __m256i *c, } void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, @@ -120,12 +120,23 @@ void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, (void)scan; const unsigned int step = 8; - if (LIKELY(!skip_block)) { - __m256i qp[5], coeff; - init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp); - coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + __m256i qp[5], coeff; + init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp); + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + + __m256i eob = _mm256_setzero_si256(); + quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); - __m256i eob = _mm256_setzero_si256(); + while (n_coeffs > 0) { + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; @@ -133,40 +144,17 @@ void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff_ptr += step; iscan += step; n_coeffs -= step; - - update_qp(qp); - - while (n_coeffs > 0) { - coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); - quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); - - coeff_ptr += step; - qcoeff_ptr += step; - dqcoeff_ptr += step; - iscan += step; - n_coeffs -= step; - } - { - __m256i eob_s; - eob_s = _mm256_shuffle_epi32(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 1); - eob = _mm256_max_epi16(eob, eob_s); - const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), - _mm256_extractf128_si256(eob, 1)); - *eob_ptr = _mm_extract_epi16(final_eob, 0); - } - } else { - do { - const __m256i zero = _mm256_setzero_si256(); - _mm256_storeu_si256((__m256i *)qcoeff_ptr, zero); - _mm256_storeu_si256((__m256i *)dqcoeff_ptr, zero); - qcoeff_ptr += step; - dqcoeff_ptr += step; - n_coeffs -= step; - } while (n_coeffs > 0); - *eob_ptr = 0; + } + { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); + const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), + _mm256_extractf128_si256(eob, 1)); + *eob_ptr = _mm_extract_epi16(final_eob, 0); } } diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c index 5570ca5b7..58e5f98e5 100644 --- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c @@ -16,7 +16,7 @@ #include "aom_ports/mem.h" void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, - int skip_block, const int16_t *zbin_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, @@ -41,50 +41,48 @@ void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = ((int)count / 4) - 1; i >= 0; i--) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (test == 0xffff) - non_zero_regs--; - else - break; - } + // Pre-scan pass + for (i = ((int)count / 4) - 1; i >= 0; i--) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (test == 0xffff) + non_zero_regs--; + else + break; + } - // Quantization pass: - for (i = 0; i < non_zero_regs; i++) { - __m128i coeffs, coeffs_sign, tmp1, tmp2; - int test; - int abs_coeff[4]; - int coeff_sign[4]; - - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - coeffs_sign = _mm_srai_epi32(coeffs, 31); - coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); - tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); - tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); - tmp1 = _mm_or_si128(tmp1, tmp2); - test = _mm_movemask_epi8(tmp1); - _mm_storeu_si128((__m128i *)abs_coeff, coeffs); - _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); - - for (j = 0; j < 4; j++) { - if (test & (1 << (4 * j))) { - int k = 4 * i + j; - const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; - const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; - const uint32_t abs_qcoeff = - (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); - qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; - dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; - if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; - } + // Quantization pass: + for (i = 0; i < non_zero_regs; i++) { + __m128i coeffs, coeffs_sign, tmp1, tmp2; + int test; + int abs_coeff[4]; + int coeff_sign[4]; + + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + coeffs_sign = _mm_srai_epi32(coeffs, 31); + coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); + tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); + tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); + tmp1 = _mm_or_si128(tmp1, tmp2); + test = _mm_movemask_epi8(tmp1); + _mm_storeu_si128((__m128i *)abs_coeff, coeffs); + _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); + + for (j = 0; j < 4; j++) { + if (test & (1 << (4 * j))) { + int k = 4 * i + j; + const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; + const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; + const uint32_t abs_qcoeff = + (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); + qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; + dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; + if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; } } } @@ -92,8 +90,8 @@ void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, } void aom_highbd_quantize_b_32x32_sse2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -116,38 +114,35 @@ void aom_highbd_quantize_b_32x32_sse2( memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs / 4; i++) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (!(test & 0xf)) idx_arr[idx++] = i * 4; - if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; - if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; - if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; - } + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = idx_arr[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; - } + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; } *eob_ptr = eob + 1; } diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c new file mode 100644 index 000000000..9b1b4c9de --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <immintrin.h> // AVX2 + +#include "config/aom_dsp_rtcd.h" + +typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum) { + __m256i v_sum_d = _mm256_setzero_si256(); + __m256i v_sse_d = _mm256_setzero_si256(); + for (int i = 0; i < 8; i += 2) { + const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src); + const __m128i v_p_a1 = _mm_loadu_si128((const __m128i *)(src + src_stride)); + const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref); + const __m128i v_p_b1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride)); + __m256i v_p_a = _mm256_castsi128_si256(v_p_a0); + __m256i v_p_b = _mm256_castsi128_si256(v_p_b0); + v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1); + v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1); + const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b); + const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff); + v_sum_d = _mm256_add_epi16(v_sum_d, v_diff); + v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff); + src += src_stride * 2; + ref += ref_stride * 2; + } + __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d)); + __m256i v_sum01 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1)); + __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01); + __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d); + __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d); + __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); + const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); + const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); + __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); + v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); + *sum = _mm_extract_epi32(v_d, 0); + *sse = _mm_extract_epi32(v_d, 1); +} + +void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum) { + __m256i v_sum_d = _mm256_setzero_si256(); + __m256i v_sse_d = _mm256_setzero_si256(); + const __m256i one = _mm256_set1_epi16(1); + for (int i = 0; i < 16; ++i) { + const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src); + const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref); + const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b); + const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff); + v_sum_d = _mm256_add_epi16(v_sum_d, v_diff); + v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff); + src += src_stride; + ref += ref_stride; + } + __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one); + __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d); + __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d); + __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); + const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); + const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); + __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); + v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); + *sum = _mm_extract_epi32(v_d, 0); + *sse = _mm_extract_epi32(v_d, 1); +} + +static void highbd_10_variance_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int32_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 2); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); +} + +#define VAR_FN(w, h, block_size, shift) \ + uint32_t aom_highbd_10_variance##w##x##h##_avx2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_avx2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +VAR_FN(128, 128, 16, 14); +VAR_FN(128, 64, 16, 13); +VAR_FN(64, 128, 16, 13); +VAR_FN(64, 64, 16, 12); +VAR_FN(64, 32, 16, 11); +VAR_FN(32, 64, 16, 11); +VAR_FN(32, 32, 16, 10); +VAR_FN(32, 16, 16, 9); +VAR_FN(16, 32, 16, 9); +VAR_FN(16, 16, 16, 8); +VAR_FN(16, 8, 8, 7); +VAR_FN(8, 16, 8, 7); +VAR_FN(8, 8, 8, 6); +VAR_FN(16, 4, 16, 6); +VAR_FN(8, 32, 8, 8); +VAR_FN(32, 8, 8, 8); +VAR_FN(16, 64, 16, 10); +VAR_FN(64, 16, 16, 10); + +#undef VAR_FN diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c index 131c16aa9..47b052abc 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c @@ -179,6 +179,9 @@ HIGH_GET_VAR(8); return (var >= 0) ? (uint32_t)var : 0; \ } +VAR_FN(128, 128, 16, 14); +VAR_FN(128, 64, 16, 13); +VAR_FN(64, 128, 16, 13); VAR_FN(64, 64, 16, 12); VAR_FN(64, 32, 16, 11); VAR_FN(32, 64, 16, 11); @@ -590,10 +593,10 @@ FNS(sse2); void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, - uint16_t *comp_pred, int width, int height, + uint8_t *comp_pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, - const uint8_t *ref8, int ref_stride, - int bd) { + const uint8_t *ref8, int ref_stride, int bd, + int subpel_search) { // expect xd == NULL only in tests if (xd != NULL) { const MB_MODE_INFO *mi = xd->mi[0]; @@ -606,8 +609,6 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, if (is_scaled) { // Note: This is mostly a copy from the >=8X8 case in // build_inter_predictors() function, with some small tweaks. - uint8_t *comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred); - // Some assumptions. const int plane = 0; @@ -661,7 +662,7 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; // Get convolve parameters. - ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd); + ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); const InterpFilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); @@ -677,10 +678,13 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, } const InterpFilterParams *filter = - av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); + (subpel_search == 1) + ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR) + : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); if (!subpel_x_q3 && !subpel_y_q3) { uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); if (width >= 8) { int i; assert(!(width & 7)); @@ -711,13 +715,13 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, } else if (!subpel_y_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - aom_highbd_convolve8_horiz(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred), - width, kernel, 16, NULL, -1, width, height, bd); + aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16, + NULL, -1, width, height, bd); } else if (!subpel_x_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred), - width, NULL, -1, kernel, 16, width, height, bd); + aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1, + kernel, 16, width, height, bd); } else { DECLARE_ALIGNED(16, uint16_t, temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); @@ -734,30 +738,29 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, intermediate_height, bd); aom_highbd_convolve8_vert( CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), - MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y, - 16, width, height, bd); + MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height, + bd); } } void aom_highbd_comp_avg_upsampled_pred_sse2( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd) { - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - int n; - int i; - aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, + int ref_stride, int bd, int subpel_search) { + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, - bd); + bd, subpel_search); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8); /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/ assert(!(width * height & 7)); - n = width * height >> 3; - for (i = 0; i < n; i++) { - __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred); + int n = width * height >> 3; + for (int i = 0; i < n; i++) { + __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16); __m128i p0 = _mm_loadu_si128((const __m128i *)pred); - _mm_storeu_si128((__m128i *)comp_pred, _mm_avg_epu16(s0, p0)); - comp_pred += 8; + _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0)); + comp_pred16 += 8; pred += 8; } } @@ -777,7 +780,7 @@ static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1, xx_storeu_128(result, shift); } -void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred, +void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, @@ -792,6 +795,7 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred, _mm_set_epi16(round, round, round, round, round, round, round, round); uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); if (width >= 8) { // Read 8 pixels one row at a time @@ -830,15 +834,16 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred, void aom_highbd_jnt_comp_avg_upsampled_pred_sse2( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param) { + int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, + int subpel_search) { uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); int n; int i; - aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, - bd); + bd, subpel_search); assert(!(width * height & 7)); n = width * height >> 3; @@ -850,13 +855,14 @@ void aom_highbd_jnt_comp_avg_upsampled_pred_sse2( const __m128i r = _mm_set_epi16(round, round, round, round, round, round, round, round); + uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8); for (i = 0; i < n; i++) { - __m128i p0 = xx_loadu_128(comp_pred); + __m128i p0 = xx_loadu_128(comp_pred16); __m128i p1 = xx_loadu_128(pred); - highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); + highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16); - comp_pred += 8; + comp_pred16 += 8; pred += 8; } } diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c index 6c247a91b..df5449a9d 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c @@ -168,8 +168,8 @@ uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1( aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, bilinear_filters_2t[yoffset]); - aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), - 4); + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride, sse); @@ -188,8 +188,8 @@ uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1( aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, bilinear_filters_2t[yoffset]); - aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), - 4); + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride, sse); @@ -208,8 +208,8 @@ uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1( aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, bilinear_filters_2t[yoffset]); - aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), - 4); + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride, sse); diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c index eaf1f347b..f9a41a210 100644 --- a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c +++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c @@ -120,11 +120,11 @@ void aom_jnt_comp_avg_upsampled_pred_ssse3( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const JNT_COMP_PARAMS *jcp_param) { + int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) { int n; int i; aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, - subpel_x_q3, subpel_y_q3, ref, ref_stride); + subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ assert(!(width * height & 15)); n = width * height >> 4; diff --git a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c deleted file mode 100644 index 18862dd3e..000000000 --- a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c +++ /dev/null @@ -1,916 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <immintrin.h> /* AVX2 */ - -#include "config/aom_dsp_rtcd.h" - -#include "aom_ports/mem.h" - -void aom_lpf_horizontal_16_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; - __m128i abs_p1p0; - - const __m128i thresh = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); - const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); - const __m128i blimit = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); - - q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); - q4p4 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); - q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); - q3p3 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); - q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); - q2p2 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); - q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); - q1p1 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); - p1q1 = _mm_shuffle_epi32(q1p1, 78); - q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); - q0p0 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); - p0q0 = _mm_shuffle_epi32(q0p0, 78); - - { - __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; - abs_p1p0 = - _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1)); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - fe = _mm_set1_epi8(0xfe); - ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - abs_p0q0 = - _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0)); - abs_p1q1 = - _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1)); - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)), - _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i t1 = _mm_set1_epi16(0x1); - __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); - __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); - __m128i qs0 = _mm_xor_si128(p0q0, t80); - __m128i qs1 = _mm_xor_si128(p1q1, t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; - __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; - - filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, qs0ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - /* (aom_filter + 3 * (qs0 - ps0)) & mask */ - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - filter1 = _mm_unpacklo_epi8(zero, filter1); - filter1 = _mm_srai_epi16(filter1, 0xB); - filter2 = _mm_unpacklo_epi8(zero, filter2); - filter2 = _mm_srai_epi16(filter2, 0xB); - - /* Filter1 >> 3 */ - filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); - qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); - - /* filt >> 1 */ - filt = _mm_adds_epi16(filter1, t1); - filt = _mm_srai_epi16(filt, 1); - filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), - filt); - filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); - qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); - // loopfilter done - - { - __m128i work; - flat = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)), - _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3))); - flat = _mm_max_epu8(abs_p1p0, flat); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); - q5p5 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); - - q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); - q6p6 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); - - flat2 = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)), - _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5))); - - q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); - q7p7 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); - - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)), - _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7))); - - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { - const __m128i eight = _mm_set1_epi16(8); - const __m128i four = _mm_set1_epi16(4); - __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; - __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; - __m128i pixelFilter_p, pixelFilter_q; - __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; - __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; - - p7_16 = _mm_unpacklo_epi8(q7p7, zero); - p6_16 = _mm_unpacklo_epi8(q6p6, zero); - p5_16 = _mm_unpacklo_epi8(q5p5, zero); - p4_16 = _mm_unpacklo_epi8(q4p4, zero); - p3_16 = _mm_unpacklo_epi8(q3p3, zero); - p2_16 = _mm_unpacklo_epi8(q2p2, zero); - p1_16 = _mm_unpacklo_epi8(q1p1, zero); - p0_16 = _mm_unpacklo_epi8(q0p0, zero); - q0_16 = _mm_unpackhi_epi8(q0p0, zero); - q1_16 = _mm_unpackhi_epi8(q1p1, zero); - q2_16 = _mm_unpackhi_epi8(q2p2, zero); - q3_16 = _mm_unpackhi_epi8(q3p3, zero); - q4_16 = _mm_unpackhi_epi8(q4p4, zero); - q5_16 = _mm_unpackhi_epi8(q5p5, zero); - q6_16 = _mm_unpackhi_epi8(q6p6, zero); - q7_16 = _mm_unpackhi_epi8(q7p7, zero); - - pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), - _mm_add_epi16(p4_16, p3_16)); - pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), - _mm_add_epi16(q4_16, q3_16)); - - pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); - pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); - pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = - _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16( - four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4); - flat2_q0p0 = _mm_packus_epi16(res_p, res_q); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); - - flat_q0p0 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(p7_16, p7_16); - sum_q7 = _mm_add_epi16(q7_16, q7_16); - sum_p3 = _mm_add_epi16(p3_16, p3_16); - sum_q3 = _mm_add_epi16(q3_16, q3_16); - - pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4); - flat2_q1p1 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); - flat_q1p1 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - sum_p3 = _mm_add_epi16(sum_p3, p3_16); - sum_q3 = _mm_add_epi16(sum_q3, q3_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4); - flat2_q2p2 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); - flat_q2p2 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4); - flat2_q3p3 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4); - flat2_q4p4 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4); - flat2_q5p5 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4); - flat2_q6p6 = _mm_packus_epi16(res_p, res_q); - } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - flat = _mm_shuffle_epi32(flat, 68); - flat2 = _mm_shuffle_epi32(flat2, 68); - - q2p2 = _mm_andnot_si128(flat, q2p2); - flat_q2p2 = _mm_and_si128(flat, flat_q2p2); - q2p2 = _mm_or_si128(q2p2, flat_q2p2); - - qs1ps1 = _mm_andnot_si128(flat, qs1ps1); - flat_q1p1 = _mm_and_si128(flat, flat_q1p1); - q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); - - qs0ps0 = _mm_andnot_si128(flat, qs0ps0); - flat_q0p0 = _mm_and_si128(flat, flat_q0p0); - q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); - - q6p6 = _mm_andnot_si128(flat2, q6p6); - flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); - q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); - - q5p5 = _mm_andnot_si128(flat2, q5p5); - flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); - q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); - - q4p4 = _mm_andnot_si128(flat2, q4p4); - flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); - q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); - - q3p3 = _mm_andnot_si128(flat2, q3p3); - flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); - q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); - - q2p2 = _mm_andnot_si128(flat2, q2p2); - flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); - q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); - - q1p1 = _mm_andnot_si128(flat2, q1p1); - flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); - q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); - - q0p0 = _mm_andnot_si128(flat2, q0p0); - flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); - q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); - } -} - -DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { - 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128, - 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 -}; - -void aom_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - __m128i p7, p6, p5; - __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; - __m128i q5, q6, q7; - __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4, - p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0; - - const __m128i thresh = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); - const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); - const __m128i blimit = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); - - p256_4 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p))); - p256_3 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p))); - p256_2 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p))); - p256_1 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p))); - p256_0 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p))); - q256_0 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p))); - q256_1 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p))); - q256_2 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p))); - q256_3 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p))); - q256_4 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p))); - - p4 = _mm256_castsi256_si128(p256_4); - p3 = _mm256_castsi256_si128(p256_3); - p2 = _mm256_castsi256_si128(p256_2); - p1 = _mm256_castsi256_si128(p256_1); - p0 = _mm256_castsi256_si128(p256_0); - q0 = _mm256_castsi256_si128(q256_0); - q1 = _mm256_castsi256_si128(q256_1); - q2 = _mm256_castsi256_si128(q256_2); - q3 = _mm256_castsi256_si128(q256_3); - q4 = _mm256_castsi256_si128(q256_4); - - { - const __m128i abs_p1p0 = - _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = - _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = - _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = - _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); - __m128i work; - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - - __m128i ps1 = _mm_xor_si128(p1, t80); - __m128i ps0 = _mm_xor_si128(p0, t80); - __m128i qs0 = _mm_xor_si128(q0, t80); - __m128i qs1 = _mm_xor_si128(q1, t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, - flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5, - flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - /* (aom_filter + 3 * (qs0 - ps0)) & mask */ - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - /* Filter1 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - - /* Filter2 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - - /* filt >> 1 */ - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - filt = _mm_andnot_si128(hev, filt); - ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - // loopfilter done - - { - __m128i work; - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)), - _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4))); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - p256_5 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s - 6 * p))); - q256_5 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s + 5 * p))); - p5 = _mm256_castsi256_si128(p256_5); - q5 = _mm256_castsi256_si128(q256_5); - flat2 = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)), - _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); - - flat2 = _mm_max_epu8(work, flat2); - p256_6 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s - 7 * p))); - q256_6 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s + 6 * p))); - p6 = _mm256_castsi256_si128(p256_6); - q6 = _mm256_castsi256_si128(q256_6); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)), - _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); - - flat2 = _mm_max_epu8(work, flat2); - - p256_7 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s - 8 * p))); - q256_7 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s + 7 * p))); - p7 = _mm256_castsi256_si128(p256_7); - q7 = _mm256_castsi256_si128(q256_7); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)), - _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); - - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { - const __m256i eight = _mm256_set1_epi16(8); - const __m256i four = _mm256_set1_epi16(4); - __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, - pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; - - const __m256i filter = - _mm256_load_si256((__m256i const *)filt_loopfilter_avx2); - p256_7 = _mm256_shuffle_epi8(p256_7, filter); - p256_6 = _mm256_shuffle_epi8(p256_6, filter); - p256_5 = _mm256_shuffle_epi8(p256_5, filter); - p256_4 = _mm256_shuffle_epi8(p256_4, filter); - p256_3 = _mm256_shuffle_epi8(p256_3, filter); - p256_2 = _mm256_shuffle_epi8(p256_2, filter); - p256_1 = _mm256_shuffle_epi8(p256_1, filter); - p256_0 = _mm256_shuffle_epi8(p256_0, filter); - q256_0 = _mm256_shuffle_epi8(q256_0, filter); - q256_1 = _mm256_shuffle_epi8(q256_1, filter); - q256_2 = _mm256_shuffle_epi8(q256_2, filter); - q256_3 = _mm256_shuffle_epi8(q256_3, filter); - q256_4 = _mm256_shuffle_epi8(q256_4, filter); - q256_5 = _mm256_shuffle_epi8(q256_5, filter); - q256_6 = _mm256_shuffle_epi8(q256_6, filter); - q256_7 = _mm256_shuffle_epi8(q256_7, filter); - - pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5), - _mm256_add_epi16(p256_4, p256_3)); - pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5), - _mm256_add_epi16(q256_4, q256_3)); - - pixetFilter_p2p1p0 = - _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1)); - pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = - _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1)); - pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - - pixelFilter_p = _mm256_add_epi16( - eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); - - pixetFilter_p2p1p0 = _mm256_add_epi16( - four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4); - - flat2_p0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4); - - flat2_q0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - res_p = - _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(p256_3, p256_0)), - 3); - - flat_p0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = - _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(q256_3, q256_0)), - 3); - - flat_q0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - sum_p7 = _mm256_add_epi16(p256_7, p256_7); - - sum_q7 = _mm256_add_epi16(q256_7, q256_7); - - sum_p3 = _mm256_add_epi16(p256_3, p256_3); - - sum_q3 = _mm256_add_epi16(q256_3, q256_3); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4); - - flat2_p1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4); - - flat2_q1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2); - - pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2); - - res_p = - _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(sum_p3, p256_1)), - 3); - - flat_p1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = - _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, - _mm256_add_epi16(sum_q3, q256_1)), - 3); - - flat_q1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - sum_p3 = _mm256_add_epi16(sum_p3, p256_3); - - sum_q3 = _mm256_add_epi16(sum_q3, q256_3); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4); - - flat2_p2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4); - - flat2_q2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1); - - pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1); - - res_p = - _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(sum_p3, p256_2)), - 3); - - flat_p2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = - _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, - _mm256_add_epi16(sum_q3, q256_2)), - 3); - - flat_q2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4); - - flat2_p3 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4); - - flat2_q3 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4); - - flat2_p4 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4); - - flat2_q4 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4); - - flat2_p5 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4); - - flat2_q5 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4); - - flat2_p6 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4); - - flat2_q6 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - } - - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - p2 = _mm_andnot_si128(flat, p2); - flat_p2 = _mm_and_si128(flat, flat_p2); - p2 = _mm_or_si128(flat_p2, p2); - - p1 = _mm_andnot_si128(flat, ps1); - flat_p1 = _mm_and_si128(flat, flat_p1); - p1 = _mm_or_si128(flat_p1, p1); - - p0 = _mm_andnot_si128(flat, ps0); - flat_p0 = _mm_and_si128(flat, flat_p0); - p0 = _mm_or_si128(flat_p0, p0); - - q0 = _mm_andnot_si128(flat, qs0); - flat_q0 = _mm_and_si128(flat, flat_q0); - q0 = _mm_or_si128(flat_q0, q0); - - q1 = _mm_andnot_si128(flat, qs1); - flat_q1 = _mm_and_si128(flat, flat_q1); - q1 = _mm_or_si128(flat_q1, q1); - - q2 = _mm_andnot_si128(flat, q2); - flat_q2 = _mm_and_si128(flat, flat_q2); - q2 = _mm_or_si128(flat_q2, q2); - - p6 = _mm_andnot_si128(flat2, p6); - flat2_p6 = _mm_and_si128(flat2, flat2_p6); - p6 = _mm_or_si128(flat2_p6, p6); - _mm_storeu_si128((__m128i *)(s - 7 * p), p6); - - p5 = _mm_andnot_si128(flat2, p5); - flat2_p5 = _mm_and_si128(flat2, flat2_p5); - p5 = _mm_or_si128(flat2_p5, p5); - _mm_storeu_si128((__m128i *)(s - 6 * p), p5); - - p4 = _mm_andnot_si128(flat2, p4); - flat2_p4 = _mm_and_si128(flat2, flat2_p4); - p4 = _mm_or_si128(flat2_p4, p4); - _mm_storeu_si128((__m128i *)(s - 5 * p), p4); - - p3 = _mm_andnot_si128(flat2, p3); - flat2_p3 = _mm_and_si128(flat2, flat2_p3); - p3 = _mm_or_si128(flat2_p3, p3); - _mm_storeu_si128((__m128i *)(s - 4 * p), p3); - - p2 = _mm_andnot_si128(flat2, p2); - flat2_p2 = _mm_and_si128(flat2, flat2_p2); - p2 = _mm_or_si128(flat2_p2, p2); - _mm_storeu_si128((__m128i *)(s - 3 * p), p2); - - p1 = _mm_andnot_si128(flat2, p1); - flat2_p1 = _mm_and_si128(flat2, flat2_p1); - p1 = _mm_or_si128(flat2_p1, p1); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - - p0 = _mm_andnot_si128(flat2, p0); - flat2_p0 = _mm_and_si128(flat2, flat2_p0); - p0 = _mm_or_si128(flat2_p0, p0); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - - q0 = _mm_andnot_si128(flat2, q0); - flat2_q0 = _mm_and_si128(flat2, flat2_q0); - q0 = _mm_or_si128(flat2_q0, q0); - _mm_storeu_si128((__m128i *)(s - 0 * p), q0); - - q1 = _mm_andnot_si128(flat2, q1); - flat2_q1 = _mm_and_si128(flat2, flat2_q1); - q1 = _mm_or_si128(flat2_q1, q1); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - - q2 = _mm_andnot_si128(flat2, q2); - flat2_q2 = _mm_and_si128(flat2, flat2_q2); - q2 = _mm_or_si128(flat2_q2, q2); - _mm_storeu_si128((__m128i *)(s + 2 * p), q2); - - q3 = _mm_andnot_si128(flat2, q3); - flat2_q3 = _mm_and_si128(flat2, flat2_q3); - q3 = _mm_or_si128(flat2_q3, q3); - _mm_storeu_si128((__m128i *)(s + 3 * p), q3); - - q4 = _mm_andnot_si128(flat2, q4); - flat2_q4 = _mm_and_si128(flat2, flat2_q4); - q4 = _mm_or_si128(flat2_q4, q4); - _mm_storeu_si128((__m128i *)(s + 4 * p), q4); - - q5 = _mm_andnot_si128(flat2, q5); - flat2_q5 = _mm_and_si128(flat2, flat2_q5); - q5 = _mm_or_si128(flat2_q5, q5); - _mm_storeu_si128((__m128i *)(s + 5 * p), q5); - - q6 = _mm_andnot_si128(flat2, q6); - flat2_q6 = _mm_and_si128(flat2, flat2_q6); - q6 = _mm_or_si128(flat2_q6, q6); - _mm_storeu_si128((__m128i *)(s + 6 * p), q6); - } - _mm256_zeroupper(); -} diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c index f1eac233b..9d88b5e49 100644 --- a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c +++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c @@ -249,6 +249,63 @@ static INLINE void transpose16x8_8x16_sse2( *d7 = _mm_unpackhi_epi64(w7, w15); } +// this function treats its input as 2 parallel 8x4 matrices, transposes each of +// them independently while flipping the second matrix horizontaly Used for 14 +// taps filter pq pairs inverse +static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, + __m128i *pq0, __m128i *pq1, + __m128i *pq2, __m128i *pq3) { + __m128i w10, w11, w12, w13; + __m128i w0, w1, w2, w3, w4, w5; + __m128i d0, d1, d2, d3; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + w2 = _mm_unpacklo_epi8( + *x4, *x5); // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + w3 = _mm_unpacklo_epi8( + *x6, *x7); // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + d0 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + d2 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + w10 = _mm_unpacklo_epi8( + *x7, *x6); // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13 + w11 = _mm_unpacklo_epi8( + *x5, *x4); // q xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33 + w12 = _mm_unpacklo_epi8( + *x3, *x2); // q xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53 + w13 = _mm_unpacklo_epi8( + *x1, *x0); // q xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73 + + w4 = _mm_unpackhi_epi16( + w10, w11); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpackhi_epi16( + w12, w13); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + d1 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + d3 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + *pq0 = _mm_unpacklo_epi64(d0, d1); // pq + *pq1 = _mm_unpackhi_epi64(d0, d1); // pq + *pq2 = _mm_unpacklo_epi64(d2, d3); // pq + *pq3 = _mm_unpackhi_epi64(d2, d3); // pq +} + static INLINE void transpose8x16_16x8_sse2( __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, @@ -300,9 +357,120 @@ static INLINE void transpose8x16_16x8_sse2( *d14d15 = _mm_unpackhi_epi64(w7, w15); } +// this function treats its input as 2 parallel 8x4 matrices, transposes each of +// them to 4x8 independently while flipping the second matrix horizontaly. Used +// for 14 taps pq pairs creation +static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *q0p0, + __m128i *q1p1, __m128i *q2p2, + __m128i *q3p3, __m128i *q4p4, + __m128i *q5p5, __m128i *q6p6, + __m128i *q7p7) { + __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3; + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + w2 = _mm_unpackhi_epi8( + *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115 + w3 = _mm_unpackhi_epi8( + *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315 + + ww0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ww1 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ww2 = _mm_unpacklo_epi16( + w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311 + ww3 = _mm_unpackhi_epi16( + w2, + w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315 + + *q7p7 = _mm_unpacklo_epi32( + ww0, + _mm_srli_si128( + ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx + *q6p6 = _mm_unpackhi_epi32( + _mm_slli_si128(ww0, 4), + ww3); // 01 11 21 31 014 114 214 314 xx xx xx xxxx xx xx xx + *q5p5 = _mm_unpackhi_epi32( + ww0, + _mm_slli_si128( + ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx x xx xx xx xxx + *q4p4 = _mm_unpacklo_epi32( + _mm_srli_si128(ww0, 12), + ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx + *q3p3 = _mm_unpacklo_epi32( + ww1, + _mm_srli_si128( + ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx + *q2p2 = _mm_unpackhi_epi32( + _mm_slli_si128(ww1, 4), + ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx + *q1p1 = _mm_unpackhi_epi32( + ww1, + _mm_slli_si128( + ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx + *q0p0 = _mm_unpacklo_epi32( + _mm_srli_si128(ww1, 12), + ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx +} + static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0, __m128i *hev, __m128i *mask, __m128i *qs1qs0, __m128i *ps1ps0) { + __m128i filter, filter2filter1, work; + __m128i ps1ps0_work, qs1qs0_work; + __m128i hev1; + const __m128i t3t4 = + _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i ff = _mm_cmpeq_epi8(t80, t80); + + ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */ + qs1qs0_work = _mm_xor_si128(*q1q0, t80); + + /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ + work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work); + filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev); + /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ + filter = _mm_and_si128(filter, *mask); /* & mask */ + filter = _mm_unpacklo_epi32(filter, filter); + + /* filter1 = signed_char_clamp(filter + 4) >> 3; */ + /* filter2 = signed_char_clamp(filter + 3) >> 3; */ + filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ + filter2filter1 = + _mm_unpacklo_epi8(filter2filter1, filter2filter1); // goto 16 bit + filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ + filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1); + + /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ + filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ + filter = _mm_unpacklo_epi8(filter, filter); // goto 16 bit + filter = _mm_srai_epi16(filter, 9); /* round */ + filter = _mm_packs_epi16(filter, filter); + filter = _mm_andnot_si128(*hev, filter); + filter = _mm_unpacklo_epi32(filter, filter); + + filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter); + hev1 = _mm_srli_si128(filter2filter1, 8); + /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ + qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1); + /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ + ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1); + + *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */ + *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */ +} + +static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0, + __m128i *hev, __m128i *mask, + __m128i *qs1qs0, + __m128i *ps1ps0) { const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); const __m128i t80 = _mm_set1_epi8(0x80); @@ -356,6 +524,49 @@ static AOM_FORCE_INLINE void lpf_internal_4_sse2( __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) { __m128i q1p1, q0p0, p1p0, q1q0; __m128i abs_p0q0, abs_p1q1; + __m128i mask, flat, hev; + const __m128i zero = _mm_setzero_si128(); + + q1p1 = _mm_unpacklo_epi32(*p1, *q1); + q0p0 = _mm_unpacklo_epi32(*p0, *q0); + + p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); + q1q0 = _mm_srli_si128(p1p0, 8); + + /* (abs(q1 - q0), abs(p1 - p0) */ + flat = abs_diff(q1p1, q0p0); + /* abs(p1 - q1), abs(p0 - q0) */ + __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); + + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + hev = _mm_unpacklo_epi8(flat, zero); + + hev = _mm_cmpgt_epi16(hev, *thresh); + hev = _mm_packs_epi16(hev, hev); + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ + abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4); /* abs(p1 - q1) */ + abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); + abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ + + mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); + mask = _mm_unpacklo_epi32(mask, flat); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4)); + + filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); +} + +static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit, + __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) { + __m128i q1p1, q0p0, p1p0, q1q0; + __m128i abs_p0q0, abs_p1q1; __m128i mask, hev; const __m128i zero = _mm_setzero_si128(); @@ -390,14 +601,14 @@ static AOM_FORCE_INLINE void lpf_internal_4_sse2( mask = _mm_cmpeq_epi8(mask, zero); mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); - filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); + filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); } void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh) { const __m128i zero = _mm_setzero_si128(); - __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), + __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit), _mm_loadl_epi64((const __m128i *)_limit)); __m128i thresh = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); @@ -413,9 +624,9 @@ void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0); xx_storel_32(s - 1 * p, ps1ps0); - xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 8)); + xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4)); xx_storel_32(s + 0 * p, qs1qs0); - xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 8)); + xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4)); } void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, @@ -425,7 +636,7 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, __m128i p1, p0, q0, q1; const __m128i zero = _mm_setzero_si128(); - __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), + __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit), _mm_loadl_epi64((const __m128i *)_limit)); __m128i thresh = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); @@ -442,8 +653,8 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0); // Transpose 8x4 to 4x8 - p1 = _mm_srli_si128(p1p0, 8); - q1 = _mm_srli_si128(q1q0, 8); + p1 = _mm_srli_si128(p1p0, 4); + q1 = _mm_srli_si128(q1q0, 4); transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3); @@ -455,10 +666,10 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) { xx_storel_32(s - (num + 1) * p, x); - xx_storel_32(s + num * p, _mm_srli_si128(x, 8)); + xx_storel_32(s + num * p, _mm_srli_si128(x, 4)); } -static AOM_FORCE_INLINE void lpf_internal_14_sse2( +static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2( __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2, __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit, __m128i *thresh) { @@ -503,38 +714,31 @@ static AOM_FORCE_INLINE void lpf_internal_14_sse2( mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); - // replicate for the further "merged variables" usage - mask = _mm_unpacklo_epi64(mask, mask); } // lp filter - the same for 6, 8 and 14 versions - filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0); qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0); // loopfilter done __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; - { - __m128i work; - flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); - flat = _mm_max_epu8(abs_p1p0, flat); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); - work = abs_diff(*q6p6, *q0p0); - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { + __m128i work; + flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; @@ -619,137 +823,413 @@ static AOM_FORCE_INLINE void lpf_internal_14_sse2( _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); flat_q1p1 = _mm_packus_epi16(res_p, res_q); - sum_p6 = _mm_add_epi16(sum_p6, p6_16); - sum_q6 = _mm_add_epi16(sum_q6, q6_16); - sum_p3 = _mm_add_epi16(sum_p3, p3_16); - sum_q3 = _mm_add_epi16(sum_q3, q3_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))), - 4); - flat2_q2p2 = _mm_packus_epi16(res_p, res_q); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + res_p = _mm_srli_epi16( _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); res_q = _mm_srli_epi16( _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); flat_q2p2 = _mm_packus_epi16(res_p, res_q); - sum_p6 = _mm_add_epi16(sum_p6, p6_16); - sum_q6 = _mm_add_epi16(sum_q6, q6_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))), - 4); - flat2_q3p3 = _mm_packus_epi16(res_p, res_q); - - sum_p6 = _mm_add_epi16(sum_p6, p6_16); - sum_q6 = _mm_add_epi16(sum_q6, q6_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))), - 4); - flat2_q4p4 = _mm_packus_epi16(res_p, res_q); - - sum_p6 = _mm_add_epi16(sum_p6, p6_16); - sum_q6 = _mm_add_epi16(sum_q6, q6_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + // work with flat2 + flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); + work = abs_diff(*q6p6, *q0p0); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - res_p = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))), - 4); - flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat = _mm_unpacklo_epi64(flat, flat); + *q2p2 = _mm_andnot_si128(flat, *q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + *q2p2 = _mm_or_si128(*q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))), + 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))), + 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))), + 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))), + 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat2 = _mm_unpacklo_epi64(flat2, flat2); + + *q5p5 = _mm_andnot_si128(flat2, *q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5); + + *q4p4 = _mm_andnot_si128(flat2, *q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4); + + *q3p3 = _mm_andnot_si128(flat2, *q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3); + + *q2p2 = _mm_andnot_si128(flat2, *q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2); + + *q1p1 = _mm_andnot_si128(flat2, *q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1); + + *q0p0 = _mm_andnot_si128(flat2, *q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0); + } + } else { + *q0p0 = qs0ps0; + *q1p1 = qs1ps1; } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +} - flat = _mm_shuffle_epi32(flat, 68); - flat2 = _mm_shuffle_epi32(flat2, 68); +static AOM_FORCE_INLINE void lpf_internal_14_sse2( + __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2, + __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi8(1); + __m128i mask, hev, flat, flat2; + __m128i flat2_pq[6], flat_pq[3]; + __m128i qs0ps0, qs1ps1; + __m128i p1p0, q1q0, qs1qs0, ps1ps0; + __m128i abs_p1p0; - *q2p2 = _mm_andnot_si128(flat, *q2p2); - flat_q2p2 = _mm_and_si128(flat, flat_q2p2); - *q2p2 = _mm_or_si128(*q2p2, flat_q2p2); + p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1); + q1q0 = _mm_srli_si128(p1p0, 8); - qs1ps1 = _mm_andnot_si128(flat, qs1ps1); - flat_q1p1 = _mm_and_si128(flat, flat_q1p1); - *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + __m128i fe, ff, work; + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0; + abs_p1p0 = abs_diff(*q1p1, *q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); + fe = _mm_set1_epi8(0xfe); + ff = _mm_cmpeq_epi8(fe, fe); + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); - qs0ps0 = _mm_andnot_si128(flat, qs0ps0); - flat_q0p0 = _mm_and_si128(flat, flat_q0p0); - *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - *q5p5 = _mm_andnot_si128(flat2, *q5p5); - flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); - *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi32(hev, hev); - *q4p4 = _mm_andnot_si128(flat2, *q4p4); - flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); - *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4); + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_unpacklo_epi32(mask, zero); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; - *q3p3 = _mm_andnot_si128(flat2, *q3p3); - flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); - *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3); + work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + } - *q2p2 = _mm_andnot_si128(flat2, *q2p2); - flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); - *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2); + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0); + qs1ps1 = _mm_srli_si128(qs0ps0, 8); + // loopfilter done - *q1p1 = _mm_andnot_si128(flat2, *q1p1); - flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); - *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1); + flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + flat = _mm_unpacklo_epi32(flat, flat); + flat = _mm_unpacklo_epi64(flat, flat); + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pq_16[7]; + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i sum_p6; + __m128i sum_p3; + + pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero); + pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero); + pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero); + pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero); + pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero); + pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero); + pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero); + q0_16 = _mm_srli_si128(pq_16[0], 8); + q1_16 = _mm_srli_si128(pq_16[1], 8); + q2_16 = _mm_srli_si128(pq_16[2], 8); + q3_16 = _mm_srli_si128(pq_16[3], 8); + q4_16 = _mm_srli_si128(pq_16[4], 8); + q5_16 = _mm_srli_si128(pq_16[5], 8); + + __m128i flat_p[3], flat_q[3]; + __m128i flat2_p[6], flat2_q[6]; + + __m128i work0, work0_0, work0_1, sum_p_0; + __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3])); + __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1])); + sum_p = _mm_add_epi16(sum_p, sum_lp); + + __m128i sum_lq = _mm_srli_si128(sum_lp, 8); + __m128i sum_q = _mm_srli_si128(sum_p, 8); + + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); + + flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0])); + flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16)); + + sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]); + sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]); + + sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]); + sum_p = _mm_sub_epi16(sum_p_0, q5_16); + + work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]); + work0_1 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0]))); + + sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]); + sum_lp = _mm_sub_epi16(sum_lp, q2_16); + + work0 = _mm_add_epi16(sum_p3, pq_16[1]); + flat_p[1] = _mm_add_epi16(sum_lp, work0); + flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + + flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3); + flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3); + flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]); + flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]); + + sum_lp = _mm_sub_epi16(sum_lp, q1_16); + sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]); + + sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]); + work0 = _mm_add_epi16(sum_p3, pq_16[2]); + + flat_p[2] = _mm_add_epi16(sum_lp, work0); + flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3); + flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]); + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); - *q0p0 = _mm_andnot_si128(flat2, *q0p0); - flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); - *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0); + work = abs_diff(*q6p6, *q0p0); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + flat2 = _mm_unpacklo_epi32(flat2, flat2); + + // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_pq[0] = _mm_and_si128(flat, flat_pq[0]); + *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_pq[1] = _mm_and_si128(flat, flat_pq[1]); + *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]); + + *q2p2 = _mm_andnot_si128(flat, *q2p2); + flat_pq[2] = _mm_and_si128(flat, flat_pq[2]); + *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]); + + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { + flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16)); + flat2_q[0] = _mm_add_epi16( + sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0])); + + flat2_p[1] = _mm_add_epi16(sum_p, work0_1); + flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8)); + + flat2_pq[0] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); + flat2_pq[1] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); + flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]); + flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]); + + sum_p = _mm_sub_epi16(sum_p, q4_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1]))); + flat2_p[2] = _mm_add_epi16(sum_p, work0); + flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[2] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); + flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + sum_p = _mm_sub_epi16(sum_p, q3_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[3]); + + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2]))); + flat2_p[3] = _mm_add_epi16(sum_p, work0); + flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[3] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); + flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + sum_p = _mm_sub_epi16(sum_p, q2_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[2]); + + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3]))); + flat2_p[4] = _mm_add_epi16(sum_p, work0); + flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[4] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); + flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + sum_p = _mm_sub_epi16(sum_p, q1_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[1]); + + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4]))); + flat2_p[5] = _mm_add_epi16(sum_p, work0); + flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[5] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); + flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]); + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + *q0p0 = _mm_andnot_si128(flat2, *q0p0); + flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]); + *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]); + + *q1p1 = _mm_andnot_si128(flat2, *q1p1); + flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]); + *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]); + + *q2p2 = _mm_andnot_si128(flat2, *q2p2); + flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]); + *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]); + + *q3p3 = _mm_andnot_si128(flat2, *q3p3); + flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]); + *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]); + + *q4p4 = _mm_andnot_si128(flat2, *q4p4); + flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]); + *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]); + + *q5p5 = _mm_andnot_si128(flat2, *q5p5); + flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]); + *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]); + } + } else { + *q0p0 = qs0ps0; + *q1p1 = qs1ps1; + } } void aom_lpf_horizontal_14_sse2(unsigned char *s, int p, @@ -761,22 +1241,22 @@ void aom_lpf_horizontal_14_sse2(unsigned char *s, int p, __m128i limit = _mm_load_si128((const __m128i *)_limit); __m128i thresh = _mm_load_si128((const __m128i *)_thresh); - q4p4 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 5 * p)), + q4p4 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 5 * p)), _mm_cvtsi32_si128(*(int *)(s + 4 * p))); - q3p3 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 4 * p)), + q3p3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 4 * p)), _mm_cvtsi32_si128(*(int *)(s + 3 * p))); - q2p2 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 3 * p)), + q2p2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 3 * p)), _mm_cvtsi32_si128(*(int *)(s + 2 * p))); - q1p1 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 2 * p)), + q1p1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 2 * p)), _mm_cvtsi32_si128(*(int *)(s + 1 * p))); - q0p0 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 1 * p)), + q0p0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 1 * p)), _mm_cvtsi32_si128(*(int *)(s - 0 * p))); - q5p5 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 6 * p)), + q5p5 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 6 * p)), _mm_cvtsi32_si128(*(int *)(s + 5 * p))); - q6p6 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 7 * p)), + q6p6 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 7 * p)), _mm_cvtsi32_si128(*(int *)(s + 6 * p))); lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, @@ -790,7 +1270,7 @@ void aom_lpf_horizontal_14_sse2(unsigned char *s, int p, store_buffer_horz_8(q5p5, p, 5, s); } -static AOM_FORCE_INLINE void lpf_internal_6_sse2( +static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2( __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit, __m128i *thresh) { @@ -810,6 +1290,7 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2( const __m128i one = _mm_set1_epi8(1); const __m128i fe = _mm_set1_epi8(0xfe); const __m128i ff = _mm_cmpeq_epi8(fe, fe); + { // filter_mask and hev_mask __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; @@ -847,8 +1328,9 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2( mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); - // replicate for the further "merged variables" usage - mask = _mm_unpacklo_epi64(mask, mask); + + // lp filter - the same for 6, 8 and 14 versions + filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0); // flat_mask flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0); @@ -861,9 +1343,9 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2( } // 5 tap filter - { + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { const __m128i four = _mm_set1_epi16(4); - __m128i workp_a, workp_b, workp_shft0, workp_shft1; p2_16 = _mm_unpacklo_epi8(*p2, zero); p1_16 = _mm_unpacklo_epi8(*p1, zero); @@ -906,18 +1388,149 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2( 3); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0); + *q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0 = _mm_or_si128(qs1qs0, *q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0); + *p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0 = _mm_or_si128(ps1ps0, *p1p0); } +} - // lp filter - the same for 6, 8 and 14 versions - filter4_sse2(p1p0, q1q0, &hev, &mask, &qs1qs0, &ps1ps0); +static AOM_FORCE_INLINE void lpf_internal_6_sse2( + __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, + __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1; + __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16; + __m128i ps1ps0, qs1qs0; + + q2p2 = _mm_unpacklo_epi32(*p2, *q2); + q1p1 = _mm_unpacklo_epi32(*p1, *q1); + q0p0 = _mm_unpacklo_epi32(*p0, *q0); + + *p1p0 = _mm_unpacklo_epi32(*p0, *p1); + *q1q0 = _mm_unpacklo_epi32(*q0, *q1); + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + { + // filter_mask and hev_mask + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); + + abs_p0q0 = abs_diff(*p1p0, *q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); + + // considering sse doesn't have unsigned elements comparison the idea is + // to find at least one case when X > limit, it means the corresponding + // mask bit is set. + // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi32(hev, hev); - qs1qs0 = _mm_andnot_si128(flat, qs1qs0); - *q1q0 = _mm_and_si128(flat, flat_q0q1); - *q1q0 = _mm_or_si128(qs1qs0, *q1q0); + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_unpacklo_epi32(mask, zero); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; - ps1ps0 = _mm_andnot_si128(flat, ps1ps0); - *p1p0 = _mm_and_si128(flat, flat_p1p0); - *p1p0 = _mm_or_si128(ps1ps0, *p1p0); + work = abs_diff(q2p2, q1p1); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0); + + // flat_mask + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi32(flat, flat); + flat = _mm_unpacklo_epi64(flat, flat); + } + + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_c; + __m128i pq0x2_pq1, pq1_pq2; + pq2_16 = _mm_unpacklo_epi8(q2p2, zero); + pq1_16 = _mm_unpacklo_epi8(q1p1, zero); + pq0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_srli_si128(pq0_16, 8); + q2_16 = _mm_srli_si128(pq2_16, 8); + + // op1 + pq0x2_pq1 = + _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16); // p0 *2 + p1 + pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16); // p1 + p2 + workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four), + pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16); + workp_b = + _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 + + // op0 + workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1 + workp_a = _mm_add_epi16(workp_a, + workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 + workp_b = _mm_unpacklo_epi64(workp_a, workp_b); + workp_b = _mm_srli_epi16(workp_b, 3); + + flat_p1p0 = _mm_packus_epi16(workp_b, workp_b); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16), + pq1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4 + workp_b = _mm_srli_si128(pq1_pq2, 8); + workp_a = _mm_add_epi16( + workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 + // workp_shft0 = _mm_srli_epi16(workp_a, 3); + + // oq1 + workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16), + pq0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4 + workp_b = _mm_add_epi16(q2_16, q2_16); + workp_b = + _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 + + workp_a = _mm_unpacklo_epi64(workp_a, workp_b); + workp_a = _mm_srli_epi16(workp_a, 3); + + flat_q0q1 = _mm_packus_epi16(workp_a, workp_a); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0); + *q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0 = _mm_or_si128(qs1qs0, *q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0); + *p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0 = _mm_or_si128(ps1ps0, *p1p0); + } } void aom_lpf_horizontal_6_sse2(unsigned char *s, int p, @@ -941,9 +1554,9 @@ void aom_lpf_horizontal_6_sse2(unsigned char *s, int p, &limit, &thresh); xx_storel_32(s - 1 * p, p1p0); - xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8)); + xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4)); xx_storel_32(s + 0 * p, q1q0); - xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8)); + xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4)); } void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p, @@ -970,8 +1583,8 @@ void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p, q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); - lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, - &limit, &thresh); + lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, + &limit, &thresh); _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); @@ -982,15 +1595,168 @@ void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p, static AOM_FORCE_INLINE void lpf_internal_8_sse2( __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, - __m128i *p2_out, __m128i *q2_out, __m128i *blimit, __m128i *limit, - __m128i *thresh) { + __m128i *blimit, __m128i *limit, __m128i *thresh) { const __m128i zero = _mm_setzero_si128(); __m128i mask, hev, flat; __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3, flat_p1p0, flat_q0q1; __m128i q2p2, q1p1, q0p0; __m128i q1q0, p1p0, ps1ps0, qs1qs0; - __m128i work_a, op2, oq2; + __m128i work_pq, opq2, pq2; + + q3p3 = _mm_unpacklo_epi32(*p3, *q3); + q2p2 = _mm_unpacklo_epi32(*p2, *q2); + q1p1 = _mm_unpacklo_epi32(*p1, *q1); + q0p0 = _mm_unpacklo_epi32(*p0, *q0); + + p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); // p1p0 q1q0 + q1q0 = _mm_srli_si128(p1p0, 8); + + // filter_mask and hev_mask + + // considering sse doesn't have unsigned elements comparison the idea is to + // find at least one case when X > limit, it means the corresponding mask + // bit is set. + // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); + + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_unpacklo_epi32(mask, zero); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); + + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); + + // flat_mask4 + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi32(flat, flat); + flat = _mm_unpacklo_epi64(flat, flat); + + // filter8 need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2; + p2_16 = _mm_unpacklo_epi8(*p2, zero); + p1_16 = _mm_unpacklo_epi8(*p1, zero); + p0_16 = _mm_unpacklo_epi8(*p0, zero); + q0_16 = _mm_unpacklo_epi8(*q0, zero); + q1_16 = _mm_unpacklo_epi8(*q1, zero); + q2_16 = _mm_unpacklo_epi8(*q2, zero); + p3_16 = _mm_unpacklo_epi8(*p3, zero); + q3_16 = _mm_unpacklo_epi8(*q3, zero); + + // op2 + workp_a = + _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16); + workp_shft2 = _mm_add_epi16(workp_a, workp_b); + + // op1 + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16); + workp_c = _mm_add_epi16(workp_a, workp_b); + // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // op0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16); + workp_d = _mm_add_epi16(workp_a, workp_b); + // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + workp_c = _mm_unpacklo_epi64(workp_d, workp_c); + workp_c = _mm_srli_epi16(workp_c, 3); + flat_p1p0 = _mm_packus_epi16(workp_c, workp_c); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16); + // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + workp_c = _mm_add_epi16(workp_a, workp_b); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16); + workp_d = _mm_add_epi16(workp_a, workp_b); + // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + workp_c = _mm_unpacklo_epi64(workp_c, workp_d); + workp_c = _mm_srli_epi16(workp_c, 3); + flat_q0q1 = _mm_packus_epi16(workp_c, workp_c); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16); + workp_shft1 = _mm_add_epi16(workp_a, workp_b); + + workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1); + workp_c = _mm_srli_epi16(workp_c, 3); + + opq2 = _mm_packus_epi16(workp_c, workp_c); + + work_pq = _mm_andnot_si128(flat, q2p2); + pq2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_pq, pq2); + *q2 = _mm_srli_si128(*p2, 4); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } +} + +static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, + __m128i *blimit, __m128i *limit, __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3, + flat_p1p0, flat_q0q1; + __m128i q2p2, q1p1, q0p0; + __m128i q1q0, p1p0, ps1ps0, qs1qs0; + __m128i work_pq, opq2, pq2; q3p3 = _mm_unpacklo_epi64(*p3, *q3); q2p2 = _mm_unpacklo_epi64(*p2, *q2); @@ -1043,11 +1809,11 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2( mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); - // replicate for the further "merged variables" usage - mask = _mm_unpacklo_epi64(mask, mask); - // flat_mask4 + // lp filter - the same for 6, 8 and 14 versions + filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); + // flat_mask4 flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); flat = _mm_max_epu8(abs_p1p0, flat); @@ -1059,11 +1825,11 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2( flat = _mm_unpacklo_epi64(flat, flat); } - // filter8 - { + // filter8 need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { const __m128i four = _mm_set1_epi16(4); - __m128i workp_a, workp_b, workp_shft0, workp_shft1; + __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2; p2_16 = _mm_unpacklo_epi8(*p2, zero); p1_16 = _mm_unpacklo_epi8(*p1, zero); p0_16 = _mm_unpacklo_epi8(*p0, zero); @@ -1078,8 +1844,7 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2( _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16)); workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16); - workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - op2 = _mm_packus_epi16(workp_shft0, workp_shft0); + workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); // op1 workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16); @@ -1108,27 +1873,22 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2( workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16); workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - oq2 = _mm_packus_epi16(workp_shft1, workp_shft1); - } - // lp filter - the same for 6, 8 and 14 versions - filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + opq2 = _mm_packus_epi16(workp_shft2, workp_shft1); - qs1qs0 = _mm_andnot_si128(flat, qs1qs0); - q1q0 = _mm_and_si128(flat, flat_q0q1); - *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + work_pq = _mm_andnot_si128(flat, q2p2); + pq2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_pq, pq2); + *q2 = _mm_srli_si128(*p2, 8); - ps1ps0 = _mm_andnot_si128(flat, ps1ps0); - p1p0 = _mm_and_si128(flat, flat_p1p0); - *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); - work_a = _mm_andnot_si128(flat, *q2); - q2_16 = _mm_and_si128(flat, oq2); - *q2_out = _mm_or_si128(work_a, q2_16); - - work_a = _mm_andnot_si128(flat, *p2); - p2_16 = _mm_and_si128(flat, op2); - *p2_out = _mm_or_si128(work_a, p2_16); + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } } void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, @@ -1136,7 +1896,7 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, const unsigned char *_limit, const unsigned char *_thresh) { __m128i p2, p1, p0, q0, q1, q2, p3, q3; - __m128i q1q0, p1p0, p2_out, q2_out; + __m128i q1q0, p1p0; __m128i blimit = _mm_load_si128((const __m128i *)_blimit); __m128i limit = _mm_load_si128((const __m128i *)_limit); __m128i thresh = _mm_load_si128((const __m128i *)_thresh); @@ -1151,14 +1911,14 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, q3 = _mm_cvtsi32_si128(*(int *)(s + 3 * p)); lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, - &p2_out, &q2_out, &blimit, &limit, &thresh); + &blimit, &limit, &thresh); xx_storel_32(s - 1 * p, p1p0); - xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8)); + xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4)); xx_storel_32(s + 0 * p, q1q0); - xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8)); - xx_storel_32(s - 3 * p, p2_out); - xx_storel_32(s + 2 * p, q2_out); + xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4)); + xx_storel_32(s - 3 * p, p2); + xx_storel_32(s + 2 * p, q2); } void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p, @@ -1196,8 +1956,8 @@ void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p, q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)), _mm_loadl_epi64((__m128i *)(s + 6 * p))); - lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, - &limit, &thresh); + lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, + &blimit, &limit, &thresh); _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8)); @@ -1227,7 +1987,7 @@ void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, _mm_load_si128((__m128i *)_thresh1)); __m128i p2, p1, p0, q0, q1, q2, p3, q3; - __m128i q1q0, p1p0, p2_out, q2_out; + __m128i q1q0, p1p0; p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); @@ -1238,15 +1998,15 @@ void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); - lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, - &p2_out, &q2_out, &blimit, &limit, &thresh); + lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, + &blimit, &limit, &thresh); _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); - _mm_storel_epi64((__m128i *)(s - 3 * p), p2_out); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2_out); + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); } void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, @@ -1282,7 +2042,7 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, __m128i t = _mm_unpacklo_epi64(thresh0, thresh1); - lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); + lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8)); @@ -1331,7 +2091,7 @@ void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0, &q1); - lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); + lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); p1 = _mm_srli_si128(ps1ps0, 8); q1 = _mm_srli_si128(qs1qs0, 8); @@ -1372,8 +2132,8 @@ void aom_lpf_vertical_6_sse2(unsigned char *s, int p, lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit, &limit, &thresh); - p0 = _mm_srli_si128(p1p0, 8); - q0 = _mm_srli_si128(q1q0, 8); + p0 = _mm_srli_si128(p1p0, 4); + q0 = _mm_srli_si128(q1q0, 4); transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3); @@ -1419,8 +2179,8 @@ void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, d5 = _mm_srli_si128(d4d5, 8); d7 = _mm_srli_si128(d6d7, 8); - lpf_internal_6_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0, &blimit, - &limit, &thresh); + lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0, + &blimit, &limit, &thresh); p0 = _mm_srli_si128(p1p0, 8); q0 = _mm_srli_si128(q1q0, 8); @@ -1444,7 +2204,7 @@ void aom_lpf_vertical_8_sse2(unsigned char *s, int p, const unsigned char *_thresh) { __m128i d0, d1, d2, d3, d4, d5, d6, d7; - __m128i p2, p0, q0, q2; + __m128i p0, q0; __m128i x2, x1, x0, x3; __m128i q1q0, p1p0; __m128i blimit = _mm_load_si128((const __m128i *)_blimit); @@ -1459,13 +2219,13 @@ void aom_lpf_vertical_8_sse2(unsigned char *s, int p, transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); // Loop filtering - lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, &p2, - &q2, &blimit, &limit, &thresh); + lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, + &blimit, &limit, &thresh); - p0 = _mm_srli_si128(p1p0, 8); - q0 = _mm_srli_si128(q1q0, 8); + p0 = _mm_srli_si128(p1p0, 4); + q0 = _mm_srli_si128(q1q0, 4); - transpose8x8_low_sse2(&d0, &p2, &p0, &p1p0, &q1q0, &q0, &q2, &d7, &d0, &d1, + transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1, &d2, &d3); _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0); @@ -1490,7 +2250,7 @@ void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, __m128i x0, x1, x2, x3, x4, x5, x6, x7; __m128i d1, d3, d5, d7; __m128i q1q0, p1p0; - __m128i p2, p1, q1, q2; + __m128i p1, q1; __m128i d0d1, d2d3, d4d5, d6d7; x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p)); @@ -1510,14 +2270,14 @@ void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, d5 = _mm_srli_si128(d4d5, 8); d7 = _mm_srli_si128(d6d7, 8); - lpf_internal_8_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5, &q1q0, - &p1p0, &p2, &q2, &blimit, &limit, &thresh); + lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5, + &q1q0, &p1p0, &blimit, &limit, &thresh); p1 = _mm_srli_si128(p1p0, 8); q1 = _mm_srli_si128(q1q0, 8); - transpose8x8_sse2(&d0d1, &p2, &p1, &p1p0, &q1q0, &q1, &q2, &d7, &d0d1, &d2d3, - &d4d5, &d6d7); + transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1, + &d2d3, &d4d5, &d6d7); _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1); _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8)); @@ -1533,65 +2293,30 @@ void aom_lpf_vertical_14_sse2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { - __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; - __m128i x6, x5, x4, x3, x2, x1, x0; - __m128i p0, p1, p2, p3, p4, p5, p6, p7; - __m128i q0, q1, q2, q3, q4, q5, q6, q7; - __m128i p0_out, p1_out, p2_out, p3_out; + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i x6, x5, x4, x3; + __m128i pq0, pq1, pq2, pq3; __m128i blimit = _mm_load_si128((__m128i *)_blimit); __m128i limit = _mm_load_si128((__m128i *)_limit); __m128i thresh = _mm_load_si128((__m128i *)_thresh); - x6 = _mm_loadl_epi64((__m128i *)((s - 8) + 0 * p)); - x5 = _mm_loadl_epi64((__m128i *)((s - 8) + 1 * p)); - x4 = _mm_loadl_epi64((__m128i *)((s - 8) + 2 * p)); - x3 = _mm_loadl_epi64((__m128i *)((s - 8) + 3 * p)); - - transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &p0, &p1, &p2, &p3, &p4, &p5, &p6, - &p7); + x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p)); + x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p)); + x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p)); + x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p)); - x6 = _mm_loadl_epi64((__m128i *)(s + 0 * p)); - x5 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); - x4 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); - x3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); - - transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &q0, &q1, &q2, &q3, &q4, &q5, &q6, - &q7); - - q6p6 = _mm_unpacklo_epi64(p1, q6); - q5p5 = _mm_unpacklo_epi64(p2, q5); - q4p4 = _mm_unpacklo_epi64(p3, q4); - q3p3 = _mm_unpacklo_epi64(p4, q3); - q2p2 = _mm_unpacklo_epi64(p5, q2); - q1p1 = _mm_unpacklo_epi64(p6, q1); - q0p0 = _mm_unpacklo_epi64(p7, q0); + transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4, + &q5p5, &q6p6, &q7p7); lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, &limit, &thresh); - transpose8x8_low_sse2(&p0, &p1, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, - &p0_out, &p1_out, &p2_out, &p3_out); - - x0 = _mm_srli_si128(q0p0, 8); - x1 = _mm_srli_si128(q1p1, 8); - x2 = _mm_srli_si128(q2p2, 8); - x3 = _mm_srli_si128(q3p3, 8); - x4 = _mm_srli_si128(q4p4, 8); - x5 = _mm_srli_si128(q5p5, 8); - x6 = _mm_srli_si128(q6p6, 8); - - transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &q0, &q1, &q2, - &q3); - - _mm_storel_epi64((__m128i *)(s - 8 + 0 * p), p0_out); - _mm_storel_epi64((__m128i *)(s - 8 + 1 * p), p1_out); - _mm_storel_epi64((__m128i *)(s - 8 + 2 * p), p2_out); - _mm_storel_epi64((__m128i *)(s - 8 + 3 * p), p3_out); - - _mm_storel_epi64((__m128i *)(s + 0 * p), q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), q1); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2); - _mm_storel_epi64((__m128i *)(s + 3 * p), q3); + transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, + &q0p0, &pq0, &pq1, &pq2, &pq3); + _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0); + _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1); + _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2); + _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3); } void aom_lpf_vertical_14_dual_sse2( @@ -1634,8 +2359,8 @@ void aom_lpf_vertical_14_dual_sse2( q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8)); q7 = _mm_srli_si128(d14d15, 8); - lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, - &limit, &thresh); + lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, + &blimit, &limit, &thresh); x0 = _mm_srli_si128(q0p0, 8); x1 = _mm_srli_si128(q1p1, 8); diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h index c6b6469b4..8970fe7dd 100644 --- a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h +++ b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _AOM_DSP_X86_LPF_COMMON_X86_H -#define _AOM_DSP_X86_LPF_COMMON_X86_H +#ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ +#define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ #include <emmintrin.h> // SSE2 @@ -212,4 +212,4 @@ static INLINE void highbd_transpose8x16_sse2( d4 + 1, d5 + 1, d6 + 1, d7 + 1); } -#endif // _AOM_DSP_X86_LPF_COMMON_X86_H +#endif // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c index 6538e4d5e..584b5e7e3 100644 --- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c @@ -9,7 +9,6 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include <stdio.h> #include <tmmintrin.h> #include "config/aom_config.h" diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h index 19b429d91..cffbd9672 100644 --- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H -#define _AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H +#ifndef AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_ +#define AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_ unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, @@ -30,4 +30,4 @@ unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, const uint8_t *m_ptr, int m_stride, int height); -#endif +#endif // AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_ diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h index dc41a8342..4faa098ac 100644 --- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h +++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H -#define _AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H +#ifndef AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_ +#define AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_ #include <stdlib.h> #include <string.h> @@ -89,4 +89,4 @@ static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height, } while (i < height); } -#endif +#endif // AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_ diff --git a/third_party/aom/aom_dsp/x86/mem_sse2.h b/third_party/aom/aom_dsp/x86/mem_sse2.h index 8b69606dd..6c821673e 100644 --- a/third_party/aom/aom_dsp/x86/mem_sse2.h +++ b/third_party/aom/aom_dsp/x86/mem_sse2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_MEM_SSE2_H_ -#define AOM_DSP_X86_MEM_SSE2_H_ +#ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_ +#define AOM_AOM_DSP_X86_MEM_SSE2_H_ #include <emmintrin.h> // SSE2 @@ -39,4 +39,4 @@ static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src, return dst; } -#endif // AOM_DSP_X86_MEM_SSE2_H_ +#endif // AOM_AOM_DSP_X86_MEM_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h new file mode 100644 index 000000000..5181e444c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_ +#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_ + +#include <smmintrin.h> + +#include "aom_dsp/x86/obmc_intrinsic_ssse3.h" + +static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int h) { + const int pre_step = pre_stride - 4; + int n = 0; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + + assert(IS_POWER_OF_TWO(h)); + + do { + const __m128i v_p_b = _mm_cvtsi32_si128(*(const uint32_t *)(pre + n)); + const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n)); + const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n)); + + const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); + + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); + const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12); + const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + n += 4; + + if (n % 4 == 0) pre += pre_step; + } while (n < 4 * h); + + *sum = xx_hsum_epi32_si32(v_sum_d); + *sse = xx_hsum_epi32_si32(v_sse_d); +} + +#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_ diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h index a3535f985..48486c6c4 100644 --- a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h +++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ -#define AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ +#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ +#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ #include <immintrin.h> @@ -42,4 +42,13 @@ static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) { return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q)); } -#endif // AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ +// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits) +static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); + const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31); + const __m128i v_tmp_d = + _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d); + return _mm_srai_epi32(v_tmp_d, bits); +} + +#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c new file mode 100644 index 000000000..bfec0e8a8 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <immintrin.h> + +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/obmc_intrinsic_sse4.h" + +//////////////////////////////////////////////////////////////////////////////// +// 8 bit +//////////////////////////////////////////////////////////////////////////////// + +static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int w, const int h) { + int n = 0, width, height = h; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + __m128i v_d; + const uint8_t *pre_temp; + assert(w >= 8); + assert(IS_POWER_OF_TWO(w)); + assert(IS_POWER_OF_TWO(h)); + do { + width = w; + pre_temp = pre; + do { + const __m128i v_p_b = _mm_loadl_epi64((const __m128i *)pre_temp); + const __m256i v_m_d = _mm256_loadu_si256((__m256i const *)(mask + n)); + const __m256i v_w_d = _mm256_loadu_si256((__m256i const *)(wsrc + n)); + const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm_d = _mm256_madd_epi16(v_p0_d, v_m_d); + const __m256i v_diff0_d = _mm256_sub_epi32(v_w_d, v_pm_d); + + const __m256i v_sign_d = _mm256_srai_epi32(v_diff0_d, 31); + const __m256i v_tmp_d = + _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign_d); + const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp_d, 12); + const __m128i v_rdiff_d = _mm256_castsi256_si128(v_rdiff0_d); + const __m128i v_rdiff1_d = _mm256_extracti128_si256(v_rdiff0_d, 1); + + const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff_d, v_rdiff1_d); + const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + pre_temp += 8; + n += 8; + width -= 8; + } while (width > 0); + pre += pre_stride; + height -= 1; + } while (height > 0); + v_d = _mm_hadd_epi32(v_sum_d, v_sse_d); + v_d = _mm_hadd_epi32(v_d, v_d); + *sum = _mm_cvtsi128_si32(v_d); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(v_d, 4)); +} + +static INLINE void obmc_variance_w16n(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int w, const int h) { + int n = 0, width, height = h; + __m256i v_d; + __m128i res0; + const uint8_t *pre_temp; + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + __m256i v_sum_d = _mm256_setzero_si256(); + __m256i v_sse_d = _mm256_setzero_si256(); + + assert(w >= 16); + assert(IS_POWER_OF_TWO(w)); + assert(IS_POWER_OF_TWO(h)); + do { + width = w; + pre_temp = pre; + do { + const __m128i v_p_b = _mm_loadu_si128((__m128i *)pre_temp); + const __m256i v_m0_d = _mm256_loadu_si256((__m256i const *)(mask + n)); + const __m256i v_w0_d = _mm256_loadu_si256((__m256i const *)(wsrc + n)); + const __m256i v_m1_d = + _mm256_loadu_si256((__m256i const *)(mask + n + 8)); + const __m256i v_w1_d = + _mm256_loadu_si256((__m256i const *)(wsrc + n + 8)); + + const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b); + const __m256i v_p1_d = _mm256_cvtepu8_epi32(_mm_srli_si128(v_p_b, 8)); + + const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); + const __m256i v_pm1_d = _mm256_madd_epi16(v_p1_d, v_m1_d); + + const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); + const __m256i v_diff1_d = _mm256_sub_epi32(v_w1_d, v_pm1_d); + + const __m256i v_sign0_d = _mm256_srai_epi32(v_diff0_d, 31); + const __m256i v_sign1_d = _mm256_srai_epi32(v_diff1_d, 31); + + const __m256i v_tmp0_d = + _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign0_d); + const __m256i v_tmp1_d = + _mm256_add_epi32(_mm256_add_epi32(v_diff1_d, v_bias_d), v_sign1_d); + + const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp0_d, 12); + const __m256i v_rdiff2_d = _mm256_srai_epi32(v_tmp1_d, 12); + + const __m256i v_rdiff1_d = _mm256_add_epi32(v_rdiff0_d, v_rdiff2_d); + const __m256i v_rdiff01_w = _mm256_packs_epi32(v_rdiff0_d, v_rdiff2_d); + const __m256i v_sqrdiff_d = _mm256_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm256_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff_d); + + pre_temp += 16; + n += 16; + width -= 16; + } while (width > 0); + pre += pre_stride; + height -= 1; + } while (height > 0); + + v_d = _mm256_hadd_epi32(v_sum_d, v_sse_d); + v_d = _mm256_hadd_epi32(v_d, v_d); + res0 = _mm256_castsi256_si128(v_d); + res0 = _mm_add_epi32(res0, _mm256_extractf128_si256(v_d, 1)); + *sum = _mm_cvtsi128_si32(res0); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(res0, 4)); +} + +#define OBMCVARWXH(W, H) \ + unsigned int aom_obmc_variance##W##x##H##_avx2( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + if (W == 4) { \ + obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \ + } else if (W == 8) { \ + obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ + } else { \ + obmc_variance_w16n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ + } \ + \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } + +OBMCVARWXH(128, 128) +OBMCVARWXH(128, 64) +OBMCVARWXH(64, 128) +OBMCVARWXH(64, 64) +OBMCVARWXH(64, 32) +OBMCVARWXH(32, 64) +OBMCVARWXH(32, 32) +OBMCVARWXH(32, 16) +OBMCVARWXH(16, 32) +OBMCVARWXH(16, 16) +OBMCVARWXH(16, 8) +OBMCVARWXH(8, 16) +OBMCVARWXH(8, 8) +OBMCVARWXH(8, 4) +OBMCVARWXH(4, 8) +OBMCVARWXH(4, 4) +OBMCVARWXH(4, 16) +OBMCVARWXH(16, 4) +OBMCVARWXH(8, 32) +OBMCVARWXH(32, 8) +OBMCVARWXH(16, 64) +OBMCVARWXH(64, 16) diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c index 2e2f6e09f..72eda0e57 100644 --- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c +++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c @@ -19,7 +19,7 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" -#include "aom_dsp/x86/obmc_intrinsic_ssse3.h" +#include "aom_dsp/x86/obmc_intrinsic_sse4.h" #include "aom_dsp/x86/synonyms.h" //////////////////////////////////////////////////////////////////////////////// @@ -36,45 +36,6 @@ void aom_var_filter_block2d_bil_second_pass_ssse3( unsigned int pixel_step, unsigned int output_height, unsigned int output_width, const uint8_t *filter); -static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride, - const int32_t *wsrc, const int32_t *mask, - unsigned int *const sse, int *const sum, - const int h) { - const int pre_step = pre_stride - 4; - int n = 0; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_d = _mm_setzero_si128(); - - assert(IS_POWER_OF_TWO(h)); - - do { - const __m128i v_p_b = xx_loadl_32(pre + n); - const __m128i v_m_d = xx_load_128(mask + n); - const __m128i v_w_d = xx_load_128(wsrc + n); - - const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b); - - // Values in both pre and mask fit in 15 bits, and are packed at 32 bit - // boundaries. We use pmaddwd, as it has lower latency on Haswell - // than pmulld but produces the same result with these inputs. - const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); - - const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); - const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12); - const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d); - - v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); - v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); - - n += 4; - - if (n % 4 == 0) pre += pre_step; - } while (n < 4 * h); - - *sum = xx_hsum_epi32_si32(v_sum_d); - *sse = xx_hsum_epi32_si32(v_sse_d); -} - static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *const sse, int *const sum, diff --git a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm index e6b40262d..216a0bd8f 100644 --- a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm +++ b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm @@ -16,16 +16,12 @@ SECTION .text %macro QUANTIZE_FN 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ shift, qcoeff, dqcoeff, dequant, \ eob, scan, iscan vzeroupper - ; If we can skip this block, then just zero the output - cmp skipmp, 0 - jne .blank - %ifnidn %1, b_32x32 ; Special case for ncoeff == 16, as it is frequent and we can save on @@ -83,14 +79,14 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ .single_nonzero: ; Actual quantization of size 16 block - setup pointers, rounders, etc. - movifnidn r4, roundmp - movifnidn r5, quantmp - mov r3, dequantmp - mov r6, shiftmp - mova m1, [r4] ; m1 = round - mova m2, [r5] ; m2 = quant - mova m3, [r3] ; m3 = dequant - mova m4, [r6] ; m4 = shift + movifnidn r3, roundmp + movifnidn r4, quantmp + mov r6, dequantmp + mov r5, shiftmp + mova m1, [r3] ; m1 = round + mova m2, [r4] ; m2 = quant + mova m3, [r6] ; m3 = dequant + mova m4, [r5] ; m4 = shift mov r3, iscanmp @@ -174,20 +170,20 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %endif ; %ifnidn %1, b_32x32 -DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ +DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \ qcoeff, dqcoeff, dequant, eob, scan, iscan ; Actual quantization loop - setup pointers, rounders, etc. movifnidn coeffq, coeffmp movifnidn ncoeffq, ncoeffmp - mov r2, dequantmp movifnidn zbinq, zbinmp movifnidn roundq, roundmp movifnidn quantq, quantmp + movifnidn dequantq, dequantmp mova m0, [zbinq] ; m0 = zbin mova m1, [roundq] ; m1 = round mova m2, [quantq] ; m2 = quant - mova m3, [r2] ; m3 = dequant + mova m3, [dequantq] ; m3 = dequant pcmpeqw m4, m4 ; All lanes -1 %ifidn %1, b_32x32 psubw m0, m4 @@ -199,7 +195,7 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ mov r2, shiftmp mov r3, qcoeffmp - mova m4, [r2] ; m4 = shift + mova m4, [r2] ; m4 = shift mov r4, dqcoeffmp mov r5, iscanmp %ifidn %1, b_32x32 @@ -207,7 +203,7 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ %endif pxor m5, m5 ; m5 = dedicated zero - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob lea coeffq, [ coeffq+ncoeffq*4] @@ -432,39 +428,8 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ mov [r2], ax vzeroupper RET - - ; Skip-block, i.e. just write all zeroes -.blank: - -DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ - qcoeff, dqcoeff, dequant, eob, scan, iscan - - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - -DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob - - lea dqcoeffq, [dqcoeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] - neg ncoeffq - pxor m7, m7 - -.blank_loop: - mova [dqcoeffq+ncoeffq*4+ 0], ymm7 - mova [dqcoeffq+ncoeffq*4+32], ymm7 - mova [qcoeffq+ncoeffq*4+ 0], ymm7 - mova [qcoeffq+ncoeffq*4+32], ymm7 - add ncoeffq, mmsize - jl .blank_loop - - mov [eobq], word 0 - - vzeroupper - RET %endmacro INIT_XMM avx -QUANTIZE_FN b, 7 -QUANTIZE_FN b_32x32, 7 +QUANTIZE_FN b, 9 +QUANTIZE_FN b_32x32, 9 diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c index 46b9c7d29..d3de6e24d 100644 --- a/third_party/aom/aom_dsp/x86/quantize_sse2.c +++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c @@ -9,242 +9,139 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include <assert.h> #include <emmintrin.h> #include <xmmintrin.h> #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" +#include "aom_dsp/x86/quantize_x86.h" static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { - if (sizeof(tran_low_t) == 4) { - return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], - (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], - (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], - (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); - } else { - return _mm_load_si128((const __m128i *)coeff_ptr); - } + assert(sizeof(tran_low_t) == 4); + + return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], + (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], + (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], + (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); } static INLINE void store_coefficients(__m128i coeff_vals, tran_low_t *coeff_ptr) { - if (sizeof(tran_low_t) == 4) { - __m128i one = _mm_set1_epi16(1); - __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); - __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); - __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); - __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); - _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); - } else { - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals); - } + assert(sizeof(tran_low_t) == 4); + + __m128i one = _mm_set1_epi16(1); + __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); + __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); + __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); + __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); + _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); } void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { - __m128i zero; + const __m128i zero = _mm_setzero_si128(); + int index = 16; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; + (void)scan_ptr; - coeff_ptr += n_coeffs; - iscan_ptr += n_coeffs; - qcoeff_ptr += n_coeffs; - dqcoeff_ptr += n_coeffs; - n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); - if (!skip_block) { - __m128i eob; - __m128i zbin; - __m128i round, quant, dequant, shift; - { - __m128i coeff0, coeff1; - - // Setup global values - { - __m128i pw_1; - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - pw_1 = _mm_set1_epi16(1); - zbin = _mm_sub_epi16(zbin, pw_1); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - } - - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - __m128i cmp_mask0, cmp_mask1; - // Do DC and first 15 AC - coeff0 = load_coefficients(coeff_ptr + n_coeffs); - coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); - zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC - cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); - qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); - qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); - shift = _mm_unpackhi_epi64(shift, shift); - qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - // Mask out zbin threshold coeffs - qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); - qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - - store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs); - store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - store_coefficients(coeff0, dqcoeff_ptr + n_coeffs); - store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } - - // AC only loop - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - __m128i cmp_mask0, cmp_mask1; - - coeff0 = load_coefficients(coeff_ptr + n_coeffs); - coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); - cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); - qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); - qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); - qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - // Mask out zbin threshold coeffs - qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); - qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - - store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs); - store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - store_coefficients(coeff0, dqcoeff_ptr + n_coeffs); - store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; - } - - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - store_coefficients(zero, dqcoeff_ptr + n_coeffs); - store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8); - store_coefficients(zero, qcoeff_ptr + n_coeffs); - store_coefficients(zero, qcoeff_ptr + n_coeffs + 8); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; + // Setup global values. + load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. + coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr); + store_coefficients(coeff1, dqcoeff_ptr + 8); + + eob = + scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr + index); + store_coefficients(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + + index += 16; } + + *eob_ptr = accumulate_eob(eob); } diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm index e2c1ebb71..39d4ca674 100644 --- a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm +++ b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm @@ -18,21 +18,18 @@ pw_1: times 8 dw 1 SECTION .text -; TODO(yunqingwang)fix quantize_b code for skip=1 case. %macro QUANTIZE_FN 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ shift, qcoeff, dqcoeff, dequant, \ eob, scan, iscan - cmp dword skipm, 0 - jne .blank ; actual quantize loop - setup pointers, rounders, etc. movifnidn coeffq, coeffmp movifnidn ncoeffq, ncoeffmp - mov r2, dequantmp movifnidn zbinq, zbinmp movifnidn roundq, roundmp movifnidn quantq, quantmp + movifnidn dequantq, dequantmp mova m0, [zbinq] ; m0 = zbin mova m1, [roundq] ; m1 = round mova m2, [quantq] ; m2 = quant @@ -44,18 +41,18 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psrlw m0, 1 ; m0 = (m0 + 1) / 2 psrlw m1, 1 ; m1 = (m1 + 1) / 2 %endif - mova m3, [r2q] ; m3 = dequant - psubw m0, [GLOBAL(pw_1)] + mova m3, [dequantq] ; m3 = dequant mov r2, shiftmp - mov r3, qcoeffmp + psubw m0, [GLOBAL(pw_1)] mova m4, [r2] ; m4 = shift + mov r3, qcoeffmp mov r4, dqcoeffmp mov r5, iscanmp %ifidn %1, b_32x32 psllw m4, 1 %endif pxor m5, m5 ; m5 = dedicated zero - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob lea coeffq, [ coeffq+ncoeffq*4] lea qcoeffq, [ qcoeffq+ncoeffq*4] lea dqcoeffq, [dqcoeffq+ncoeffq*4] @@ -268,33 +265,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pextrw r6, m8, 0 mov [r2], r6 RET - - ; skip-block, i.e. just write all zeroes -.blank: - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob - lea dqcoeffq, [dqcoeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] - neg ncoeffq - pxor m7, m7 -.blank_loop: - mova [dqcoeffq+ncoeffq*4+ 0], m7 - mova [dqcoeffq+ncoeffq*4+16], m7 - mova [dqcoeffq+ncoeffq*4+32], m7 - mova [dqcoeffq+ncoeffq*4+48], m7 - mova [qcoeffq+ncoeffq*4+ 0], m7 - mova [qcoeffq+ncoeffq*4+16], m7 - mova [qcoeffq+ncoeffq*4+32], m7 - mova [qcoeffq+ncoeffq*4+48], m7 - add ncoeffq, mmsize - jl .blank_loop - mov word [eobq], 0 - RET %endmacro INIT_XMM ssse3 -QUANTIZE_FN b, 7 -QUANTIZE_FN b_32x32, 7 +QUANTIZE_FN b, 9 +QUANTIZE_FN b_32x32, 9 diff --git a/third_party/aom/aom_dsp/x86/quantize_x86.h b/third_party/aom/aom_dsp/x86/quantize_x86.h new file mode 100644 index 000000000..4eed7dd29 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/quantize_x86.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> + +#include "aom/aom_integer.h" + +static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, + const int16_t *round_ptr, __m128i *round, + const int16_t *quant_ptr, __m128i *quant, + const int16_t *dequant_ptr, __m128i *dequant, + const int16_t *shift_ptr, __m128i *shift) { + *zbin = _mm_load_si128((const __m128i *)zbin_ptr); + *round = _mm_load_si128((const __m128i *)round_ptr); + *quant = _mm_load_si128((const __m128i *)quant_ptr); + *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1)); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)shift_ptr); +} + +// With ssse3 and later abs() and sign() are preferred. +static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) { + a = _mm_xor_si128(a, sign); + return _mm_sub_epi16(a, sign); +} + +static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, + const __m128i quant, const __m128i shift) { + __m128i tmp, qcoeff; + qcoeff = _mm_adds_epi16(*coeff, round); + tmp = _mm_mulhi_epi16(qcoeff, quant); + qcoeff = _mm_add_epi16(tmp, qcoeff); + *coeff = _mm_mulhi_epi16(qcoeff, shift); +} + +static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { + return _mm_mullo_epi16(qcoeff, dequant); +} + +// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing +// to zbin to add 1 to the index in 'scan'. +static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, + const __m128i zbin_mask0, + const __m128i zbin_mask1, + const int16_t *scan_ptr, const int index, + const __m128i zero) { + const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero); + __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index)); + __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8)); + __m128i eob0, eob1; + // Add one to convert from indices to counts + scan0 = _mm_sub_epi16(scan0, zbin_mask0); + scan1 = _mm_sub_epi16(scan1, zbin_mask1); + eob0 = _mm_andnot_si128(zero_coeff0, scan0); + eob1 = _mm_andnot_si128(zero_coeff1, scan1); + return _mm_max_epi16(eob0, eob1); +} + +static INLINE int16_t accumulate_eob(__m128i eob) { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} diff --git a/third_party/aom/aom_dsp/x86/sse_avx2.c b/third_party/aom/aom_dsp/x86/sse_avx2.c new file mode 100644 index 000000000..305dde5c0 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sse_avx2.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <smmintrin.h> +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a, + const uint8_t *b) { + const __m256i v_a0 = yy_loadu_256(a); + const __m256i v_b0 = yy_loadu_256(b); + const __m256i v_a00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_a0)); + const __m256i v_a01_w = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_a0, 1)); + const __m256i v_b00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_b0)); + const __m256i v_b01_w = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_b0, 1)); + const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w); + const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w)); +} + +static INLINE int64_t summary_all_avx2(const __m256i *sum_all) { + int64_t sum; + const __m256i sum0_4x64 = + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum_all)); + const __m256i sum1_4x64 = + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum_all, 1)); + const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); + const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + + xx_storel_64(&sum, sum_1x64); + return sum; +} + +int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + __m256i sum = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + const __m128i v_a0 = xx_loadl_32(a); + const __m128i v_a1 = xx_loadl_32(a + a_stride); + const __m128i v_a2 = xx_loadl_32(a + a_stride * 2); + const __m128i v_a3 = xx_loadl_32(a + a_stride * 3); + const __m128i v_b0 = xx_loadl_32(b); + const __m128i v_b1 = xx_loadl_32(b + b_stride); + const __m128i v_b2 = xx_loadl_32(b + b_stride * 2); + const __m128i v_b3 = xx_loadl_32(b + b_stride * 3); + const __m128i v_a0123 = _mm_unpacklo_epi64( + _mm_unpacklo_epi32(v_a0, v_a1), _mm_unpacklo_epi32(v_a2, v_a3)); + const __m128i v_b0123 = _mm_unpacklo_epi64( + _mm_unpacklo_epi32(v_b0, v_b1), _mm_unpacklo_epi32(v_b2, v_b3)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123); + const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m256i v_a_w = + _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1)); + const __m256i v_b_w = + _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + const __m128i v_a0 = xx_loadu_128(a); + const __m128i v_b0 = xx_loadu_128(b); + const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0); + const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + sse_w32_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 64: + do { + sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 128: + do { + sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + sse_w32_avx2(&sum, a + 64, b + 64); + sse_w32_avx2(&sum, a + 96, b + 96); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + default: break; + } + + return sse; +} + +static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, + const uint16_t *b) { + const __m256i v_a_w = yy_loadu_256(a); + const __m256i v_b_w = yy_loadu_256(b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m256i sum = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_a2 = xx_loadl_64(a + a_stride * 2); + const __m128i v_a3 = xx_loadl_64(a + a_stride * 3); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m128i v_b2 = xx_loadl_64(b + b_stride * 2); + const __m128i v_b3 = xx_loadl_64(b + b_stride * 3); + const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1), + _mm_unpacklo_epi64(v_a2, v_a3)); + const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1), + _mm_unpacklo_epi64(v_b2, v_b3)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + const __m256i v_a_w = yy_loadu2_128(a + a_stride, a); + const __m256i v_b_w = yy_loadu2_128(b + b_stride, b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + highbd_sse_w16_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + highbd_sse_w16_avx2(&sum, a, b); + highbd_sse_w16_avx2(&sum, a + 16, b + 16); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 64: + do { + highbd_sse_w16_avx2(&sum, a, b); + highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 128: + do { + highbd_sse_w16_avx2(&sum, a, b); + highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3); + highbd_sse_w16_avx2(&sum, a + 16 * 4, b + 16 * 4); + highbd_sse_w16_avx2(&sum, a + 16 * 5, b + 16 * 5); + highbd_sse_w16_avx2(&sum, a + 16 * 6, b + 16 * 6); + highbd_sse_w16_avx2(&sum, a + 16 * 7, b + 16 * 7); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + default: break; + } + return sse; +} diff --git a/third_party/aom/aom_dsp/x86/sse_sse4.c b/third_party/aom/aom_dsp/x86/sse_sse4.c new file mode 100644 index 000000000..8b5af8469 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sse_sse4.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <smmintrin.h> + +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" + +static INLINE int64_t summary_all_sse4(const __m128i *sum_all) { + int64_t sum; + const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8)); + const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + xx_storel_64(&sum, sum_1x64); + return sum; +} + +static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, + const uint8_t *b) { + const __m128i v_a0 = xx_loadu_128(a); + const __m128i v_b0 = xx_loadu_128(b); + const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8)); + const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8)); + const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w); + const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w)); +} + +int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y = 0; + int64_t sse = 0; + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + const __m128i v_a0 = xx_loadl_32(a); + const __m128i v_a1 = xx_loadl_32(a + a_stride); + const __m128i v_b0 = xx_loadl_32(b); + const __m128i v_b1 = xx_loadl_32(b + b_stride); + const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1)); + const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1)); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w)); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + sse_w16_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 32: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16, b + 16); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 64: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); + sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); + sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 128: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); + sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); + sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); + sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4); + sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5); + sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6); + sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + default: break; + } + + return sse; +} + +static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a, + const uint16_t *b) { + const __m128i v_a_w = xx_loadu_128(a); + const __m128i v_b_w = xx_loadu_128(b); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int width, + int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + highbd_sse_w8_sse4_1(&sum, a + 8, b + 8); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 32: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 64: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 128: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7); + highbd_sse_w8_sse4_1(&sum, a + 8 * 8, b + 8 * 8); + highbd_sse_w8_sse4_1(&sum, a + 8 * 9, b + 8 * 9); + highbd_sse_w8_sse4_1(&sum, a + 8 * 10, b + 8 * 10); + highbd_sse_w8_sse4_1(&sum, a + 8 * 11, b + 8 * 11); + highbd_sse_w8_sse4_1(&sum, a + 8 * 12, b + 8 * 12); + highbd_sse_w8_sse4_1(&sum, a + 8 * 13, b + 8 * 13); + highbd_sse_w8_sse4_1(&sum, a + 8 * 14, b + 8 * 14); + highbd_sse_w8_sse4_1(&sum, a + 8 * 15, b + 8 * 15); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + default: break; + } + return sse; +} diff --git a/third_party/aom/aom_dsp/x86/sum_squares_avx2.c b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c new file mode 100644 index 000000000..0af44e3a4 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> +#include <smmintrin.h> + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "aom_dsp/x86/sum_squares_sse2.h" +#include "config/aom_dsp_rtcd.h" + +static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride, + int width, int height) { + uint64_t result; + __m256i v_acc_q = _mm256_setzero_si256(); + const __m256i v_zext_mask_q = yy_set1_64_from_32i(0xffffffff); + for (int col = 0; col < height; col += 4) { + __m256i v_acc_d = _mm256_setzero_si256(); + for (int row = 0; row < width; row += 16) { + const int16_t *tempsrc = src + row; + const __m256i v_val_0_w = + _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride)); + const __m256i v_val_1_w = + _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride)); + const __m256i v_val_2_w = + _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride)); + const __m256i v_val_3_w = + _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride)); + + const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w); + const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w); + const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w); + const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w); + + const __m256i v_sum_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d); + const __m256i v_sum_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d); + const __m256i v_sum_0123_d = _mm256_add_epi32(v_sum_01_d, v_sum_23_d); + + v_acc_d = _mm256_add_epi32(v_acc_d, v_sum_0123_d); + } + v_acc_q = + _mm256_add_epi64(v_acc_q, _mm256_and_si256(v_acc_d, v_zext_mask_q)); + v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_srli_epi64(v_acc_d, 32)); + src += 4 * stride; + } + __m128i lower_64_2_Value = _mm256_castsi256_si128(v_acc_q); + __m128i higher_64_2_Value = _mm256_extracti128_si256(v_acc_q, 1); + __m128i result_64_2_int = _mm_add_epi64(lower_64_2_Value, higher_64_2_Value); + + result_64_2_int = _mm_add_epi64( + result_64_2_int, _mm_unpackhi_epi64(result_64_2_int, result_64_2_int)); + + xx_storel_64(&result, result_64_2_int); + + return result; +} + +uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width, + int height) { + if (LIKELY(width == 4 && height == 4)) { + return aom_sum_squares_2d_i16_4x4_sse2(src, stride); + } else if (LIKELY(width == 4 && (height & 3) == 0)) { + return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height); + } else if (LIKELY(width == 8 && (height & 3) == 0)) { + return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height); + } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) { + return aom_sum_squares_2d_i16_nxn_avx2(src, stride, width, height); + } else { + return aom_sum_squares_2d_i16_c(src, stride, width, height); + } +} diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c index a79f22d79..22d7739ec 100644 --- a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c +++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c @@ -14,6 +14,7 @@ #include <stdio.h> #include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/sum_squares_sse2.h" #include "config/aom_dsp_rtcd.h" static INLINE __m128i xx_loadh_64(__m128i a, const void *b) { @@ -44,8 +45,7 @@ static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) { return _mm_add_epi32(v_sq_01_d, v_sq_23_d); } -static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, - int stride) { +uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) { const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride); __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32)); @@ -53,8 +53,8 @@ static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, return (uint64_t)_mm_cvtsi128_si32(v_sum_d); } -static uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, - int height) { +uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, + int height) { int r = 0; __m128i v_acc_q = _mm_setzero_si128(); do { @@ -76,7 +76,7 @@ static uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, // maintenance instructions in the common case of 4x4. __attribute__((noinline)) #endif -static uint64_t +uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width, int height) { int r = 0; diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.h b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h new file mode 100644 index 000000000..491e31cc5 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_SUM_SQUARES_SSE2_H_ +#define AOM_DSP_X86_SUM_SQUARES_SSE2_H_ + +uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, + int width, int height); + +uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, + int height); +uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride); + +#endif // AOM_DSP_X86_SUM_SQUARES_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h index d9a53fcc5..1e9f1e27b 100644 --- a/third_party/aom/aom_dsp/x86/synonyms.h +++ b/third_party/aom/aom_dsp/x86/synonyms.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_SYNONYMS_H_ -#define AOM_DSP_X86_SYNONYMS_H_ +#ifndef AOM_AOM_DSP_X86_SYNONYMS_H_ +#define AOM_AOM_DSP_X86_SYNONYMS_H_ #include <immintrin.h> @@ -103,15 +103,6 @@ static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) { return _mm_srai_epi32(v_tmp_d, bits); } -// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits) -static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { - const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); - const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31); - const __m128i v_tmp_d = - _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d); - return _mm_srai_epi32(v_tmp_d, bits); -} - static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) { const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1); const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15); @@ -120,4 +111,4 @@ static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) { return _mm_srai_epi16(v_tmp_d, bits); } -#endif // AOM_DSP_X86_SYNONYMS_H_ +#endif // AOM_AOM_DSP_X86_SYNONYMS_H_ diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h index 39f371fc9..3f69b120e 100644 --- a/third_party/aom/aom_dsp/x86/synonyms_avx2.h +++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_SYNONYMS_AVX2_H_ -#define AOM_DSP_X86_SYNONYMS_AVX2_H_ +#ifndef AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_ +#define AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_ #include <immintrin.h> @@ -61,4 +61,14 @@ static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) { return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); } -#endif // AOM_DSP_X86_SYNONYMS_AVX2_H_ +static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) { + __m128i mhi = _mm_loadu_si128((__m128i *)(hi)); + __m128i mlo = _mm_loadu_si128((__m128i *)(lo)); + return yy_set_m128i(mhi, mlo); +} + +static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) { + const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1); + return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256()); +} +#endif // AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/transpose_sse2.h b/third_party/aom/aom_dsp/x86/transpose_sse2.h index f88a1527d..d0d1ee684 100644 --- a/third_party/aom/aom_dsp/x86/transpose_sse2.h +++ b/third_party/aom/aom_dsp/x86/transpose_sse2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_TRANSPOSE_SSE2_H_ -#define AOM_DSP_X86_TRANSPOSE_SSE2_H_ +#ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ +#define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ #include <emmintrin.h> // SSE2 @@ -417,4 +417,4 @@ static INLINE void transpose_32bit_8x4(const __m128i *const in, out[7] = _mm_unpackhi_epi64(a6, a7); } -#endif // AOM_DSP_X86_TRANSPOSE_SSE2_H_ +#endif // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h index bdff64b8f..b1611ba87 100644 --- a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h +++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H_ -#define AOM_DSP_X86_TXFM_COMMON_AVX2_H_ +#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ +#define AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ #include <emmintrin.h> #include "aom/aom_integer.h" @@ -196,4 +196,4 @@ static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) { } #endif -#endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H_ +#endif // AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h index 58a792424..ed82eee96 100644 --- a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h +++ b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_TXFM_COMMON_SSE2_H_ -#define AOM_DSP_X86_TXFM_COMMON_SSE2_H_ +#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ +#define AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ #include <emmintrin.h> #include "aom/aom_integer.h" @@ -26,4 +26,4 @@ static INLINE __m128i mm_reverse_epi16(const __m128i x) { return _mm_shuffle_epi32(b, 0x4e); } -#endif // AOM_DSP_X86_TXFM_COMMON_SSE2_H_ +#endif // AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c index a7ac2c93d..800aef126 100644 --- a/third_party/aom/aom_dsp/x86/variance_avx2.c +++ b/third_party/aom/aom_dsp/x86/variance_avx2.c @@ -433,13 +433,14 @@ static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0, return comp; } -void aom_highbd_comp_mask_pred_avx2(uint16_t *comp_pred, const uint8_t *pred8, +void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask) { int i = 0; uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); const uint16_t *src0 = invert_mask ? pred : ref; const uint16_t *src1 = invert_mask ? ref : pred; const int stride0 = invert_mask ? width : ref_stride; diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c index 7e3c5d5db..3c37e77c0 100644 --- a/third_party/aom/aom_dsp/x86/variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/variance_sse2.c @@ -16,6 +16,7 @@ #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" +#include "aom_dsp/blend.h" #include "aom_dsp/x86/synonyms.h" #include "aom_ports/mem.h" @@ -485,7 +486,8 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3, int subpel_y_q3, - const uint8_t *ref, int ref_stride) { + const uint8_t *ref, int ref_stride, + int subpel_search) { // expect xd == NULL only in tests if (xd != NULL) { const MB_MODE_INFO *mi = xd->mi[0]; @@ -553,7 +555,7 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; // Get convolve parameters. - ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd); + ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); const InterpFilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); @@ -570,7 +572,10 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, } const InterpFilterParams *filter = - av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); + (subpel_search == 1) + ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR) + : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); + int filter_taps = (subpel_search == 1) ? 4 : SUBPEL_TAPS; if (!subpel_x_q3 && !subpel_y_q3) { if (width >= 16) { @@ -632,15 +637,25 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - const int intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; + const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1); + uint8_t *temp_start_horiz = + (subpel_search == 1) ? temp + (filter_taps >> 1) * MAX_SB_SIZE : temp; + uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1); + int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1), - ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, - width, intermediate_height); - aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1), - MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, - width, height); + // TODO(Deepa): Remove the memset below when we have + // 4 tap simd for sse2 and ssse3. + if (subpel_search == 1) { + memset(temp_start_vert - 3 * MAX_SB_SIZE, 0, width); + memset(temp_start_vert - 2 * MAX_SB_SIZE, 0, width); + memset(temp_start_vert + (height + 2) * MAX_SB_SIZE, 0, width); + memset(temp_start_vert + (height + 3) * MAX_SB_SIZE, 0, width); + } + aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE, + kernel_x, 16, NULL, -1, width, intermediate_height); + aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1, + kernel_y, 16, width, height); } } @@ -648,11 +663,11 @@ void aom_comp_avg_upsampled_pred_sse2( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride) { + int ref_stride, int subpel_search) { int n; int i; aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, - subpel_x_q3, subpel_y_q3, ref, ref_stride); + subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ assert(!(width * height & 15)); n = width * height >> 4; @@ -664,3 +679,128 @@ void aom_comp_avg_upsampled_pred_sse2( pred += 16; } } + +void aom_comp_mask_upsampled_pred_sse2( + MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, + int subpel_search) { + if (subpel_x_q3 | subpel_y_q3) { + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); + ref = comp_pred; + ref_stride = width; + } + aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask, + mask_stride, invert_mask); +} + +static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0, + const __m128i s1, + const __m128i a) { + const __m128i alpha_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i a_inv = _mm_sub_epi16(alpha_max, a); + + const __m128i s_lo = _mm_unpacklo_epi16(s0, s1); + const __m128i a_lo = _mm_unpacklo_epi16(a, a_inv); + const __m128i pred_lo = _mm_madd_epi16(s_lo, a_lo); + const __m128i pred_l = _mm_srai_epi32(_mm_add_epi32(pred_lo, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i s_hi = _mm_unpackhi_epi16(s0, s1); + const __m128i a_hi = _mm_unpackhi_epi16(a, a_inv); + const __m128i pred_hi = _mm_madd_epi16(s_hi, a_hi); + const __m128i pred_h = _mm_srai_epi32(_mm_add_epi32(pred_hi, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i comp = _mm_packs_epi32(pred_l, pred_h); + + return comp; +} + +void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + int i = 0; + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + const uint16_t *src0 = invert_mask ? pred : ref; + const uint16_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? ref_stride : width; + const __m128i zero = _mm_setzero_si128(); + + if (width == 8) { + do { + const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0)); + const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1)); + const __m128i m_8 = _mm_loadl_epi64((const __m128i *)mask); + const __m128i m_16 = _mm_unpacklo_epi8(m_8, zero); + + const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m_16); + + _mm_storeu_si128((__m128i *)comp_pred, comp); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } else if (width == 16) { + do { + const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0)); + const __m128i s2 = _mm_loadu_si128((const __m128i *)(src0 + 8)); + const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1)); + const __m128i s3 = _mm_loadu_si128((const __m128i *)(src1 + 8)); + + const __m128i m_8 = _mm_loadu_si128((const __m128i *)mask); + const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero); + const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero); + + const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16); + const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16); + + _mm_storeu_si128((__m128i *)comp_pred, comp); + _mm_storeu_si128((__m128i *)(comp_pred + 8), comp1); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } else if (width == 32) { + do { + for (int j = 0; j < 2; j++) { + const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0 + j * 16)); + const __m128i s2 = + _mm_loadu_si128((const __m128i *)(src0 + 8 + j * 16)); + const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1 + j * 16)); + const __m128i s3 = + _mm_loadu_si128((const __m128i *)(src1 + 8 + j * 16)); + + const __m128i m_8 = _mm_loadu_si128((const __m128i *)(mask + j * 16)); + const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero); + const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero); + + const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16); + const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16); + + _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp); + _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1); + } + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } +} |