diff options
author | trav90 <travawine@palemoon.org> | 2018-10-19 23:00:02 -0500 |
---|---|---|
committer | trav90 <travawine@palemoon.org> | 2018-10-19 23:00:02 -0500 |
commit | b8df135c97a854c2ff9b4394b016649c601177fa (patch) | |
tree | 802b7de5ad245f1a12adbcef835ab0d0687c1bf8 /third_party/aom/aom_dsp/x86 | |
parent | a4d3c59dcac642f6b9557dc09b60eda40b517630 (diff) | |
download | UXP-b8df135c97a854c2ff9b4394b016649c601177fa.tar UXP-b8df135c97a854c2ff9b4394b016649c601177fa.tar.gz UXP-b8df135c97a854c2ff9b4394b016649c601177fa.tar.lz UXP-b8df135c97a854c2ff9b4394b016649c601177fa.tar.xz UXP-b8df135c97a854c2ff9b4394b016649c601177fa.zip |
Update libaom to rev b25610052a1398032320008d69b51d2da94f5928
Diffstat (limited to 'third_party/aom/aom_dsp/x86')
17 files changed, 1835 insertions, 326 deletions
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c index af45a03ac..f3fe50372 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c @@ -41,20 +41,290 @@ #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) #endif // __clang__ +static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr, + const ptrdiff_t stride, const __m256i *a) { + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a)); + *((uint32_t *)(output_ptr + stride)) = + _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1)); +} + +static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) { + __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo))); + a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1); + return a; +} + +static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr, + const ptrdiff_t stride, const __m256i *a) { + _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a)); + _mm_storel_epi64((__m128i *)(output_ptr + stride), + _mm256_extractf128_si256(*a, 1)); +} + +static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) { + __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo))); + a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1); + return a; +} + +static INLINE void xx_store2_mi128(const uint8_t *output_ptr, + const ptrdiff_t stride, const __m256i *a) { + _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a)); + _mm_store_si128((__m128i *)(output_ptr + stride), + _mm256_extractf128_si256(*a, 1)); +} + +static void aom_filter_block1d4_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, filt2Reg; + __m256i firstFilters, secondFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2; + __m256i srcReg32b1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 32 bits + firstFilters = _mm256_shuffle_epi32(filtersReg32, 0); + // duplicate only the second 32 bits + secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55); + + filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + // filter the source buffer + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + srcRegFilt32b1_1 = + _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 4 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + __m128i srcRegFilt2; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + + // filter the source buffer + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 4 bytes + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); + } +} + +static void aom_filter_block1d8_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 8 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 8 bytes + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1); + } +} + static void aom_filter_block1d16_h8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; - __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; __m256i srcReg32b1, srcReg32b2, filtersReg32; unsigned int i; ptrdiff_t src_stride, dst_stride; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg = _mm_packs_epi16(filtersReg, filtersReg); @@ -74,22 +344,17 @@ static void aom_filter_block1d16_h8_avx2( // across 256 bit register forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); - filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); - filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); - filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); - filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); + filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); // multiple the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; for (i = output_height; i > 1; i -= 2) { // load the 2 strides of source - srcReg32b1 = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3))); - srcReg32b1 = _mm256_inserti128_si256( - srcReg32b1, - _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)), - 1); + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); // filter the source buffer srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); @@ -110,22 +375,13 @@ static void aom_filter_block1d16_h8_avx2( srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16( - srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23); // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) srcReg32b2 = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5))); - srcReg32b2 = _mm256_inserti128_si256( - srcReg32b2, - _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)), - 1); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16( - srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); // filter the source buffer srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); @@ -148,32 +404,21 @@ static void aom_filter_block1d16_h8_avx2( // add and saturate the results together srcRegFilt32b2_1 = _mm256_adds_epi16( - srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); - srcRegFilt32b2_1 = _mm256_adds_epi16( - srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); + srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2)); - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); - srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result + // convolve result and the second lane contain the second convolve result srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); src_ptr += src_stride; - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, - _mm256_castsi256_si128(srcRegFilt32b1_1)); - - // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + output_pitch), - _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1); output_ptr += dst_stride; } @@ -183,7 +428,7 @@ static void aom_filter_block1d16_h8_avx2( __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; __m128i srcRegFilt2, srcRegFilt3; - srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); // filter the source buffer srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); @@ -210,15 +455,11 @@ static void aom_filter_block1d16_h8_avx2( // add and saturate the results together srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); // reading the next 16 bytes // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); - - // add and saturate the results together - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); // filter the source buffer srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); @@ -245,19 +486,16 @@ static void aom_filter_block1d16_h8_avx2( // add and saturate the results together srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); + // shift by 6 bit each 16 bit srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64)); + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); - srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); + _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve @@ -269,11 +507,163 @@ static void aom_filter_block1d16_h8_avx2( } } +static void aom_filter_block1d8_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32; + __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; + __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; + __m256i srcReg32b11, srcReg32b12, filtersReg32; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + // load 16 bytes 7 times in stride of src_pitch + srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr); + srcReg32b3 = + xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg32b5 = + xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4); + srcReg32b7 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); + + // have each consecutive loads on the same 256 register + srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21); + srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21); + srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21); + // merge every two consecutive registers except the last one + srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); + srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); + srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_adds_epi16(srcReg32b8, srcReg32b12)); + + // shift by 6 bit each 16 bit + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32); + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b11 = srcReg32b2; + srcReg32b2 = srcReg32b4; + srcReg32b7 = srcReg32b9; + } + if (i > 0) { + __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8; + // load the last 16 bytes + srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); + + // merge the last 2 results together + srcRegFilt4 = + _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = + _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), + _mm256_castsi256_si128(secondFilters)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6)); + + // shift by 6 bit each 16 bit + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128()); + + // save 8 bytes + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1); + } +} + static void aom_filter_block1d16_v8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; - __m256i addFilterReg64; + __m256i addFilterReg32; __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; __m256i srcReg32b11, srcReg32b12, filtersReg32; @@ -281,11 +671,11 @@ static void aom_filter_block1d16_v8_avx2( unsigned int i; ptrdiff_t src_stride, dst_stride; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + addFilterReg32 = _mm256_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); @@ -308,49 +698,26 @@ static void aom_filter_block1d16_v8_avx2( dst_stride = out_pitch << 1; // load 16 bytes 7 times in stride of src_pitch - srcReg32b1 = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr))); - srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); - srcReg32b3 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); - srcReg32b4 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); - srcReg32b5 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); - srcReg32b6 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr); + srcReg32b3 = + xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg32b5 = + xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4); srcReg32b7 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); // have each consecutive loads on the same 256 register - srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm256_castsi256_si128(srcReg32b2), 1); - srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm256_castsi256_si128(srcReg32b3), 1); - srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, - _mm256_castsi256_si128(srcReg32b4), 1); - srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, - _mm256_castsi256_si128(srcReg32b5), 1); - srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, - _mm256_castsi256_si128(srcReg32b6), 1); - srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, - _mm256_castsi256_si128(srcReg32b7), 1); - + srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21); + srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21); + srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21); // merge every two consecutive registers except the last one srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); // save srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); - - // save srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); - - // save srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); - - // save srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); for (i = output_height; i > 1; i -= 2) { @@ -383,9 +750,7 @@ static void aom_filter_block1d16_v8_avx2( // add and saturate the results together srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); + _mm256_adds_epi16(srcReg32b8, srcReg32b12)); // multiply 2 adjacent elements with the filter and add the result srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); @@ -399,16 +764,13 @@ static void aom_filter_block1d16_v8_avx2( // add and saturate the results together srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); + _mm256_adds_epi16(srcReg32b8, srcReg32b12)); - // shift by 7 bit each 16 bit - srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); - srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); + // shift by 6 bit each 16 bit + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32); + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6); + srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve @@ -417,12 +779,7 @@ static void aom_filter_block1d16_v8_avx2( src_ptr += src_stride; - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1)); - - // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + out_pitch), - _mm256_extractf128_si256(srcReg32b1, 1)); + xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1); output_ptr += dst_stride; @@ -475,24 +832,17 @@ static void aom_filter_block1d16_v8_avx2( // add and saturate the results together srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6)); + _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6)); srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7)); + _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7)); - // add and saturate the results together + // shift by 6 bit each 16 bit srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6)); + _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32)); srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7)); - - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); + _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6); + srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve @@ -506,21 +856,6 @@ static void aom_filter_block1d16_v8_avx2( #if HAVE_AVX2 && HAVE_SSSE3 filter8_1dfunction aom_filter_block1d4_v8_ssse3; -#if ARCH_X86_64 -filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3; -#define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_intrin_ssse3 -#define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_intrin_ssse3 -#define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_intrin_ssse3 -#else // ARCH_X86 -filter8_1dfunction aom_filter_block1d8_v8_ssse3; -filter8_1dfunction aom_filter_block1d8_h8_ssse3; -filter8_1dfunction aom_filter_block1d4_h8_ssse3; -#define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_ssse3 -#define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_ssse3 -#define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_ssse3 -#endif // ARCH_X86_64 filter8_1dfunction aom_filter_block1d16_v2_ssse3; filter8_1dfunction aom_filter_block1d16_h2_ssse3; filter8_1dfunction aom_filter_block1d8_v2_ssse3; diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h index 7790baf2e..72fabd236 100644 --- a/third_party/aom/aom_dsp/x86/convolve_avx2.h +++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h @@ -13,31 +13,27 @@ #define AOM_DSP_X86_CONVOLVE_AVX2_H_ // filters for 16 -DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, + 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, + 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, + 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; -DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, + 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, + 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, }; static INLINE void prepare_coeffs_lowbd( const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { const int16_t *const filter = av1_get_interp_filter_subpel_kernel( - *filter_params, subpel_q4 & SUBPEL_MASK); + filter_params, subpel_q4 & SUBPEL_MASK); const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); @@ -65,7 +61,7 @@ static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { const int16_t *filter = av1_get_interp_filter_subpel_kernel( - *filter_params, subpel_q4 & SUBPEL_MASK); + filter_params, subpel_q4 & SUBPEL_MASK); const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h index 846fe7bb4..399df5d6d 100644 --- a/third_party/aom/aom_dsp/x86/convolve_sse2.h +++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h @@ -19,7 +19,7 @@ static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, const int subpel_q4, __m128i *const coeffs /* [4] */) { const int16_t *filter = av1_get_interp_filter_subpel_kernel( - *filter_params, subpel_q4 & SUBPEL_MASK); + filter_params, subpel_q4 & SUBPEL_MASK); const __m128i coeff = _mm_loadu_si128((__m128i *)filter); // coeffs 0 1 0 1 0 1 0 1 diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c index e5e3238d5..099fcf7fc 100644 --- a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c +++ b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c @@ -105,8 +105,8 @@ void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride, void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { int i, j; @@ -254,8 +254,8 @@ void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { int i, j; diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c index f7ac9b496..e7b33d1c4 100644 --- a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c +++ b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c @@ -18,8 +18,8 @@ void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { @@ -166,8 +166,8 @@ void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c index fdfadc886..131c16aa9 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c @@ -676,7 +676,7 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, } } - const InterpFilterParams filter = + const InterpFilterParams *filter = av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); if (!subpel_x_q3 && !subpel_y_q3) { @@ -726,14 +726,14 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); const int intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1), + aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1), ref_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd); aom_highbd_convolve8_vert( - CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)), + CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y, 16, width, height, bd); } diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c index 9801e285c..eaf1f347b 100644 --- a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c +++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c @@ -22,118 +22,12 @@ void aom_var_filter_block2d_bil_first_pass_ssse3( const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter) { - // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow - // in computation using _mm_maddubs_epi16. - // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow. - const int16_t round = (1 << (FILTER_BITS - 1)) >> 1; - const __m128i r = _mm_set1_epi16(round); - const uint8_t f0 = filter[0] >> 1; - const uint8_t f1 = filter[1] >> 1; - const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1, - f0, f1, f0, f1, f0, f1); - const __m128i shuffle_mask = - _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); - unsigned int i, j; - (void)pixel_step; - - if (output_width >= 8) { - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 8) { - // load source - __m128i source_low = xx_loadl_64(a); - __m128i source_hi = _mm_setzero_si128(); - - // avoid load undefined memory - if (a + 8 != NULL) source_hi = xx_loadl_64(a + 8); - __m128i source = _mm_unpacklo_epi64(source_low, source_hi); - - // shuffle to: - // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], - // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } - __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); - - // b[i] = a[i] * filter[0] + a[i + 1] * filter[1] - __m128i res = _mm_maddubs_epi16(source_shuffle, filters); - - // round - res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); - - xx_storeu_128(b, res); - - a += 8; - b += 8; - } - - a += src_pixels_per_line - output_width; - } - } else { - for (i = 0; i < output_height; ++i) { - // load source, only first 5 values are meaningful: - // { a[0], a[1], a[2], a[3], a[4], xxxx } - __m128i source = xx_loadl_64(a); - - // shuffle, up to the first 8 are useful - // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], - // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } - __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); - - __m128i res = _mm_maddubs_epi16(source_shuffle, filters); - res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); - - xx_storel_64(b, res); - - a += src_pixels_per_line; - b += output_width; - } - } -} + unsigned int output_width, const uint8_t *filter); void aom_var_filter_block2d_bil_second_pass_ssse3( const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter) { - const int16_t round = (1 << FILTER_BITS) >> 1; - const __m128i r = _mm_set1_epi32(round); - const __m128i filters = - _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0], - filter[1], filter[0], filter[1]); - const __m128i shuffle_mask = - _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); - const __m128i mask = - _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - unsigned int i, j; - - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 4) { - // load source as: - // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] } - __m128i source1 = xx_loadl_64(a); - __m128i source2 = xx_loadl_64(a + pixel_step); - __m128i source = _mm_unpacklo_epi64(source1, source2); - - // shuffle source to: - // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] } - __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); - - // b[i] = a[i] * filter[0] + a[w + i] * filter[1] - __m128i res = _mm_madd_epi16(source_shuffle, filters); - - // round - res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS); - - // shuffle to get each lower 8 bit of every 32 bit - res = _mm_shuffle_epi8(res, mask); - - xx_storel_32(b, res); - - a += 4; - b += 4; - } - - a += src_pixels_per_line - output_width; - } -} + unsigned int output_width, const uint8_t *filter); static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1, const __m128i *w, const __m128i *r, diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c new file mode 100644 index 000000000..6538e4d5e --- /dev/null +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c @@ -0,0 +1,390 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdio.h> +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/blend.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86//masked_sad_intrin_ssse3.h" + +static INLINE unsigned int masked_sad32xh_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height) { + int x, y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_scale = + _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 32) { + const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]); + const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]); + const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]); + const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]); + const __m256i m_inv = _mm256_sub_epi8(mask_max, m); + + // Calculate 16 predicted pixels. + // Note that the maximum value of any entry of 'pred_l' or 'pred_r' + // is 64 * 255, so we have plenty of space to add rounding constants. + const __m256i data_l = _mm256_unpacklo_epi8(a, b); + const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv); + __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l); + pred_l = _mm256_mulhrs_epi16(pred_l, round_scale); + + const __m256i data_r = _mm256_unpackhi_epi8(a, b); + const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv); + __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r); + pred_r = _mm256_mulhrs_epi16(pred_r, round_scale); + + const __m256i pred = _mm256_packus_epi16(pred_l, pred_r); + res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. + res = _mm256_shuffle_epi32(res, 0xd8); + res = _mm256_permute4x64_epi64(res, 0xd8); + res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int32_t sad = _mm256_extract_epi32(res, 0); + return (sad + 31) >> 6; +} + +static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) { + __m128i a0 = _mm_lddqu_si128((const __m128i *)(lo)); + __m128i a1 = _mm_lddqu_si128((const __m128i *)(hi)); + __m256i a = _mm256_castsi128_si256(a0); + return _mm256_inserti128_si256(a, a1, 1); +} + +static INLINE unsigned int masked_sad16xh_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, + int height) { + int y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_scale = + _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + for (y = 0; y < height; y += 2) { + const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr); + const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr); + const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr); + const __m256i m = xx_loadu2_m128i(m_ptr + m_stride, m_ptr); + const __m256i m_inv = _mm256_sub_epi8(mask_max, m); + + // Calculate 16 predicted pixels. + // Note that the maximum value of any entry of 'pred_l' or 'pred_r' + // is 64 * 255, so we have plenty of space to add rounding constants. + const __m256i data_l = _mm256_unpacklo_epi8(a, b); + const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv); + __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l); + pred_l = _mm256_mulhrs_epi16(pred_l, round_scale); + + const __m256i data_r = _mm256_unpackhi_epi8(a, b); + const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv); + __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r); + pred_r = _mm256_mulhrs_epi16(pred_r, round_scale); + + const __m256i pred = _mm256_packus_epi16(pred_l, pred_r); + res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src)); + + src_ptr += src_stride << 1; + a_ptr += a_stride << 1; + b_ptr += b_stride << 1; + m_ptr += m_stride << 1; + } + // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. + res = _mm256_shuffle_epi32(res, 0xd8); + res = _mm256_permute4x64_epi64(res, 0xd8); + res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int32_t sad = _mm256_extract_epi32(res, 0); + return (sad + 31) >> 6; +} + +static INLINE unsigned int aom_masked_sad_avx2( + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, + int invert_mask, int m, int n) { + unsigned int sad; + if (!invert_mask) { + switch (m) { + case 4: + sad = aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + case 8: + sad = aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + case 16: + sad = masked_sad16xh_avx2(src, src_stride, ref, ref_stride, second_pred, + m, msk, msk_stride, n); + break; + default: + sad = masked_sad32xh_avx2(src, src_stride, ref, ref_stride, second_pred, + m, msk, msk_stride, m, n); + break; + } + } else { + switch (m) { + case 4: + sad = aom_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + case 8: + sad = aom_masked_sad8xh_ssse3(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + case 16: + sad = masked_sad16xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + default: + sad = masked_sad32xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, m, n); + break; + } + } + return sad; +} + +#define MASKSADMXN_AVX2(m, n) \ + unsigned int aom_masked_sad##m##x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ + int invert_mask) { \ + return aom_masked_sad_avx2(src, src_stride, ref, ref_stride, second_pred, \ + msk, msk_stride, invert_mask, m, n); \ + } + +MASKSADMXN_AVX2(4, 4) +MASKSADMXN_AVX2(4, 8) +MASKSADMXN_AVX2(8, 4) +MASKSADMXN_AVX2(8, 8) +MASKSADMXN_AVX2(8, 16) +MASKSADMXN_AVX2(16, 8) +MASKSADMXN_AVX2(16, 16) +MASKSADMXN_AVX2(16, 32) +MASKSADMXN_AVX2(32, 16) +MASKSADMXN_AVX2(32, 32) +MASKSADMXN_AVX2(32, 64) +MASKSADMXN_AVX2(64, 32) +MASKSADMXN_AVX2(64, 64) +MASKSADMXN_AVX2(64, 128) +MASKSADMXN_AVX2(128, 64) +MASKSADMXN_AVX2(128, 128) +MASKSADMXN_AVX2(4, 16) +MASKSADMXN_AVX2(16, 4) +MASKSADMXN_AVX2(8, 32) +MASKSADMXN_AVX2(32, 8) +MASKSADMXN_AVX2(16, 64) +MASKSADMXN_AVX2(64, 16) + +static INLINE unsigned int highbd_masked_sad8xh_avx2( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_const = + _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m256i one = _mm256_set1_epi16(1); + + for (y = 0; y < height; y += 2) { + const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr); + const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr); + const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr); + // Zero-extend mask to 16 bits + const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(m_ptr)), + _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride)))); + const __m256i m_inv = _mm256_sub_epi16(mask_max, m); + + const __m256i data_l = _mm256_unpacklo_epi16(a, b); + const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv); + __m256i pred_l = _mm256_madd_epi16(data_l, mask_l); + pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m256i data_r = _mm256_unpackhi_epi16(a, b); + const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv); + __m256i pred_r = _mm256_madd_epi16(data_r, mask_r); + pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, + // so it is safe to do signed saturation here. + const __m256i pred = _mm256_packs_epi32(pred_l, pred_r); + // There is no 16-bit SAD instruction, so we have to synthesize + // an 8-element SAD. We do this by storing 4 32-bit partial SADs, + // and accumulating them at the end + const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src)); + res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one)); + + src_ptr += src_stride << 1; + a_ptr += a_stride << 1; + b_ptr += b_stride << 1; + m_ptr += m_stride << 1; + } + // At this point, we have four 32-bit partial SADs stored in 'res'. + res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); + return (sad + 31) >> 6; +} + +static INLINE unsigned int highbd_masked_sad16xh_avx2( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int x, y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_const = + _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m256i one = _mm256_set1_epi16(1); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]); + const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]); + const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]); + // Zero-extend mask to 16 bits + const __m256i m = + _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x])); + const __m256i m_inv = _mm256_sub_epi16(mask_max, m); + + const __m256i data_l = _mm256_unpacklo_epi16(a, b); + const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv); + __m256i pred_l = _mm256_madd_epi16(data_l, mask_l); + pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m256i data_r = _mm256_unpackhi_epi16(a, b); + const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv); + __m256i pred_r = _mm256_madd_epi16(data_r, mask_r); + pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, + // so it is safe to do signed saturation here. + const __m256i pred = _mm256_packs_epi32(pred_l, pred_r); + // There is no 16-bit SAD instruction, so we have to synthesize + // an 8-element SAD. We do this by storing 4 32-bit partial SADs, + // and accumulating them at the end + const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src)); + res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // At this point, we have four 32-bit partial SADs stored in 'res'. + res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); + return (sad + 31) >> 6; +} + +static INLINE unsigned int aom_highbd_masked_sad_avx2( + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, + int invert_mask, int m, int n) { + unsigned int sad; + if (!invert_mask) { + switch (m) { + case 4: + sad = + aom_highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + case 8: + sad = highbd_masked_sad8xh_avx2(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + default: + sad = highbd_masked_sad16xh_avx2(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, m, n); + break; + } + } else { + switch (m) { + case 4: + sad = + aom_highbd_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + case 8: + sad = highbd_masked_sad8xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + default: + sad = highbd_masked_sad16xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, m, n); + break; + } + } + return sad; +} + +#define HIGHBD_MASKSADMXN_AVX2(m, n) \ + unsigned int aom_highbd_masked_sad##m##x##n##_avx2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + return aom_highbd_masked_sad_avx2(src8, src_stride, ref8, ref_stride, \ + second_pred8, msk, msk_stride, \ + invert_mask, m, n); \ + } + +HIGHBD_MASKSADMXN_AVX2(4, 4); +HIGHBD_MASKSADMXN_AVX2(4, 8); +HIGHBD_MASKSADMXN_AVX2(8, 4); +HIGHBD_MASKSADMXN_AVX2(8, 8); +HIGHBD_MASKSADMXN_AVX2(8, 16); +HIGHBD_MASKSADMXN_AVX2(16, 8); +HIGHBD_MASKSADMXN_AVX2(16, 16); +HIGHBD_MASKSADMXN_AVX2(16, 32); +HIGHBD_MASKSADMXN_AVX2(32, 16); +HIGHBD_MASKSADMXN_AVX2(32, 32); +HIGHBD_MASKSADMXN_AVX2(32, 64); +HIGHBD_MASKSADMXN_AVX2(64, 32); +HIGHBD_MASKSADMXN_AVX2(64, 64); +HIGHBD_MASKSADMXN_AVX2(64, 128); +HIGHBD_MASKSADMXN_AVX2(128, 64); +HIGHBD_MASKSADMXN_AVX2(128, 128); +HIGHBD_MASKSADMXN_AVX2(4, 16); +HIGHBD_MASKSADMXN_AVX2(16, 4); +HIGHBD_MASKSADMXN_AVX2(8, 32); +HIGHBD_MASKSADMXN_AVX2(32, 8); +HIGHBD_MASKSADMXN_AVX2(16, 64); +HIGHBD_MASKSADMXN_AVX2(64, 16); diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c index 1f42eec2f..493f9bd8f 100644 --- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c @@ -19,6 +19,8 @@ #include "aom/aom_integer.h" #include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86//masked_sad_intrin_ssse3.h" + // For width a multiple of 16 static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, int src_stride, @@ -27,16 +29,6 @@ static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, const uint8_t *m_ptr, int m_stride, int width, int height); -static INLINE unsigned int masked_sad8xh_ssse3( - const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, - int height); - -static INLINE unsigned int masked_sad4xh_ssse3( - const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, - int height); - #define MASKSADMXN_SSSE3(m, n) \ unsigned int aom_masked_sad##m##x##n##_ssse3( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ @@ -56,11 +48,11 @@ static INLINE unsigned int masked_sad4xh_ssse3( const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ int invert_mask) { \ if (!invert_mask) \ - return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, \ - second_pred, 8, msk, msk_stride, n); \ + return aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, \ + second_pred, 8, msk, msk_stride, n); \ else \ - return masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref, \ - ref_stride, msk, msk_stride, n); \ + return aom_masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref, \ + ref_stride, msk, msk_stride, n); \ } #define MASKSAD4XN_SSSE3(n) \ @@ -69,11 +61,11 @@ static INLINE unsigned int masked_sad4xh_ssse3( const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ int invert_mask) { \ if (!invert_mask) \ - return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, \ - second_pred, 4, msk, msk_stride, n); \ + return aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, \ + second_pred, 4, msk, msk_stride, n); \ else \ - return masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref, \ - ref_stride, msk, msk_stride, n); \ + return aom_masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref, \ + ref_stride, msk, msk_stride, n); \ } MASKSADMXN_SSSE3(128, 128) @@ -145,10 +137,11 @@ static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, return (sad + 31) >> 6; } -static INLINE unsigned int masked_sad8xh_ssse3( - const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, - int height) { +unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height) { int y; __m128i res = _mm_setzero_si128(); const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); @@ -189,10 +182,11 @@ static INLINE unsigned int masked_sad8xh_ssse3( return (sad + 31) >> 6; } -static INLINE unsigned int masked_sad4xh_ssse3( - const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, - int height) { +unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height) { int y; __m128i res = _mm_setzero_si128(); const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); @@ -238,11 +232,6 @@ static INLINE unsigned int highbd_masked_sad_ssse3( const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height); -static INLINE unsigned int highbd_masked_sad4xh_ssse3( - const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, - int height); - #define HIGHBD_MASKSADMXN_SSSE3(m, n) \ unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \ const uint8_t *src8, int src_stride, const uint8_t *ref8, \ @@ -262,11 +251,13 @@ static INLINE unsigned int highbd_masked_sad4xh_ssse3( int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ int msk_stride, int invert_mask) { \ if (!invert_mask) \ - return highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, ref_stride, \ - second_pred8, 4, msk, msk_stride, n); \ + return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, \ + ref_stride, second_pred8, 4, msk, \ + msk_stride, n); \ else \ - return highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \ - ref8, ref_stride, msk, msk_stride, n); \ + return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \ + ref8, ref_stride, msk, msk_stride, \ + n); \ } HIGHBD_MASKSADMXN_SSSE3(128, 128) @@ -350,10 +341,11 @@ static INLINE unsigned int highbd_masked_sad_ssse3( return (sad + 31) >> 6; } -static INLINE unsigned int highbd_masked_sad4xh_ssse3( - const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, - int height) { +unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, + const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height) { const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h new file mode 100644 index 000000000..19b429d91 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H +#define _AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H + +unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height); + +unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height); + +unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, + const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height); + +#endif diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c new file mode 100644 index 000000000..2aa2a0555 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <immintrin.h> + +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/obmc_intrinsic_ssse3.h" +#include "aom_dsp/x86/synonyms.h" + +//////////////////////////////////////////////////////////////////////////////// +// 8 bit +//////////////////////////////////////////////////////////////////////////////// + +static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + + do { + const __m128i v_p_b_0 = xx_loadl_32(pre); + const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride); + const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1); + const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d); + + const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d); + const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d); + + // Rounded absolute difference + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d); + const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d); + + n += 8; + pre += pre_stride << 1; + } while (n < 8 * (height >> 1)); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +static INLINE unsigned int obmc_sad_w8n_avx2( + const uint8_t *pre, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const int pre_step = pre_stride - width; + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p0_b = xx_loadl_64(pre + n); + const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); + + const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); + const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d); + + // Rounded absolute difference + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d); + const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d); + + n += 8; + + if ((n & (width - 1)) == 0) pre += pre_step; + } while (n < width * height); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +#define OBMCSADWXH(w, h) \ + unsigned int aom_obmc_sad##w##x##h##_avx2( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *msk) { \ + if (w == 4) { \ + return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h); \ + } else { \ + return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \ + } \ + } + +OBMCSADWXH(128, 128) +OBMCSADWXH(128, 64) +OBMCSADWXH(64, 128) +OBMCSADWXH(64, 64) +OBMCSADWXH(64, 32) +OBMCSADWXH(32, 64) +OBMCSADWXH(32, 32) +OBMCSADWXH(32, 16) +OBMCSADWXH(16, 32) +OBMCSADWXH(16, 16) +OBMCSADWXH(16, 8) +OBMCSADWXH(8, 16) +OBMCSADWXH(8, 8) +OBMCSADWXH(8, 4) +OBMCSADWXH(4, 8) +OBMCSADWXH(4, 4) +OBMCSADWXH(4, 16) +OBMCSADWXH(16, 4) +OBMCSADWXH(8, 32) +OBMCSADWXH(32, 8) +OBMCSADWXH(16, 64) +OBMCSADWXH(64, 16) + +//////////////////////////////////////////////////////////////////////////////// +// High bit-depth +//////////////////////////////////////////////////////////////////////////////// + +static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + do { + const __m128i v_p_w_0 = xx_loadl_64(pre); + const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride); + const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1); + const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d); + + const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d); + const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d); + + // Rounded absolute difference + + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d); + const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d); + + n += 8; + + pre += pre_stride << 1; + } while (n < 8 * (height >> 1)); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +static INLINE unsigned int hbd_obmc_sad_w8n_avx2( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - width; + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n)); + const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); + + const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); + const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d); + + // Rounded absolute difference + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d); + const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d); + + n += 8; + + if (n % width == 0) pre += pre_step; + } while (n < width * height); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +#define HBD_OBMCSADWXH(w, h) \ + unsigned int aom_highbd_obmc_sad##w##x##h##_avx2( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + if (w == 4) { \ + return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h); \ + } else { \ + return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \ + } \ + } + +HBD_OBMCSADWXH(128, 128) +HBD_OBMCSADWXH(128, 64) +HBD_OBMCSADWXH(64, 128) +HBD_OBMCSADWXH(64, 64) +HBD_OBMCSADWXH(64, 32) +HBD_OBMCSADWXH(32, 64) +HBD_OBMCSADWXH(32, 32) +HBD_OBMCSADWXH(32, 16) +HBD_OBMCSADWXH(16, 32) +HBD_OBMCSADWXH(16, 16) +HBD_OBMCSADWXH(16, 8) +HBD_OBMCSADWXH(8, 16) +HBD_OBMCSADWXH(8, 8) +HBD_OBMCSADWXH(8, 4) +HBD_OBMCSADWXH(4, 8) +HBD_OBMCSADWXH(4, 4) +HBD_OBMCSADWXH(4, 16) +HBD_OBMCSADWXH(16, 4) +HBD_OBMCSADWXH(8, 32) +HBD_OBMCSADWXH(32, 8) +HBD_OBMCSADWXH(16, 64) +HBD_OBMCSADWXH(64, 16) diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c index 571aa770b..2e2f6e09f 100644 --- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c +++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c @@ -26,6 +26,16 @@ // 8 bit //////////////////////////////////////////////////////////////////////////////// +void aom_var_filter_block2d_bil_first_pass_ssse3( + const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + +void aom_var_filter_block2d_bil_second_pass_ssse3( + const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *const sse, int *const sum, @@ -152,6 +162,46 @@ OBMCVARWXH(32, 8) OBMCVARWXH(16, 64) OBMCVARWXH(64, 16) +#include "config/aom_dsp_rtcd.h" + +#define OBMC_SUBPIX_VAR(W, H) \ + uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + aom_var_filter_block2d_bil_first_pass_ssse3( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_ssse3( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse); \ + } + +OBMC_SUBPIX_VAR(128, 128) +OBMC_SUBPIX_VAR(128, 64) +OBMC_SUBPIX_VAR(64, 128) +OBMC_SUBPIX_VAR(64, 64) +OBMC_SUBPIX_VAR(64, 32) +OBMC_SUBPIX_VAR(32, 64) +OBMC_SUBPIX_VAR(32, 32) +OBMC_SUBPIX_VAR(32, 16) +OBMC_SUBPIX_VAR(16, 32) +OBMC_SUBPIX_VAR(16, 16) +OBMC_SUBPIX_VAR(16, 8) +OBMC_SUBPIX_VAR(8, 16) +OBMC_SUBPIX_VAR(8, 8) +OBMC_SUBPIX_VAR(8, 4) +OBMC_SUBPIX_VAR(4, 8) +OBMC_SUBPIX_VAR(4, 4) +OBMC_SUBPIX_VAR(4, 16) +OBMC_SUBPIX_VAR(16, 4) +OBMC_SUBPIX_VAR(8, 32) +OBMC_SUBPIX_VAR(32, 8) +OBMC_SUBPIX_VAR(16, 64) +OBMC_SUBPIX_VAR(64, 16) + //////////////////////////////////////////////////////////////////////////////// // High bit-depth //////////////////////////////////////////////////////////////////////////////// diff --git a/third_party/aom/aom_dsp/x86/subtract_avx2.c b/third_party/aom/aom_dsp/x86/subtract_avx2.c new file mode 100644 index 000000000..4389d123d --- /dev/null +++ b/third_party/aom/aom_dsp/x86/subtract_avx2.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr, + const uint8_t *pred_ptr) { + __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr)); + __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr)); + __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s)); + __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1)); + __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p)); + __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1)); + const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); + const __m256i d_1 = _mm256_sub_epi16(s_1, p_1); + _mm256_store_si256((__m256i *)(diff_ptr), d_0); + _mm256_store_si256((__m256i *)(diff_ptr + 16), d_1); +} + +static INLINE void aom_subtract_block_16xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + for (int32_t j = 0; j < rows; ++j) { + __m128i s = _mm_lddqu_si128((__m128i *)(src_ptr)); + __m128i p = _mm_lddqu_si128((__m128i *)(pred_ptr)); + __m256i s_0 = _mm256_cvtepu8_epi16(s); + __m256i p_0 = _mm256_cvtepu8_epi16(p); + const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); + _mm256_store_si256((__m256i *)(diff_ptr), d_0); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +static INLINE void aom_subtract_block_32xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + for (int32_t j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +static INLINE void aom_subtract_block_64xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + for (int32_t j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +static INLINE void aom_subtract_block_128xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + for (int32_t j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32); + subtract32_avx2(diff_ptr + 64, src_ptr + 64, pred_ptr + 64); + subtract32_avx2(diff_ptr + 96, src_ptr + 96, pred_ptr + 96); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { + switch (cols) { + case 16: + aom_subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + case 32: + aom_subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + case 64: + aom_subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + case 128: + aom_subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + default: + aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + } +} diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h new file mode 100644 index 000000000..bdff64b8f --- /dev/null +++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H_ +#define AOM_DSP_X86_TXFM_COMMON_AVX2_H_ + +#include <emmintrin.h> +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output, + int8_t cos_bit); + +static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) { + return _mm256_set1_epi32( + (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); +} + +static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1, + __m256i *in0, __m256i *in1, const __m256i _r, + const int32_t cos_bit) { + __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1); + __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1); + __m256i u0 = _mm256_madd_epi16(t0, w0); + __m256i u1 = _mm256_madd_epi16(t1, w0); + __m256i v0 = _mm256_madd_epi16(t0, w1); + __m256i v1 = _mm256_madd_epi16(t1, w1); + + __m256i a0 = _mm256_add_epi32(u0, _r); + __m256i a1 = _mm256_add_epi32(u1, _r); + __m256i b0 = _mm256_add_epi32(v0, _r); + __m256i b1 = _mm256_add_epi32(v1, _r); + + __m256i c0 = _mm256_srai_epi32(a0, cos_bit); + __m256i c1 = _mm256_srai_epi32(a1, cos_bit); + __m256i d0 = _mm256_srai_epi32(b0, cos_bit); + __m256i d1 = _mm256_srai_epi32(b1, cos_bit); + + *in0 = _mm256_packs_epi32(c0, c1); + *in1 = _mm256_packs_epi32(d0, d1); +} + +static INLINE void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) { + const __m256i _in0 = *in0; + const __m256i _in1 = *in1; + *in0 = _mm256_adds_epi16(_in0, _in1); + *in1 = _mm256_subs_epi16(_in0, _in1); +} + +static INLINE void btf_32_add_sub_avx2(__m256i *in0, __m256i *in1) { + const __m256i _in0 = *in0; + const __m256i _in1 = *in1; + *in0 = _mm256_add_epi32(_in0, _in1); + *in1 = _mm256_sub_epi32(_in0, _in1); +} + +static INLINE void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1, + __m256i in0, __m256i in1) { + const __m256i _in0 = in0; + const __m256i _in1 = in1; + *out0 = _mm256_adds_epi16(_in0, _in1); + *out1 = _mm256_subs_epi16(_in0, _in1); +} + +static INLINE void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1, + __m256i in0, __m256i in1) { + const __m256i _in0 = in0; + const __m256i _in1 = in1; + *out0 = _mm256_add_epi32(_in0, _in1); + *out1 = _mm256_sub_epi32(_in0, _in1); +} + +static INLINE __m256i load_16bit_to_16bit_avx2(const int16_t *a) { + return _mm256_load_si256((const __m256i *)a); +} + +static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in, + int stride, __m256i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_16bit_to_16bit_avx2(in + i * stride); + } +} + +static INLINE void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in, + int stride, + __m256i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[out_size - i - 1] = load_16bit_to_16bit_avx2(in + i * stride); + } +} + +static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) { + const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a); + const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8)); + return _mm256_permute4x64_epi64(b, 0xD8); +} + +static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in, + int stride, __m256i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride); + } +} + +static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, + __m256i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 08 09 0a 0b 04 05 06 07 0c 0d 0e 0f + // in[1]: 10 11 12 13 18 19 1a 1b 14 15 16 17 1c 1d 1e 1f + // in[2]: 20 21 22 23 28 29 2a 2b 24 25 26 27 2c 2d 2e 2f + // in[3]: 30 31 32 33 38 39 3a 3b 34 35 36 37 3c 3d 3e 3f + // in[4]: 40 41 42 43 48 49 4a 4b 44 45 46 47 4c 4d 4e 4f + // in[5]: 50 51 52 53 58 59 5a 5b 54 55 56 57 5c 5d 5e 5f + // in[6]: 60 61 62 63 68 69 6a 6b 64 65 66 67 6c 6d 6e 6f + // in[7]: 70 71 72 73 78 79 7a 7b 74 75 76 77 7c 7d 7e 7f + // in[8]: 80 81 82 83 88 89 8a 8b 84 85 86 87 8c 8d 8e 8f + // to: + // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // ... + __m256i a[16]; + for (int i = 0; i < 16; i += 2) { + a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]); + a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]); + } + __m256i b[16]; + for (int i = 0; i < 16; i += 2) { + b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]); + b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]); + } + __m256i c[16]; + for (int i = 0; i < 16; i += 2) { + c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]); + c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]); + } + out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20); + out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20); + out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20); + out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20); + + out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31); + out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31); + out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31); + out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31); + + out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20); + out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20); + out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20); + out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20); + + out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31); + out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31); + out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31); + out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31); +} + +static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) { + for (int i = 0; i < size; ++i) { + out[size - i - 1] = in[i]; + } +} + +static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) { + if (bit < 0) { + bit = -bit; + __m256i round = _mm256_set1_epi16(1 << (bit - 1)); + for (int i = 0; i < size; ++i) { + in[i] = _mm256_adds_epi16(in[i], round); + in[i] = _mm256_srai_epi16(in[i], bit); + } + } else if (bit > 0) { + for (int i = 0; i < size; ++i) { + in[i] = _mm256_slli_epi16(in[i], bit); + } + } +} + +#ifdef __cplusplus +} +#endif + +#endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c index 7d6b7d287..a7ac2c93d 100644 --- a/third_party/aom/aom_dsp/x86/variance_avx2.c +++ b/third_party/aom/aom_dsp/x86/variance_avx2.c @@ -324,6 +324,12 @@ static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) { return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); } +static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) { + const __m256i d = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1)); + return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); +} + static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1, const __m256i a, uint8_t *comp_pred) { @@ -401,3 +407,110 @@ void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, } while (i < height); } } + +static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0, + const __m256i s1, + const __m256i a) { + const __m256i alpha_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_const = + _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m256i a_inv = _mm256_sub_epi16(alpha_max, a); + + const __m256i s_lo = _mm256_unpacklo_epi16(s0, s1); + const __m256i a_lo = _mm256_unpacklo_epi16(a, a_inv); + const __m256i pred_lo = _mm256_madd_epi16(s_lo, a_lo); + const __m256i pred_l = _mm256_srai_epi32( + _mm256_add_epi32(pred_lo, round_const), AOM_BLEND_A64_ROUND_BITS); + + const __m256i s_hi = _mm256_unpackhi_epi16(s0, s1); + const __m256i a_hi = _mm256_unpackhi_epi16(a, a_inv); + const __m256i pred_hi = _mm256_madd_epi16(s_hi, a_hi); + const __m256i pred_h = _mm256_srai_epi32( + _mm256_add_epi32(pred_hi, round_const), AOM_BLEND_A64_ROUND_BITS); + + const __m256i comp = _mm256_packs_epi32(pred_l, pred_h); + + return comp; +} + +void aom_highbd_comp_mask_pred_avx2(uint16_t *comp_pred, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + int i = 0; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + const uint16_t *src0 = invert_mask ? pred : ref; + const uint16_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? ref_stride : width; + const __m256i zero = _mm256_setzero_si256(); + + if (width == 8) { + do { + const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0); + const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1); + + const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask); + const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8)); + + __m256i m = _mm256_castsi128_si256(m_l); + m = _mm256_insertf128_si256(m, m_h, 1); + const __m256i m_16 = _mm256_unpacklo_epi8(m, zero); + + const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); + + _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp)); + + _mm_storeu_si128((__m128i *)(comp_pred + width), + _mm256_extractf128_si256(comp, 1)); + + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + comp_pred += (width << 1); + i += 2; + } while (i < height); + } else if (width == 16) { + do { + const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0)); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1)); + const __m256i m_16 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask)); + + const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); + + _mm256_storeu_si256((__m256i *)comp_pred, comp); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } else if (width == 32) { + do { + const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0); + const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + 16)); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1); + const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + 16)); + + const __m256i m01_16 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask)); + const __m256i m23_16 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + 16))); + + const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16); + const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16); + + _mm256_storeu_si256((__m256i *)comp_pred, comp); + _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } +} diff --git a/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c new file mode 100644 index 000000000..66b0d7d84 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" + +void aom_var_filter_block2d_bil_first_pass_ssse3( + const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { + // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow + // in computation using _mm_maddubs_epi16. + // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow. + const int16_t round = (1 << (FILTER_BITS - 1)) >> 1; + const __m128i r = _mm_set1_epi16(round); + const uint8_t f0 = filter[0] >> 1; + const uint8_t f1 = filter[1] >> 1; + const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1, + f0, f1, f0, f1, f0, f1); + unsigned int i, j; + (void)pixel_step; + + if (output_width >= 8) { + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 8) { + // load source + __m128i source_low = xx_loadl_64(a); + __m128i source_hi = xx_loadl_64(a + 1); + + // unpack to: + // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], + // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } + __m128i source = _mm_unpacklo_epi8(source_low, source_hi); + + // b[i] = a[i] * filter[0] + a[i + 1] * filter[1] + __m128i res = _mm_maddubs_epi16(source, filters); + + // round + res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); + + xx_storeu_128(b, res); + + a += 8; + b += 8; + } + + a += src_pixels_per_line - output_width; + } + } else { + const __m128i shuffle_mask = + _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + for (i = 0; i < output_height; ++i) { + // load source, only first 5 values are meaningful: + // { a[0], a[1], a[2], a[3], a[4], xxxx } + __m128i source = xx_loadl_64(a); + + // shuffle, up to the first 8 are useful + // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], + // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } + __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); + + __m128i res = _mm_maddubs_epi16(source_shuffle, filters); + res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); + + xx_storel_64(b, res); + + a += src_pixels_per_line; + b += output_width; + } + } +} + +void aom_var_filter_block2d_bil_second_pass_ssse3( + const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { + const int16_t round = (1 << FILTER_BITS) >> 1; + const __m128i r = _mm_set1_epi32(round); + const __m128i filters = + _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0], + filter[1], filter[0], filter[1]); + const __m128i shuffle_mask = + _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); + const __m128i mask = + _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 4) { + // load source as: + // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] } + __m128i source1 = xx_loadl_64(a); + __m128i source2 = xx_loadl_64(a + pixel_step); + __m128i source = _mm_unpacklo_epi64(source1, source2); + + // shuffle source to: + // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] } + __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); + + // b[i] = a[i] * filter[0] + a[w + i] * filter[1] + __m128i res = _mm_madd_epi16(source_shuffle, filters); + + // round + res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS); + + // shuffle to get each lower 8 bit of every 32 bit + res = _mm_shuffle_epi8(res, mask); + + xx_storel_32(b, res); + + a += 4; + b += 4; + } + + a += src_pixels_per_line - output_width; + } +} diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c index c8c90a7dc..7e3c5d5db 100644 --- a/third_party/aom/aom_dsp/x86/variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/variance_sse2.c @@ -569,7 +569,7 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, } } - const InterpFilterParams filter = + const InterpFilterParams *filter = av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); if (!subpel_x_q3 && !subpel_y_q3) { @@ -633,12 +633,12 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); const int intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), ref_stride, - temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, - intermediate_height); - aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1), + aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1), + ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, + width, intermediate_height); + aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1), MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, width, height); } |