summaryrefslogtreecommitdiffstats
path: root/third_party/aom/aom_dsp/x86
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/aom/aom_dsp/x86')
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c619
-rw-r--r--third_party/aom/aom_dsp/x86/convolve_avx2.h32
-rw-r--r--third_party/aom/aom_dsp/x86/convolve_sse2.h2
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c8
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c8
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_variance_sse2.c8
-rw-r--r--third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c110
-rw-r--r--third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c390
-rw-r--r--third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c70
-rw-r--r--third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h33
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_sad_avx2.c270
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_variance_sse4.c50
-rw-r--r--third_party/aom/aom_dsp/x86/subtract_avx2.c108
-rw-r--r--third_party/aom/aom_dsp/x86/txfm_common_avx2.h199
-rw-r--r--third_party/aom/aom_dsp/x86/variance_avx2.c113
-rw-r--r--third_party/aom/aom_dsp/x86/variance_impl_ssse3.c129
-rw-r--r--third_party/aom/aom_dsp/x86/variance_sse2.c12
17 files changed, 1835 insertions, 326 deletions
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
index af45a03ac..f3fe50372 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -41,20 +41,290 @@
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif // __clang__
+static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr,
+ const ptrdiff_t stride, const __m256i *a) {
+ *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a));
+ *((uint32_t *)(output_ptr + stride)) =
+ _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1));
+}
+
+static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
+ __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
+ a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
+ return a;
+}
+
+static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr,
+ const ptrdiff_t stride, const __m256i *a) {
+ _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
+ _mm_storel_epi64((__m128i *)(output_ptr + stride),
+ _mm256_extractf128_si256(*a, 1));
+}
+
+static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
+ __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
+ a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
+ return a;
+}
+
+static INLINE void xx_store2_mi128(const uint8_t *output_ptr,
+ const ptrdiff_t stride, const __m256i *a) {
+ _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
+ _mm_store_si128((__m128i *)(output_ptr + stride),
+ _mm256_extractf128_si256(*a, 1));
+}
+
+static void aom_filter_block1d4_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt1Reg, filt2Reg;
+ __m256i firstFilters, secondFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2;
+ __m256i srcReg32b1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 32 bits
+ firstFilters = _mm256_shuffle_epi32(filtersReg32, 0);
+ // duplicate only the second 32 bits
+ secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55);
+
+ filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2);
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32));
+
+ // multiple the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+ // filter the source buffer
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
+
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+ srcRegFilt32b1_1 =
+ _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve result
+ srcRegFilt32b1_1 =
+ _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+ // if the number of strides is odd.
+ // process only 4 bytes
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+ __m128i srcRegFilt2;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+
+ // filter the source buffer
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
+
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+ srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 4 bytes
+ *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
+ }
+}
+
+static void aom_filter_block1d8_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 256 bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+ // duplicate only the second 16 bits (third and forth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+ // duplicate only the forth 16 bits (seventh and eighth byte)
+ // across 256 bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+ filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ // multiple the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve result
+ srcRegFilt32b1_1 =
+ _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+ // if the number of strides is odd.
+ // process only 8 bytes
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+ __m128i srcRegFilt2, srcRegFilt3;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+ // filter the source buffer
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 8 bytes
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
+ }
+}
+
static void aom_filter_block1d16_h8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
__m128i filtersReg;
- __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+ __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
__m256i firstFilters, secondFilters, thirdFilters, forthFilters;
__m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
__m256i srcReg32b1, srcReg32b2, filtersReg32;
unsigned int i;
ptrdiff_t src_stride, dst_stride;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
// converting the 16 bit (short) to 8 bit (byte) and have the same data
// in both lanes of 128 bit register.
filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
@@ -74,22 +344,17 @@ static void aom_filter_block1d16_h8_avx2(
// across 256 bit register
forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
- filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
- filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
- filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
- filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+ filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
// multiple the size of the source and destination stride by two
src_stride = src_pixels_per_line << 1;
dst_stride = output_pitch << 1;
for (i = output_height; i > 1; i -= 2) {
// load the 2 strides of source
- srcReg32b1 =
- _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3)));
- srcReg32b1 = _mm256_inserti128_si256(
- srcReg32b1,
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
- 1);
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
// filter the source buffer
srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
@@ -110,22 +375,13 @@ static void aom_filter_block1d16_h8_avx2(
srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
- // add and saturate the results together
- srcRegFilt32b1_1 = _mm256_adds_epi16(
- srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
+ __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
// reading 2 strides of the next 16 bytes
// (part of it was being read by earlier read)
srcReg32b2 =
- _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5)));
- srcReg32b2 = _mm256_inserti128_si256(
- srcReg32b2,
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
- 1);
-
- // add and saturate the results together
- srcRegFilt32b1_1 = _mm256_adds_epi16(
- srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
+ xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
// filter the source buffer
srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
@@ -148,32 +404,21 @@ static void aom_filter_block1d16_h8_avx2(
// add and saturate the results together
srcRegFilt32b2_1 = _mm256_adds_epi16(
- srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
- srcRegFilt32b2_1 = _mm256_adds_epi16(
- srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
-
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
+ srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2));
- srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);
-
- // shift by 7 bit each 16 bit
- srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
- srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+ srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
// shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
+ // convolve result and the second lane contain the second convolve result
srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
src_ptr += src_stride;
- // save 16 bytes
- _mm_store_si128((__m128i *)output_ptr,
- _mm256_castsi256_si128(srcRegFilt32b1_1));
-
- // save the next 16 bits
- _mm_store_si128((__m128i *)(output_ptr + output_pitch),
- _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
+ xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
output_ptr += dst_stride;
}
@@ -183,7 +428,7 @@ static void aom_filter_block1d16_h8_avx2(
__m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
__m128i srcRegFilt2, srcRegFilt3;
- srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
// filter the source buffer
srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
@@ -210,15 +455,11 @@ static void aom_filter_block1d16_h8_avx2(
// add and saturate the results together
srcRegFilt1_1 =
- _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+ _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
// reading the next 16 bytes
// (part of it was being read by earlier read)
- srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
-
- // add and saturate the results together
- srcRegFilt1_1 =
- _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
// filter the source buffer
srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
@@ -245,19 +486,16 @@ static void aom_filter_block1d16_h8_avx2(
// add and saturate the results together
srcRegFilt2_1 =
- _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
- srcRegFilt2_1 =
- _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+ _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
+ // shift by 6 bit each 16 bit
srcRegFilt1_1 =
- _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64));
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
srcRegFilt2_1 =
- _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64));
-
- // shift by 7 bit each 16 bit
- srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
- srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
+ _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6);
// shrink to 8 bit each 16 bits, the first lane contain the first
// convolve result and the second lane contain the second convolve
@@ -269,11 +507,163 @@ static void aom_filter_block1d16_h8_avx2(
}
}
+static void aom_filter_block1d8_v8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32;
+ __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
+ __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
+ __m256i srcReg32b11, srcReg32b12, filtersReg32;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the
+ // same data in both lanes of 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 256 bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+ // duplicate only the second 16 bits (third and forth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+ // duplicate only the forth 16 bits (seventh and eighth byte)
+ // across 256 bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+ // multiple the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ // load 16 bytes 7 times in stride of src_pitch
+ srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr);
+ srcReg32b3 =
+ xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg32b5 =
+ xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
+ srcReg32b7 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+
+ // have each consecutive loads on the same 256 register
+ srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
+ srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
+ srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
+ // merge every two consecutive registers except the last one
+ srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
+ srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
+ srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load the last 2 loads of 16 bytes and have every two
+ // consecutive loads in the same 256 bit register
+ srcReg32b8 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)));
+ srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
+ _mm256_castsi256_si128(srcReg32b8), 1);
+ srcReg32b9 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8)));
+ srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
+ _mm256_castsi256_si128(srcReg32b9), 1);
+
+ // merge every two consecutive registers
+ // save
+ srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+ _mm256_adds_epi16(srcReg32b8, srcReg32b12));
+
+ // shift by 6 bit each 16 bit
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
+ srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg32b10 = srcReg32b11;
+ srcReg32b11 = srcReg32b2;
+ srcReg32b2 = srcReg32b4;
+ srcReg32b7 = srcReg32b9;
+ }
+ if (i > 0) {
+ __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8;
+ // load the last 16 bytes
+ srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+
+ // merge the last 2 results together
+ srcRegFilt4 =
+ _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt4 =
+ _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
+ _mm256_castsi256_si128(secondFilters));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
+ _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve result
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128());
+
+ // save 8 bytes
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1);
+ }
+}
+
static void aom_filter_block1d16_v8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
__m128i filtersReg;
- __m256i addFilterReg64;
+ __m256i addFilterReg32;
__m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
__m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
__m256i srcReg32b11, srcReg32b12, filtersReg32;
@@ -281,11 +671,11 @@ static void aom_filter_block1d16_v8_avx2(
unsigned int i;
ptrdiff_t src_stride, dst_stride;
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
+ addFilterReg32 = _mm256_set1_epi16(32);
filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the
// same data in both lanes of 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
// have the same data in both lanes of a 256 bit register
filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
@@ -308,49 +698,26 @@ static void aom_filter_block1d16_v8_avx2(
dst_stride = out_pitch << 1;
// load 16 bytes 7 times in stride of src_pitch
- srcReg32b1 =
- _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr)));
- srcReg32b2 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
- srcReg32b3 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
- srcReg32b4 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
- srcReg32b5 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
- srcReg32b6 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr);
+ srcReg32b3 =
+ xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg32b5 =
+ xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
srcReg32b7 = _mm256_castsi128_si256(
_mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
// have each consecutive loads on the same 256 register
- srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
- _mm256_castsi256_si128(srcReg32b2), 1);
- srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
- _mm256_castsi256_si128(srcReg32b3), 1);
- srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
- _mm256_castsi256_si128(srcReg32b4), 1);
- srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
- _mm256_castsi256_si128(srcReg32b5), 1);
- srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
- _mm256_castsi256_si128(srcReg32b6), 1);
- srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
- _mm256_castsi256_si128(srcReg32b7), 1);
-
+ srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
+ srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
+ srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
// merge every two consecutive registers except the last one
srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
// save
srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
-
- // save
srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
-
- // save
srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
-
- // save
srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
for (i = output_height; i > 1; i -= 2) {
@@ -383,9 +750,7 @@ static void aom_filter_block1d16_v8_avx2(
// add and saturate the results together
srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
- _mm256_min_epi16(srcReg32b8, srcReg32b12));
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
- _mm256_max_epi16(srcReg32b8, srcReg32b12));
+ _mm256_adds_epi16(srcReg32b8, srcReg32b12));
// multiply 2 adjacent elements with the filter and add the result
srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
@@ -399,16 +764,13 @@ static void aom_filter_block1d16_v8_avx2(
// add and saturate the results together
srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
- _mm256_min_epi16(srcReg32b8, srcReg32b12));
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
- _mm256_max_epi16(srcReg32b8, srcReg32b12));
-
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
+ _mm256_adds_epi16(srcReg32b8, srcReg32b12));
- // shift by 7 bit each 16 bit
- srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
- srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);
+ // shift by 6 bit each 16 bit
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32);
+ srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
+ srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6);
// shrink to 8 bit each 16 bits, the first lane contain the first
// convolve result and the second lane contain the second convolve
@@ -417,12 +779,7 @@ static void aom_filter_block1d16_v8_avx2(
src_ptr += src_stride;
- // save 16 bytes
- _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1));
-
- // save the next 16 bits
- _mm_store_si128((__m128i *)(output_ptr + out_pitch),
- _mm256_extractf128_si256(srcReg32b1, 1));
+ xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1);
output_ptr += dst_stride;
@@ -475,24 +832,17 @@ static void aom_filter_block1d16_v8_avx2(
// add and saturate the results together
srcRegFilt1 =
- _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6));
+ _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
srcRegFilt3 =
- _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7));
+ _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7));
- // add and saturate the results together
+ // shift by 6 bit each 16 bit
srcRegFilt1 =
- _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6));
+ _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
srcRegFilt3 =
- _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7));
-
- srcRegFilt1 =
- _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64));
- srcRegFilt3 =
- _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64));
-
- // shift by 7 bit each 16 bit
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
- srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);
+ _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
+ srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6);
// shrink to 8 bit each 16 bits, the first lane contain the first
// convolve result and the second lane contain the second convolve
@@ -506,21 +856,6 @@ static void aom_filter_block1d16_v8_avx2(
#if HAVE_AVX2 && HAVE_SSSE3
filter8_1dfunction aom_filter_block1d4_v8_ssse3;
-#if ARCH_X86_64
-filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3;
-#define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_intrin_ssse3
-#define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_intrin_ssse3
-#define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_intrin_ssse3
-#else // ARCH_X86
-filter8_1dfunction aom_filter_block1d8_v8_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_ssse3;
-#define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_ssse3
-#define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_ssse3
-#define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_ssse3
-#endif // ARCH_X86_64
filter8_1dfunction aom_filter_block1d16_v2_ssse3;
filter8_1dfunction aom_filter_block1d16_h2_ssse3;
filter8_1dfunction aom_filter_block1d8_v2_ssse3;
diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h
index 7790baf2e..72fabd236 100644
--- a/third_party/aom/aom_dsp/x86/convolve_avx2.h
+++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h
@@ -13,31 +13,27 @@
#define AOM_DSP_X86_CONVOLVE_AVX2_H_
// filters for 16
-DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
+ 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5,
+ 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6,
+ 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
+ 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
+ 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7,
+ 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
-DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
- 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
- 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
- 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
- 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
- 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
- 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2,
+ 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9,
+ 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
};
static INLINE void prepare_coeffs_lowbd(
const InterpFilterParams *const filter_params, const int subpel_q4,
__m256i *const coeffs /* [4] */) {
const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
- *filter_params, subpel_q4 & SUBPEL_MASK);
+ filter_params, subpel_q4 & SUBPEL_MASK);
const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
@@ -65,7 +61,7 @@ static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
const int subpel_q4,
__m256i *const coeffs /* [4] */) {
const int16_t *filter = av1_get_interp_filter_subpel_kernel(
- *filter_params, subpel_q4 & SUBPEL_MASK);
+ filter_params, subpel_q4 & SUBPEL_MASK);
const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h
index 846fe7bb4..399df5d6d 100644
--- a/third_party/aom/aom_dsp/x86/convolve_sse2.h
+++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h
@@ -19,7 +19,7 @@ static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
const int subpel_q4,
__m128i *const coeffs /* [4] */) {
const int16_t *filter = av1_get_interp_filter_subpel_kernel(
- *filter_params, subpel_q4 & SUBPEL_MASK);
+ filter_params, subpel_q4 & SUBPEL_MASK);
const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
// coeffs 0 1 0 1 0 1 0 1
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
index e5e3238d5..099fcf7fc 100644
--- a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
@@ -105,8 +105,8 @@ void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride,
void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params, int bd) {
int i, j;
@@ -254,8 +254,8 @@ void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params, int bd) {
int i, j;
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
index f7ac9b496..e7b33d1c4 100644
--- a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
@@ -18,8 +18,8 @@
void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4,
const int subpel_y_q4,
ConvolveParams *conv_params, int bd) {
@@ -166,8 +166,8 @@ void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4,
const int subpel_y_q4,
ConvolveParams *conv_params, int bd) {
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
index fdfadc886..131c16aa9 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
@@ -676,7 +676,7 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
}
}
- const InterpFilterParams filter =
+ const InterpFilterParams *filter =
av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
if (!subpel_x_q3 && !subpel_y_q3) {
@@ -726,14 +726,14 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
const int16_t *const kernel_y =
av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
const int intermediate_height =
- (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
- aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1),
+ aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1),
ref_stride, CONVERT_TO_BYTEPTR(temp),
MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
intermediate_height, bd);
aom_highbd_convolve8_vert(
- CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)),
+ CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
16, width, height, bd);
}
diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
index 9801e285c..eaf1f347b 100644
--- a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
@@ -22,118 +22,12 @@
void aom_var_filter_block2d_bil_first_pass_ssse3(
const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
unsigned int pixel_step, unsigned int output_height,
- unsigned int output_width, const uint8_t *filter) {
- // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow
- // in computation using _mm_maddubs_epi16.
- // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow.
- const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
- const __m128i r = _mm_set1_epi16(round);
- const uint8_t f0 = filter[0] >> 1;
- const uint8_t f1 = filter[1] >> 1;
- const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1,
- f0, f1, f0, f1, f0, f1);
- const __m128i shuffle_mask =
- _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
- unsigned int i, j;
- (void)pixel_step;
-
- if (output_width >= 8) {
- for (i = 0; i < output_height; ++i) {
- for (j = 0; j < output_width; j += 8) {
- // load source
- __m128i source_low = xx_loadl_64(a);
- __m128i source_hi = _mm_setzero_si128();
-
- // avoid load undefined memory
- if (a + 8 != NULL) source_hi = xx_loadl_64(a + 8);
- __m128i source = _mm_unpacklo_epi64(source_low, source_hi);
-
- // shuffle to:
- // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
- // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
- __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
-
- // b[i] = a[i] * filter[0] + a[i + 1] * filter[1]
- __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
-
- // round
- res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
-
- xx_storeu_128(b, res);
-
- a += 8;
- b += 8;
- }
-
- a += src_pixels_per_line - output_width;
- }
- } else {
- for (i = 0; i < output_height; ++i) {
- // load source, only first 5 values are meaningful:
- // { a[0], a[1], a[2], a[3], a[4], xxxx }
- __m128i source = xx_loadl_64(a);
-
- // shuffle, up to the first 8 are useful
- // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
- // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
- __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
-
- __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
- res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
-
- xx_storel_64(b, res);
-
- a += src_pixels_per_line;
- b += output_width;
- }
- }
-}
+ unsigned int output_width, const uint8_t *filter);
void aom_var_filter_block2d_bil_second_pass_ssse3(
const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
unsigned int pixel_step, unsigned int output_height,
- unsigned int output_width, const uint8_t *filter) {
- const int16_t round = (1 << FILTER_BITS) >> 1;
- const __m128i r = _mm_set1_epi32(round);
- const __m128i filters =
- _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0],
- filter[1], filter[0], filter[1]);
- const __m128i shuffle_mask =
- _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
- const __m128i mask =
- _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
- unsigned int i, j;
-
- for (i = 0; i < output_height; ++i) {
- for (j = 0; j < output_width; j += 4) {
- // load source as:
- // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] }
- __m128i source1 = xx_loadl_64(a);
- __m128i source2 = xx_loadl_64(a + pixel_step);
- __m128i source = _mm_unpacklo_epi64(source1, source2);
-
- // shuffle source to:
- // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] }
- __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
-
- // b[i] = a[i] * filter[0] + a[w + i] * filter[1]
- __m128i res = _mm_madd_epi16(source_shuffle, filters);
-
- // round
- res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS);
-
- // shuffle to get each lower 8 bit of every 32 bit
- res = _mm_shuffle_epi8(res, mask);
-
- xx_storel_32(b, res);
-
- a += 4;
- b += 4;
- }
-
- a += src_pixels_per_line - output_width;
- }
-}
+ unsigned int output_width, const uint8_t *filter);
static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
const __m128i *w, const __m128i *r,
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
new file mode 100644
index 000000000..6538e4d5e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86//masked_sad_intrin_ssse3.h"
+
+static INLINE unsigned int masked_sad32xh_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int width, int height) {
+ int x, y;
+ __m256i res = _mm256_setzero_si256();
+ const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_scale =
+ _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 32) {
+ const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
+ const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
+ const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
+ const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]);
+ const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
+
+ // Calculate 16 predicted pixels.
+ // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+ // is 64 * 255, so we have plenty of space to add rounding constants.
+ const __m256i data_l = _mm256_unpacklo_epi8(a, b);
+ const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
+ __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
+ pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
+
+ const __m256i data_r = _mm256_unpackhi_epi8(a, b);
+ const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
+ __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
+ pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
+
+ const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
+ res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
+ res = _mm256_shuffle_epi32(res, 0xd8);
+ res = _mm256_permute4x64_epi64(res, 0xd8);
+ res = _mm256_hadd_epi32(res, res);
+ res = _mm256_hadd_epi32(res, res);
+ int32_t sad = _mm256_extract_epi32(res, 0);
+ return (sad + 31) >> 6;
+}
+
+static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) {
+ __m128i a0 = _mm_lddqu_si128((const __m128i *)(lo));
+ __m128i a1 = _mm_lddqu_si128((const __m128i *)(hi));
+ __m256i a = _mm256_castsi128_si256(a0);
+ return _mm256_inserti128_si256(a, a1, 1);
+}
+
+static INLINE unsigned int masked_sad16xh_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int height) {
+ int y;
+ __m256i res = _mm256_setzero_si256();
+ const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_scale =
+ _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ for (y = 0; y < height; y += 2) {
+ const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
+ const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
+ const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
+ const __m256i m = xx_loadu2_m128i(m_ptr + m_stride, m_ptr);
+ const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
+
+ // Calculate 16 predicted pixels.
+ // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+ // is 64 * 255, so we have plenty of space to add rounding constants.
+ const __m256i data_l = _mm256_unpacklo_epi8(a, b);
+ const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
+ __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
+ pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
+
+ const __m256i data_r = _mm256_unpackhi_epi8(a, b);
+ const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
+ __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
+ pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
+
+ const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
+ res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
+
+ src_ptr += src_stride << 1;
+ a_ptr += a_stride << 1;
+ b_ptr += b_stride << 1;
+ m_ptr += m_stride << 1;
+ }
+ // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
+ res = _mm256_shuffle_epi32(res, 0xd8);
+ res = _mm256_permute4x64_epi64(res, 0xd8);
+ res = _mm256_hadd_epi32(res, res);
+ res = _mm256_hadd_epi32(res, res);
+ int32_t sad = _mm256_extract_epi32(res, 0);
+ return (sad + 31) >> 6;
+}
+
+static INLINE unsigned int aom_masked_sad_avx2(
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
+ int invert_mask, int m, int n) {
+ unsigned int sad;
+ if (!invert_mask) {
+ switch (m) {
+ case 4:
+ sad = aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, n);
+ break;
+ case 8:
+ sad = aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, n);
+ break;
+ case 16:
+ sad = masked_sad16xh_avx2(src, src_stride, ref, ref_stride, second_pred,
+ m, msk, msk_stride, n);
+ break;
+ default:
+ sad = masked_sad32xh_avx2(src, src_stride, ref, ref_stride, second_pred,
+ m, msk, msk_stride, m, n);
+ break;
+ }
+ } else {
+ switch (m) {
+ case 4:
+ sad = aom_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ case 8:
+ sad = aom_masked_sad8xh_ssse3(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ case 16:
+ sad = masked_sad16xh_avx2(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ default:
+ sad = masked_sad32xh_avx2(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, m, n);
+ break;
+ }
+ }
+ return sad;
+}
+
+#define MASKSADMXN_AVX2(m, n) \
+ unsigned int aom_masked_sad##m##x##n##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ return aom_masked_sad_avx2(src, src_stride, ref, ref_stride, second_pred, \
+ msk, msk_stride, invert_mask, m, n); \
+ }
+
+MASKSADMXN_AVX2(4, 4)
+MASKSADMXN_AVX2(4, 8)
+MASKSADMXN_AVX2(8, 4)
+MASKSADMXN_AVX2(8, 8)
+MASKSADMXN_AVX2(8, 16)
+MASKSADMXN_AVX2(16, 8)
+MASKSADMXN_AVX2(16, 16)
+MASKSADMXN_AVX2(16, 32)
+MASKSADMXN_AVX2(32, 16)
+MASKSADMXN_AVX2(32, 32)
+MASKSADMXN_AVX2(32, 64)
+MASKSADMXN_AVX2(64, 32)
+MASKSADMXN_AVX2(64, 64)
+MASKSADMXN_AVX2(64, 128)
+MASKSADMXN_AVX2(128, 64)
+MASKSADMXN_AVX2(128, 128)
+MASKSADMXN_AVX2(4, 16)
+MASKSADMXN_AVX2(16, 4)
+MASKSADMXN_AVX2(8, 32)
+MASKSADMXN_AVX2(32, 8)
+MASKSADMXN_AVX2(16, 64)
+MASKSADMXN_AVX2(64, 16)
+
+static INLINE unsigned int highbd_masked_sad8xh_avx2(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int height) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
+ int y;
+ __m256i res = _mm256_setzero_si256();
+ const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ for (y = 0; y < height; y += 2) {
+ const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
+ const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
+ const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
+ // Zero-extend mask to 16 bits
+ const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(m_ptr)),
+ _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride))));
+ const __m256i m_inv = _mm256_sub_epi16(mask_max, m);
+
+ const __m256i data_l = _mm256_unpacklo_epi16(a, b);
+ const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
+ __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
+ pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m256i data_r = _mm256_unpackhi_epi16(a, b);
+ const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
+ __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
+ pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
+ // so it is safe to do signed saturation here.
+ const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
+ // There is no 16-bit SAD instruction, so we have to synthesize
+ // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
+ // and accumulating them at the end
+ const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
+ res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
+
+ src_ptr += src_stride << 1;
+ a_ptr += a_stride << 1;
+ b_ptr += b_stride << 1;
+ m_ptr += m_stride << 1;
+ }
+ // At this point, we have four 32-bit partial SADs stored in 'res'.
+ res = _mm256_hadd_epi32(res, res);
+ res = _mm256_hadd_epi32(res, res);
+ int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
+ return (sad + 31) >> 6;
+}
+
+static INLINE unsigned int highbd_masked_sad16xh_avx2(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int width, int height) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
+ int x, y;
+ __m256i res = _mm256_setzero_si256();
+ const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 16) {
+ const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
+ const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
+ const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
+ // Zero-extend mask to 16 bits
+ const __m256i m =
+ _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x]));
+ const __m256i m_inv = _mm256_sub_epi16(mask_max, m);
+
+ const __m256i data_l = _mm256_unpacklo_epi16(a, b);
+ const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
+ __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
+ pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m256i data_r = _mm256_unpackhi_epi16(a, b);
+ const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
+ __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
+ pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
+ // so it is safe to do signed saturation here.
+ const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
+ // There is no 16-bit SAD instruction, so we have to synthesize
+ // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
+ // and accumulating them at the end
+ const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
+ res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ // At this point, we have four 32-bit partial SADs stored in 'res'.
+ res = _mm256_hadd_epi32(res, res);
+ res = _mm256_hadd_epi32(res, res);
+ int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
+ return (sad + 31) >> 6;
+}
+
+static INLINE unsigned int aom_highbd_masked_sad_avx2(
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
+ int invert_mask, int m, int n) {
+ unsigned int sad;
+ if (!invert_mask) {
+ switch (m) {
+ case 4:
+ sad =
+ aom_highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, n);
+ break;
+ case 8:
+ sad = highbd_masked_sad8xh_avx2(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, n);
+ break;
+ default:
+ sad = highbd_masked_sad16xh_avx2(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, m, n);
+ break;
+ }
+ } else {
+ switch (m) {
+ case 4:
+ sad =
+ aom_highbd_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ case 8:
+ sad = highbd_masked_sad8xh_avx2(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ default:
+ sad = highbd_masked_sad16xh_avx2(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, m, n);
+ break;
+ }
+ }
+ return sad;
+}
+
+#define HIGHBD_MASKSADMXN_AVX2(m, n) \
+ unsigned int aom_highbd_masked_sad##m##x##n##_avx2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
+ int msk_stride, int invert_mask) { \
+ return aom_highbd_masked_sad_avx2(src8, src_stride, ref8, ref_stride, \
+ second_pred8, msk, msk_stride, \
+ invert_mask, m, n); \
+ }
+
+HIGHBD_MASKSADMXN_AVX2(4, 4);
+HIGHBD_MASKSADMXN_AVX2(4, 8);
+HIGHBD_MASKSADMXN_AVX2(8, 4);
+HIGHBD_MASKSADMXN_AVX2(8, 8);
+HIGHBD_MASKSADMXN_AVX2(8, 16);
+HIGHBD_MASKSADMXN_AVX2(16, 8);
+HIGHBD_MASKSADMXN_AVX2(16, 16);
+HIGHBD_MASKSADMXN_AVX2(16, 32);
+HIGHBD_MASKSADMXN_AVX2(32, 16);
+HIGHBD_MASKSADMXN_AVX2(32, 32);
+HIGHBD_MASKSADMXN_AVX2(32, 64);
+HIGHBD_MASKSADMXN_AVX2(64, 32);
+HIGHBD_MASKSADMXN_AVX2(64, 64);
+HIGHBD_MASKSADMXN_AVX2(64, 128);
+HIGHBD_MASKSADMXN_AVX2(128, 64);
+HIGHBD_MASKSADMXN_AVX2(128, 128);
+HIGHBD_MASKSADMXN_AVX2(4, 16);
+HIGHBD_MASKSADMXN_AVX2(16, 4);
+HIGHBD_MASKSADMXN_AVX2(8, 32);
+HIGHBD_MASKSADMXN_AVX2(32, 8);
+HIGHBD_MASKSADMXN_AVX2(16, 64);
+HIGHBD_MASKSADMXN_AVX2(64, 16);
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
index 1f42eec2f..493f9bd8f 100644
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
@@ -19,6 +19,8 @@
#include "aom/aom_integer.h"
#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86//masked_sad_intrin_ssse3.h"
+
// For width a multiple of 16
static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
int src_stride,
@@ -27,16 +29,6 @@ static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
const uint8_t *m_ptr, int m_stride,
int width, int height);
-static INLINE unsigned int masked_sad8xh_ssse3(
- const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
- const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
- int height);
-
-static INLINE unsigned int masked_sad4xh_ssse3(
- const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
- const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
- int height);
-
#define MASKSADMXN_SSSE3(m, n) \
unsigned int aom_masked_sad##m##x##n##_ssse3( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
@@ -56,11 +48,11 @@ static INLINE unsigned int masked_sad4xh_ssse3(
const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
int invert_mask) { \
if (!invert_mask) \
- return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, \
- second_pred, 8, msk, msk_stride, n); \
+ return aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, \
+ second_pred, 8, msk, msk_stride, n); \
else \
- return masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref, \
- ref_stride, msk, msk_stride, n); \
+ return aom_masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref, \
+ ref_stride, msk, msk_stride, n); \
}
#define MASKSAD4XN_SSSE3(n) \
@@ -69,11 +61,11 @@ static INLINE unsigned int masked_sad4xh_ssse3(
const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
int invert_mask) { \
if (!invert_mask) \
- return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, \
- second_pred, 4, msk, msk_stride, n); \
+ return aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, \
+ second_pred, 4, msk, msk_stride, n); \
else \
- return masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref, \
- ref_stride, msk, msk_stride, n); \
+ return aom_masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref, \
+ ref_stride, msk, msk_stride, n); \
}
MASKSADMXN_SSSE3(128, 128)
@@ -145,10 +137,11 @@ static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
return (sad + 31) >> 6;
}
-static INLINE unsigned int masked_sad8xh_ssse3(
- const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
- const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
- int height) {
+unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height) {
int y;
__m128i res = _mm_setzero_si128();
const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
@@ -189,10 +182,11 @@ static INLINE unsigned int masked_sad8xh_ssse3(
return (sad + 31) >> 6;
}
-static INLINE unsigned int masked_sad4xh_ssse3(
- const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
- const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
- int height) {
+unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height) {
int y;
__m128i res = _mm_setzero_si128();
const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
@@ -238,11 +232,6 @@ static INLINE unsigned int highbd_masked_sad_ssse3(
const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
int width, int height);
-static INLINE unsigned int highbd_masked_sad4xh_ssse3(
- const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
- int height);
-
#define HIGHBD_MASKSADMXN_SSSE3(m, n) \
unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \
const uint8_t *src8, int src_stride, const uint8_t *ref8, \
@@ -262,11 +251,13 @@ static INLINE unsigned int highbd_masked_sad4xh_ssse3(
int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
int msk_stride, int invert_mask) { \
if (!invert_mask) \
- return highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, ref_stride, \
- second_pred8, 4, msk, msk_stride, n); \
+ return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, \
+ ref_stride, second_pred8, 4, msk, \
+ msk_stride, n); \
else \
- return highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \
- ref8, ref_stride, msk, msk_stride, n); \
+ return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \
+ ref8, ref_stride, msk, msk_stride, \
+ n); \
}
HIGHBD_MASKSADMXN_SSSE3(128, 128)
@@ -350,10 +341,11 @@ static INLINE unsigned int highbd_masked_sad_ssse3(
return (sad + 31) >> 6;
}
-static INLINE unsigned int highbd_masked_sad4xh_ssse3(
- const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
- int height) {
+unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
+ const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height) {
const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
new file mode 100644
index 000000000..19b429d91
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H
+#define _AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H
+
+unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height);
+
+unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height);
+
+unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
+ const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height);
+
+#endif
diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
new file mode 100644
index 000000000..2aa2a0555
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre,
+ const int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ const int height) {
+ int n = 0;
+ __m256i v_sad_d = _mm256_setzero_si256();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+
+ do {
+ const __m128i v_p_b_0 = xx_loadl_32(pre);
+ const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride);
+ const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1);
+ const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+ const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+ const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
+
+ const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
+ const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
+
+ // Rounded absolute difference
+ const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
+ const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+ v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
+
+ n += 8;
+ pre += pre_stride << 1;
+ } while (n < 8 * (height >> 1));
+
+ __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+ __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+ v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+ return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+static INLINE unsigned int obmc_sad_w8n_avx2(
+ const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, const int width, const int height) {
+ const int pre_step = pre_stride - width;
+ int n = 0;
+ __m256i v_sad_d = _mm256_setzero_si256();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+ assert(width >= 8);
+ assert(IS_POWER_OF_TWO(width));
+
+ do {
+ const __m128i v_p0_b = xx_loadl_64(pre + n);
+ const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+ const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+ const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+ const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
+
+ // Rounded absolute difference
+ const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
+ const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+ v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
+
+ n += 8;
+
+ if ((n & (width - 1)) == 0) pre += pre_step;
+ } while (n < width * height);
+
+ __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+ __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+ v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+ return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+#define OBMCSADWXH(w, h) \
+ unsigned int aom_obmc_sad##w##x##h##_avx2( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *msk) { \
+ if (w == 4) { \
+ return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h); \
+ } else { \
+ return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \
+ } \
+ }
+
+OBMCSADWXH(128, 128)
+OBMCSADWXH(128, 64)
+OBMCSADWXH(64, 128)
+OBMCSADWXH(64, 64)
+OBMCSADWXH(64, 32)
+OBMCSADWXH(32, 64)
+OBMCSADWXH(32, 32)
+OBMCSADWXH(32, 16)
+OBMCSADWXH(16, 32)
+OBMCSADWXH(16, 16)
+OBMCSADWXH(16, 8)
+OBMCSADWXH(8, 16)
+OBMCSADWXH(8, 8)
+OBMCSADWXH(8, 4)
+OBMCSADWXH(4, 8)
+OBMCSADWXH(4, 4)
+OBMCSADWXH(4, 16)
+OBMCSADWXH(16, 4)
+OBMCSADWXH(8, 32)
+OBMCSADWXH(32, 8)
+OBMCSADWXH(16, 64)
+OBMCSADWXH(64, 16)
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8,
+ const int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ const int height) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ int n = 0;
+ __m256i v_sad_d = _mm256_setzero_si256();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+ do {
+ const __m128i v_p_w_0 = xx_loadl_64(pre);
+ const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride);
+ const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1);
+ const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+ const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+ const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
+
+ const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
+ const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
+
+ // Rounded absolute difference
+
+ const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
+ const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+ v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
+
+ n += 8;
+
+ pre += pre_stride << 1;
+ } while (n < 8 * (height >> 1));
+
+ __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+ __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+ v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+ return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+static INLINE unsigned int hbd_obmc_sad_w8n_avx2(
+ const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, const int width, const int height) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - width;
+ int n = 0;
+ __m256i v_sad_d = _mm256_setzero_si256();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+
+ assert(width >= 8);
+ assert(IS_POWER_OF_TWO(width));
+
+ do {
+ const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n));
+ const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+ const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+ const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+ const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
+
+ // Rounded absolute difference
+ const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
+ const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+ v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
+
+ n += 8;
+
+ if (n % width == 0) pre += pre_step;
+ } while (n < width * height);
+
+ __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+ __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+ v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+ return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+#define HBD_OBMCSADWXH(w, h) \
+ unsigned int aom_highbd_obmc_sad##w##x##h##_avx2( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask) { \
+ if (w == 4) { \
+ return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h); \
+ } else { \
+ return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \
+ } \
+ }
+
+HBD_OBMCSADWXH(128, 128)
+HBD_OBMCSADWXH(128, 64)
+HBD_OBMCSADWXH(64, 128)
+HBD_OBMCSADWXH(64, 64)
+HBD_OBMCSADWXH(64, 32)
+HBD_OBMCSADWXH(32, 64)
+HBD_OBMCSADWXH(32, 32)
+HBD_OBMCSADWXH(32, 16)
+HBD_OBMCSADWXH(16, 32)
+HBD_OBMCSADWXH(16, 16)
+HBD_OBMCSADWXH(16, 8)
+HBD_OBMCSADWXH(8, 16)
+HBD_OBMCSADWXH(8, 8)
+HBD_OBMCSADWXH(8, 4)
+HBD_OBMCSADWXH(4, 8)
+HBD_OBMCSADWXH(4, 4)
+HBD_OBMCSADWXH(4, 16)
+HBD_OBMCSADWXH(16, 4)
+HBD_OBMCSADWXH(8, 32)
+HBD_OBMCSADWXH(32, 8)
+HBD_OBMCSADWXH(16, 64)
+HBD_OBMCSADWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
index 571aa770b..2e2f6e09f 100644
--- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
@@ -26,6 +26,16 @@
// 8 bit
////////////////////////////////////////////////////////////////////////////////
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+ const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter);
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+ const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter);
+
static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
const int32_t *wsrc, const int32_t *mask,
unsigned int *const sse, int *const sum,
@@ -152,6 +162,46 @@ OBMCVARWXH(32, 8)
OBMCVARWXH(16, 64)
OBMCVARWXH(64, 16)
+#include "config/aom_dsp_rtcd.h"
+
+#define OBMC_SUBPIX_VAR(W, H) \
+ uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ \
+ aom_var_filter_block2d_bil_first_pass_ssse3( \
+ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_ssse3( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse); \
+ }
+
+OBMC_SUBPIX_VAR(128, 128)
+OBMC_SUBPIX_VAR(128, 64)
+OBMC_SUBPIX_VAR(64, 128)
+OBMC_SUBPIX_VAR(64, 64)
+OBMC_SUBPIX_VAR(64, 32)
+OBMC_SUBPIX_VAR(32, 64)
+OBMC_SUBPIX_VAR(32, 32)
+OBMC_SUBPIX_VAR(32, 16)
+OBMC_SUBPIX_VAR(16, 32)
+OBMC_SUBPIX_VAR(16, 16)
+OBMC_SUBPIX_VAR(16, 8)
+OBMC_SUBPIX_VAR(8, 16)
+OBMC_SUBPIX_VAR(8, 8)
+OBMC_SUBPIX_VAR(8, 4)
+OBMC_SUBPIX_VAR(4, 8)
+OBMC_SUBPIX_VAR(4, 4)
+OBMC_SUBPIX_VAR(4, 16)
+OBMC_SUBPIX_VAR(16, 4)
+OBMC_SUBPIX_VAR(8, 32)
+OBMC_SUBPIX_VAR(32, 8)
+OBMC_SUBPIX_VAR(16, 64)
+OBMC_SUBPIX_VAR(64, 16)
+
////////////////////////////////////////////////////////////////////////////////
// High bit-depth
////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/aom/aom_dsp/x86/subtract_avx2.c b/third_party/aom/aom_dsp/x86/subtract_avx2.c
new file mode 100644
index 000000000..4389d123d
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/subtract_avx2.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr,
+ const uint8_t *pred_ptr) {
+ __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr));
+ __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr));
+ __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));
+ __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
+ __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
+ __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
+ const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+ const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
+ _mm256_store_si256((__m256i *)(diff_ptr), d_0);
+ _mm256_store_si256((__m256i *)(diff_ptr + 16), d_1);
+}
+
+static INLINE void aom_subtract_block_16xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ for (int32_t j = 0; j < rows; ++j) {
+ __m128i s = _mm_lddqu_si128((__m128i *)(src_ptr));
+ __m128i p = _mm_lddqu_si128((__m128i *)(pred_ptr));
+ __m256i s_0 = _mm256_cvtepu8_epi16(s);
+ __m256i p_0 = _mm256_cvtepu8_epi16(p);
+ const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+ _mm256_store_si256((__m256i *)(diff_ptr), d_0);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static INLINE void aom_subtract_block_32xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ for (int32_t j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static INLINE void aom_subtract_block_64xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ for (int32_t j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static INLINE void aom_subtract_block_128xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ for (int32_t j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+ subtract32_avx2(diff_ptr + 64, src_ptr + 64, pred_ptr + 64);
+ subtract32_avx2(diff_ptr + 96, src_ptr + 96, pred_ptr + 96);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride) {
+ switch (cols) {
+ case 16:
+ aom_subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ case 32:
+ aom_subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ case 64:
+ aom_subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ case 128:
+ aom_subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ default:
+ aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
new file mode 100644
index 000000000..bdff64b8f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+#define AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+
+#include <emmintrin.h>
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
+ int8_t cos_bit);
+
+static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) {
+ return _mm256_set1_epi32(
+ (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
+}
+
+static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1,
+ __m256i *in0, __m256i *in1, const __m256i _r,
+ const int32_t cos_bit) {
+ __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);
+ __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);
+ __m256i u0 = _mm256_madd_epi16(t0, w0);
+ __m256i u1 = _mm256_madd_epi16(t1, w0);
+ __m256i v0 = _mm256_madd_epi16(t0, w1);
+ __m256i v1 = _mm256_madd_epi16(t1, w1);
+
+ __m256i a0 = _mm256_add_epi32(u0, _r);
+ __m256i a1 = _mm256_add_epi32(u1, _r);
+ __m256i b0 = _mm256_add_epi32(v0, _r);
+ __m256i b1 = _mm256_add_epi32(v1, _r);
+
+ __m256i c0 = _mm256_srai_epi32(a0, cos_bit);
+ __m256i c1 = _mm256_srai_epi32(a1, cos_bit);
+ __m256i d0 = _mm256_srai_epi32(b0, cos_bit);
+ __m256i d1 = _mm256_srai_epi32(b1, cos_bit);
+
+ *in0 = _mm256_packs_epi32(c0, c1);
+ *in1 = _mm256_packs_epi32(d0, d1);
+}
+
+static INLINE void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) {
+ const __m256i _in0 = *in0;
+ const __m256i _in1 = *in1;
+ *in0 = _mm256_adds_epi16(_in0, _in1);
+ *in1 = _mm256_subs_epi16(_in0, _in1);
+}
+
+static INLINE void btf_32_add_sub_avx2(__m256i *in0, __m256i *in1) {
+ const __m256i _in0 = *in0;
+ const __m256i _in1 = *in1;
+ *in0 = _mm256_add_epi32(_in0, _in1);
+ *in1 = _mm256_sub_epi32(_in0, _in1);
+}
+
+static INLINE void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1,
+ __m256i in0, __m256i in1) {
+ const __m256i _in0 = in0;
+ const __m256i _in1 = in1;
+ *out0 = _mm256_adds_epi16(_in0, _in1);
+ *out1 = _mm256_subs_epi16(_in0, _in1);
+}
+
+static INLINE void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1,
+ __m256i in0, __m256i in1) {
+ const __m256i _in0 = in0;
+ const __m256i _in1 = in1;
+ *out0 = _mm256_add_epi32(_in0, _in1);
+ *out1 = _mm256_sub_epi32(_in0, _in1);
+}
+
+static INLINE __m256i load_16bit_to_16bit_avx2(const int16_t *a) {
+ return _mm256_load_si256((const __m256i *)a);
+}
+
+static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in,
+ int stride, __m256i *out,
+ int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = load_16bit_to_16bit_avx2(in + i * stride);
+ }
+}
+
+static INLINE void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in,
+ int stride,
+ __m256i *out,
+ int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[out_size - i - 1] = load_16bit_to_16bit_avx2(in + i * stride);
+ }
+}
+
+static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) {
+ const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a);
+ const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8));
+ return _mm256_permute4x64_epi64(b, 0xD8);
+}
+
+static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in,
+ int stride, __m256i *out,
+ int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride);
+ }
+}
+
+static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
+ __m256i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 08 09 0a 0b 04 05 06 07 0c 0d 0e 0f
+ // in[1]: 10 11 12 13 18 19 1a 1b 14 15 16 17 1c 1d 1e 1f
+ // in[2]: 20 21 22 23 28 29 2a 2b 24 25 26 27 2c 2d 2e 2f
+ // in[3]: 30 31 32 33 38 39 3a 3b 34 35 36 37 3c 3d 3e 3f
+ // in[4]: 40 41 42 43 48 49 4a 4b 44 45 46 47 4c 4d 4e 4f
+ // in[5]: 50 51 52 53 58 59 5a 5b 54 55 56 57 5c 5d 5e 5f
+ // in[6]: 60 61 62 63 68 69 6a 6b 64 65 66 67 6c 6d 6e 6f
+ // in[7]: 70 71 72 73 78 79 7a 7b 74 75 76 77 7c 7d 7e 7f
+ // in[8]: 80 81 82 83 88 89 8a 8b 84 85 86 87 8c 8d 8e 8f
+ // to:
+ // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ // ...
+ __m256i a[16];
+ for (int i = 0; i < 16; i += 2) {
+ a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]);
+ a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]);
+ }
+ __m256i b[16];
+ for (int i = 0; i < 16; i += 2) {
+ b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]);
+ b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]);
+ }
+ __m256i c[16];
+ for (int i = 0; i < 16; i += 2) {
+ c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]);
+ c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]);
+ }
+ out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20);
+ out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20);
+ out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20);
+ out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20);
+
+ out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31);
+ out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31);
+ out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31);
+ out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31);
+
+ out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20);
+ out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20);
+ out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20);
+ out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20);
+
+ out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31);
+ out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31);
+ out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31);
+ out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31);
+}
+
+static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) {
+ for (int i = 0; i < size; ++i) {
+ out[size - i - 1] = in[i];
+ }
+}
+
+static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) {
+ if (bit < 0) {
+ bit = -bit;
+ __m256i round = _mm256_set1_epi16(1 << (bit - 1));
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm256_adds_epi16(in[i], round);
+ in[i] = _mm256_srai_epi16(in[i], bit);
+ }
+ } else if (bit > 0) {
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm256_slli_epi16(in[i], bit);
+ }
+ }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c
index 7d6b7d287..a7ac2c93d 100644
--- a/third_party/aom/aom_dsp/x86/variance_avx2.c
+++ b/third_party/aom/aom_dsp/x86/variance_avx2.c
@@ -324,6 +324,12 @@ static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) {
return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1);
}
+static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) {
+ const __m256i d =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1));
+ return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1);
+}
+
static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1,
const __m256i a,
uint8_t *comp_pred) {
@@ -401,3 +407,110 @@ void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
} while (i < height);
}
}
+
+static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0,
+ const __m256i s1,
+ const __m256i a) {
+ const __m256i alpha_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m256i a_inv = _mm256_sub_epi16(alpha_max, a);
+
+ const __m256i s_lo = _mm256_unpacklo_epi16(s0, s1);
+ const __m256i a_lo = _mm256_unpacklo_epi16(a, a_inv);
+ const __m256i pred_lo = _mm256_madd_epi16(s_lo, a_lo);
+ const __m256i pred_l = _mm256_srai_epi32(
+ _mm256_add_epi32(pred_lo, round_const), AOM_BLEND_A64_ROUND_BITS);
+
+ const __m256i s_hi = _mm256_unpackhi_epi16(s0, s1);
+ const __m256i a_hi = _mm256_unpackhi_epi16(a, a_inv);
+ const __m256i pred_hi = _mm256_madd_epi16(s_hi, a_hi);
+ const __m256i pred_h = _mm256_srai_epi32(
+ _mm256_add_epi32(pred_hi, round_const), AOM_BLEND_A64_ROUND_BITS);
+
+ const __m256i comp = _mm256_packs_epi32(pred_l, pred_h);
+
+ return comp;
+}
+
+void aom_highbd_comp_mask_pred_avx2(uint16_t *comp_pred, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ int i = 0;
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ const uint16_t *src0 = invert_mask ? pred : ref;
+ const uint16_t *src1 = invert_mask ? ref : pred;
+ const int stride0 = invert_mask ? width : ref_stride;
+ const int stride1 = invert_mask ? ref_stride : width;
+ const __m256i zero = _mm256_setzero_si256();
+
+ if (width == 8) {
+ do {
+ const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0);
+ const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1);
+
+ const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask);
+ const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8));
+
+ __m256i m = _mm256_castsi128_si256(m_l);
+ m = _mm256_insertf128_si256(m, m_h, 1);
+ const __m256i m_16 = _mm256_unpacklo_epi8(m, zero);
+
+ const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
+
+ _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp));
+
+ _mm_storeu_si128((__m128i *)(comp_pred + width),
+ _mm256_extractf128_si256(comp, 1));
+
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += (mask_stride << 1);
+ comp_pred += (width << 1);
+ i += 2;
+ } while (i < height);
+ } else if (width == 16) {
+ do {
+ const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0));
+ const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1));
+ const __m256i m_16 =
+ _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
+
+ const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
+
+ _mm256_storeu_si256((__m256i *)comp_pred, comp);
+
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ } else if (width == 32) {
+ do {
+ const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0);
+ const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + 16));
+ const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1);
+ const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + 16));
+
+ const __m256i m01_16 =
+ _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
+ const __m256i m23_16 =
+ _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + 16)));
+
+ const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16);
+ const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16);
+
+ _mm256_storeu_si256((__m256i *)comp_pred, comp);
+ _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1);
+
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
new file mode 100644
index 000000000..66b0d7d84
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+ const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter) {
+ // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow
+ // in computation using _mm_maddubs_epi16.
+ // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow.
+ const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
+ const __m128i r = _mm_set1_epi16(round);
+ const uint8_t f0 = filter[0] >> 1;
+ const uint8_t f1 = filter[1] >> 1;
+ const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1,
+ f0, f1, f0, f1, f0, f1);
+ unsigned int i, j;
+ (void)pixel_step;
+
+ if (output_width >= 8) {
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 8) {
+ // load source
+ __m128i source_low = xx_loadl_64(a);
+ __m128i source_hi = xx_loadl_64(a + 1);
+
+ // unpack to:
+ // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+ // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+ __m128i source = _mm_unpacklo_epi8(source_low, source_hi);
+
+ // b[i] = a[i] * filter[0] + a[i + 1] * filter[1]
+ __m128i res = _mm_maddubs_epi16(source, filters);
+
+ // round
+ res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+ xx_storeu_128(b, res);
+
+ a += 8;
+ b += 8;
+ }
+
+ a += src_pixels_per_line - output_width;
+ }
+ } else {
+ const __m128i shuffle_mask =
+ _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ for (i = 0; i < output_height; ++i) {
+ // load source, only first 5 values are meaningful:
+ // { a[0], a[1], a[2], a[3], a[4], xxxx }
+ __m128i source = xx_loadl_64(a);
+
+ // shuffle, up to the first 8 are useful
+ // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+ // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+ __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+ __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
+ res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+ xx_storel_64(b, res);
+
+ a += src_pixels_per_line;
+ b += output_width;
+ }
+ }
+}
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+ const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter) {
+ const int16_t round = (1 << FILTER_BITS) >> 1;
+ const __m128i r = _mm_set1_epi32(round);
+ const __m128i filters =
+ _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0],
+ filter[1], filter[0], filter[1]);
+ const __m128i shuffle_mask =
+ _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
+ const __m128i mask =
+ _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 4) {
+ // load source as:
+ // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] }
+ __m128i source1 = xx_loadl_64(a);
+ __m128i source2 = xx_loadl_64(a + pixel_step);
+ __m128i source = _mm_unpacklo_epi64(source1, source2);
+
+ // shuffle source to:
+ // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] }
+ __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+ // b[i] = a[i] * filter[0] + a[w + i] * filter[1]
+ __m128i res = _mm_madd_epi16(source_shuffle, filters);
+
+ // round
+ res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS);
+
+ // shuffle to get each lower 8 bit of every 32 bit
+ res = _mm_shuffle_epi8(res, mask);
+
+ xx_storel_32(b, res);
+
+ a += 4;
+ b += 4;
+ }
+
+ a += src_pixels_per_line - output_width;
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c
index c8c90a7dc..7e3c5d5db 100644
--- a/third_party/aom/aom_dsp/x86/variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/variance_sse2.c
@@ -569,7 +569,7 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
}
}
- const InterpFilterParams filter =
+ const InterpFilterParams *filter =
av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
if (!subpel_x_q3 && !subpel_y_q3) {
@@ -633,12 +633,12 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
const int16_t *const kernel_y =
av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
const int intermediate_height =
- (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
- aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), ref_stride,
- temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
- intermediate_height);
- aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
+ aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1),
+ ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
+ width, intermediate_height);
+ aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
width, height);
}