summaryrefslogtreecommitdiffstats
path: root/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c')
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c556
1 files changed, 556 insertions, 0 deletions
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
index f3fe50372..94b5da171 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -74,6 +74,87 @@ static INLINE void xx_store2_mi128(const uint8_t *output_ptr,
_mm256_extractf128_si256(*a, 1));
}
+static void aom_filter_block1d4_h4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ firstFilters =
+ _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
+ filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2));
+
+ // multiple the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+ srcRegFilt32b1_1 =
+ _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve result
+ srcRegFilt32b1_1 =
+ _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+ // if the number of strides is odd.
+ // process only 4 bytes
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+
+ srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 4 bytes
+ *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
+ }
+}
+
static void aom_filter_block1d4_h8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
@@ -179,6 +260,100 @@ static void aom_filter_block1d4_h8_avx2(
}
}
+static void aom_filter_block1d8_h4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt2Reg, filt3Reg;
+ __m256i secondFilters, thirdFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, filtersReg32;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the second 16 bits (third and forth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1);
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+ // if the number of strides is odd.
+ // process only 8 bytes
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+ __m128i srcRegFilt2, srcRegFilt3;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 8 bytes
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
+ }
+}
+
static void aom_filter_block1d8_h8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
@@ -311,6 +486,121 @@ static void aom_filter_block1d8_h8_avx2(
}
}
+static void aom_filter_block1d16_h4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt2Reg, filt3Reg;
+ __m256i secondFilters, thirdFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, srcReg32b2, filtersReg32;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the second 16 bits (third and forth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // reading 2 strides of the next 16 bytes
+ // (part of it was being read by earlier read)
+ srcReg32b2 =
+ xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+ srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve result
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+ // if the number of strides is odd.
+ // process only 16 bytes
+ if (i > 0) {
+ __m256i srcReg1, srcReg12;
+ __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1;
+
+ srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr));
+ srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94);
+
+ // filter the source buffer
+ srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg);
+ srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters);
+ srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32);
+ srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1);
+ srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8);
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr,
+ _mm256_castsi256_si128(srcRegFilt1_1));
+ }
+}
+
static void aom_filter_block1d16_h8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
@@ -507,6 +797,92 @@ static void aom_filter_block1d16_h8_avx2(
}
}
+static void aom_filter_block1d8_v4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg45_56_lo;
+ __m256i resReg23_34_lo, resReg45_56_lo;
+ __m256i resReglo, resReg;
+ __m256i secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the
+ // same data in both lanes of 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the second 16 bits (third and forth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ // multiple the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
+
+ // have consecutive loads on the same 256 register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load the last 2 loads of 16 bytes and have every two
+ // consecutive loads in the same 256 bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+ // merge every two consecutive registers
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
+ resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
+
+ // add and saturate the results together
+ resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
+
+ // shift by 6 bit each 16 bit
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ resReg = _mm256_packus_epi16(resReglo, resReglo);
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, out_pitch, &resReg);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg4x = srcReg6x;
+ }
+}
+
static void aom_filter_block1d8_v8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
@@ -659,6 +1035,104 @@ static void aom_filter_block1d8_v8_avx2(
}
}
+static void aom_filter_block1d16_v4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi;
+ __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi;
+ __m256i resReglo, resReghi, resReg;
+ __m256i secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the
+ // same data in both lanes of 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the second 16 bits (third and forth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ // multiple the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
+
+ // have consecutive loads on the same 256 register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+ srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load the last 2 loads of 16 bytes and have every two
+ // consecutive loads in the same 256 bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+ // merge every two consecutive registers
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+ srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
+ resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
+
+ // add and saturate the results together
+ resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters);
+ resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters);
+
+ // add and saturate the results together
+ resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi);
+
+ // shift by 6 bit each 16 bit
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReghi = _mm256_adds_epi16(resReghi, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+ resReghi = _mm256_srai_epi16(resReghi, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ resReg = _mm256_packus_epi16(resReglo, resReghi);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, out_pitch, &resReg);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg23_34_hi = srcReg45_56_hi;
+ srcReg4x = srcReg6x;
+ }
+}
+
static void aom_filter_block1d16_v8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
@@ -854,6 +1328,88 @@ static void aom_filter_block1d16_v8_avx2(
}
}
+static void aom_filter_block1d4_v4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg45_56_lo;
+ __m256i srcReg2345_3456_lo;
+ __m256i resReglo, resReg;
+ __m256i firstFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the
+ // same data in both lanes of 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ firstFilters =
+ _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
+
+ // multiple the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
+
+ // have consecutive loads on the same 256 register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load the last 2 loads of 16 bytes and have every two
+ // consecutive loads in the same 256 bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+ // merge every two consecutive registers
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+
+ srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
+
+ resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256());
+
+ // shift by 6 bit each 16 bit
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ resReg = _mm256_packus_epi16(resReglo, resReglo);
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi32(output_ptr, out_pitch, &resReg);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg4x = srcReg6x;
+ }
+}
+
#if HAVE_AVX2 && HAVE_SSSE3
filter8_1dfunction aom_filter_block1d4_v8_ssse3;
filter8_1dfunction aom_filter_block1d16_v2_ssse3;