51 files changed, 5060 insertions, 2751 deletions
diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
index 401fbdc48..5f5bf5f14 100644
--- a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
+++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
@@ -22,6 +22,13 @@ filter8_1dfunction aom_filter_block1d8_h8_sse2;
 filter8_1dfunction aom_filter_block1d4_v8_sse2;
 filter8_1dfunction aom_filter_block1d4_h8_sse2;
 
+#define aom_filter_block1d16_h4_sse2 aom_filter_block1d16_h8_sse2
+#define aom_filter_block1d16_v4_sse2 aom_filter_block1d16_v8_sse2
+#define aom_filter_block1d8_h4_sse2 aom_filter_block1d8_h8_sse2
+#define aom_filter_block1d8_v4_sse2 aom_filter_block1d8_v8_sse2
+#define aom_filter_block1d4_h4_sse2 aom_filter_block1d4_h8_sse2
+#define aom_filter_block1d4_v4_sse2 aom_filter_block1d4_v8_sse2
+
 filter8_1dfunction aom_filter_block1d16_v2_sse2;
 filter8_1dfunction aom_filter_block1d16_h2_sse2;
 filter8_1dfunction aom_filter_block1d8_v2_sse2;
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
index f3fe50372..94b5da171 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -74,6 +74,87 @@ static INLINE void xx_store2_mi128(const uint8_t *output_ptr,
                   _mm256_extractf128_si256(*a, 1));
 }
 
+static void aom_filter_block1d4_h4_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+  src_ptr -= 3;
+  addFilterReg32 = _mm256_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // converting the 16 bit (short) to 8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+  // have the same data in both lanes of a 256 bit register
+  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  firstFilters =
+      _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
+  filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2));
+
+  // multiple the size of the source and destination stride by two
+  src_stride = src_pixels_per_line << 1;
+  dst_stride = output_pitch << 1;
+  for (i = output_height; i > 1; i -= 2) {
+    // load the 2 strides of source
+    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+    // filter the source buffer
+    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+
+    // multiply 4 adjacent elements with the filter and add the result
+    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+    srcRegFilt32b1_1 =
+        _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+    // shift by 6 bit each 16 bit
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve result
+    srcRegFilt32b1_1 =
+        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+    src_ptr += src_stride;
+
+    xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
+    output_ptr += dst_stride;
+  }
+
+  // if the number of strides is odd.
+  // process only 4 bytes
+  if (i > 0) {
+    __m128i srcReg1, srcRegFilt1_1;
+
+    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+    // filter the source buffer
+    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+
+    // multiply 4 adjacent elements with the filter and add the result
+    srcRegFilt1_1 =
+        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+
+    srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
+    // shift by 6 bit each 16 bit
+    srcRegFilt1_1 =
+        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve result
+    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+    // save 4 bytes
+    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
+  }
+}
+
 static void aom_filter_block1d4_h8_avx2(
     const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
@@ -179,6 +260,100 @@ static void aom_filter_block1d4_h8_avx2(
   }
 }
 
+static void aom_filter_block1d8_h4_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i addFilterReg32, filt2Reg, filt3Reg;
+  __m256i secondFilters, thirdFilters;
+  __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
+  __m256i srcReg32b1, filtersReg32;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+  src_ptr -= 3;
+  addFilterReg32 = _mm256_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // converting the 16 bit (short) to 8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+  // have the same data in both lanes of a 256 bit register
+  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the second 16 bits (third and forth byte)
+  // across 256 bit register
+  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 256 bit register
+  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+
+  // multiply the size of the source and destination stride by two
+  src_stride = src_pixels_per_line << 1;
+  dst_stride = output_pitch << 1;
+  for (i = output_height; i > 1; i -= 2) {
+    // load the 2 strides of source
+    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+    // filter the source buffer
+    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+    // shift by 6 bit each 16 bit
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+    // shrink to 8 bit each 16 bits
+    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1);
+
+    src_ptr += src_stride;
+
+    xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
+    output_ptr += dst_stride;
+  }
+
+  // if the number of strides is odd.
+  // process only 8 bytes
+  if (i > 0) {
+    __m128i srcReg1, srcRegFilt1_1;
+    __m128i srcRegFilt2, srcRegFilt3;
+
+    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+    // filter the source buffer
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt2 =
+        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
+    srcRegFilt3 =
+        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters));
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3);
+
+    // shift by 6 bit each 16 bit
+    srcRegFilt1_1 =
+        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+    // shrink to 8 bit each 16 bits
+    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+    // save 8 bytes
+    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
+  }
+}
+
 static void aom_filter_block1d8_h8_avx2(
     const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
@@ -311,6 +486,121 @@ static void aom_filter_block1d8_h8_avx2(
   }
 }
 
+static void aom_filter_block1d16_h4_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i addFilterReg32, filt2Reg, filt3Reg;
+  __m256i secondFilters, thirdFilters;
+  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+  __m256i srcReg32b1, srcReg32b2, filtersReg32;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+  src_ptr -= 3;
+  addFilterReg32 = _mm256_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // converting the 16 bit (short) to 8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+  // have the same data in both lanes of a 256 bit register
+  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the second 16 bits (third and forth byte)
+  // across 256 bit register
+  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 256 bit register
+  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+
+  // multiply the size of the source and destination stride by two
+  src_stride = src_pixels_per_line << 1;
+  dst_stride = output_pitch << 1;
+  for (i = output_height; i > 1; i -= 2) {
+    // load the 2 strides of source
+    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+    // filter the source buffer
+    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+    // reading 2 strides of the next 16 bytes
+    // (part of it was being read by earlier read)
+    srcReg32b2 =
+        xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
+
+    // filter the source buffer
+    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
+    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+    // shift by 6 bit each 16 bit
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve result
+    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+    src_ptr += src_stride;
+
+    xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
+    output_ptr += dst_stride;
+  }
+
+  // if the number of strides is odd.
+  // process only 16 bytes
+  if (i > 0) {
+    __m256i srcReg1, srcReg12;
+    __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1;
+
+    srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr));
+    srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94);
+
+    // filter the source buffer
+    srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg);
+    srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters);
+    srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3);
+
+    // shift by 6 bit each 16 bit
+    srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32);
+    srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1);
+    srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8);
+
+    // save 16 bytes
+    _mm_store_si128((__m128i *)output_ptr,
+                    _mm256_castsi256_si128(srcRegFilt1_1));
+  }
+}
+
 static void aom_filter_block1d16_h8_avx2(
     const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
@@ -507,6 +797,92 @@ static void aom_filter_block1d16_h8_avx2(
   }
 }
 
+static void aom_filter_block1d8_v4_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i filtersReg32, addFilterReg32;
+  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+  __m256i srcReg23_34_lo, srcReg45_56_lo;
+  __m256i resReg23_34_lo, resReg45_56_lo;
+  __m256i resReglo, resReg;
+  __m256i secondFilters, thirdFilters;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+
+  addFilterReg32 = _mm256_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the
+  // same data in both lanes of 128 bit register.
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+  // have the same data in both lanes of a 256 bit register
+  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the second 16 bits (third and forth byte)
+  // across 256 bit register
+  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 256 bit register
+  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+  // multiple the size of the source and destination stride by two
+  src_stride = src_pitch << 1;
+  dst_stride = out_pitch << 1;
+
+  srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+  srcReg4x = _mm256_castsi128_si256(
+      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
+
+  // have consecutive loads on the same 256 register
+  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+
+  for (i = output_height; i > 1; i -= 2) {
+    // load the last 2 loads of 16 bytes and have every two
+    // consecutive loads in the same 256 bit register
+    srcReg5x = _mm256_castsi128_si256(
+        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
+    srcReg45 =
+        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+    srcReg6x = _mm256_castsi128_si256(
+        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+    srcReg56 =
+        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+    // merge every two consecutive registers
+    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
+    resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
+
+    // add and saturate the results together
+    resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
+
+    // shift by 6 bit each 16 bit
+    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+    resReglo = _mm256_srai_epi16(resReglo, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    resReg = _mm256_packus_epi16(resReglo, resReglo);
+
+    src_ptr += src_stride;
+
+    xx_storeu2_epi64(output_ptr, out_pitch, &resReg);
+
+    output_ptr += dst_stride;
+
+    // save part of the registers for next strides
+    srcReg23_34_lo = srcReg45_56_lo;
+    srcReg4x = srcReg6x;
+  }
+}
+
 static void aom_filter_block1d8_v8_avx2(
     const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
     ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
@@ -659,6 +1035,104 @@ static void aom_filter_block1d8_v8_avx2(
   }
 }
 
+static void aom_filter_block1d16_v4_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i filtersReg32, addFilterReg32;
+  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+  __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi;
+  __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi;
+  __m256i resReglo, resReghi, resReg;
+  __m256i secondFilters, thirdFilters;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+
+  addFilterReg32 = _mm256_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the
+  // same data in both lanes of 128 bit register.
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+  // have the same data in both lanes of a 256 bit register
+  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the second 16 bits (third and forth byte)
+  // across 256 bit register
+  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 256 bit register
+  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+  // multiple the size of the source and destination stride by two
+  src_stride = src_pitch << 1;
+  dst_stride = out_pitch << 1;
+
+  srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+  srcReg4x = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
+
+  // have consecutive loads on the same 256 register
+  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+  srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34);
+
+  for (i = output_height; i > 1; i -= 2) {
+    // load the last 2 loads of 16 bytes and have every two
+    // consecutive loads in the same 256 bit register
+    srcReg5x = _mm256_castsi128_si256(
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
+    srcReg45 =
+        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+    srcReg6x = _mm256_castsi128_si256(
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
+    srcReg56 =
+        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+    // merge every two consecutive registers
+    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+    srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
+    resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
+
+    // add and saturate the results together
+    resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters);
+    resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters);
+
+    // add and saturate the results together
+    resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi);
+
+    // shift by 6 bit each 16 bit
+    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+    resReghi = _mm256_adds_epi16(resReghi, addFilterReg32);
+    resReglo = _mm256_srai_epi16(resReglo, 6);
+    resReghi = _mm256_srai_epi16(resReghi, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    resReg = _mm256_packus_epi16(resReglo, resReghi);
+
+    src_ptr += src_stride;
+
+    xx_store2_mi128(output_ptr, out_pitch, &resReg);
+
+    output_ptr += dst_stride;
+
+    // save part of the registers for next strides
+    srcReg23_34_lo = srcReg45_56_lo;
+    srcReg23_34_hi = srcReg45_56_hi;
+    srcReg4x = srcReg6x;
+  }
+}
+
 static void aom_filter_block1d16_v8_avx2(
     const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
     ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
@@ -854,6 +1328,88 @@ static void aom_filter_block1d16_v8_avx2(
   }
 }
 
+static void aom_filter_block1d4_v4_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i filtersReg32, addFilterReg32;
+  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+  __m256i srcReg23_34_lo, srcReg45_56_lo;
+  __m256i srcReg2345_3456_lo;
+  __m256i resReglo, resReg;
+  __m256i firstFilters;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+
+  addFilterReg32 = _mm256_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the
+  // same data in both lanes of 128 bit register.
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+  // have the same data in both lanes of a 256 bit register
+  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  firstFilters =
+      _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
+
+  // multiple the size of the source and destination stride by two
+  src_stride = src_pitch << 1;
+  dst_stride = out_pitch << 1;
+
+  srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+  srcReg4x = _mm256_castsi128_si256(
+      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
+
+  // have consecutive loads on the same 256 register
+  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+
+  for (i = output_height; i > 1; i -= 2) {
+    // load the last 2 loads of 16 bytes and have every two
+    // consecutive loads in the same 256 bit register
+    srcReg5x = _mm256_castsi128_si256(
+        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
+    srcReg45 =
+        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+    srcReg6x = _mm256_castsi128_si256(
+        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+    srcReg56 =
+        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+    // merge every two consecutive registers
+    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+
+    srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
+
+    resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256());
+
+    // shift by 6 bit each 16 bit
+    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+    resReglo = _mm256_srai_epi16(resReglo, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    resReg = _mm256_packus_epi16(resReglo, resReglo);
+
+    src_ptr += src_stride;
+
+    xx_storeu2_epi32(output_ptr, out_pitch, &resReg);
+
+    output_ptr += dst_stride;
+
+    // save part of the registers for next strides
+    srcReg23_34_lo = srcReg45_56_lo;
+    srcReg4x = srcReg6x;
+  }
+}
+
 #if HAVE_AVX2 && HAVE_SSSE3
 filter8_1dfunction aom_filter_block1d4_v8_ssse3;
 filter8_1dfunction aom_filter_block1d16_v2_ssse3;
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
index 6bcb4a512..325a21b76 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -287,6 +287,13 @@ filter8_1dfunction aom_filter_block1d8_h8_ssse3;
 filter8_1dfunction aom_filter_block1d4_v8_ssse3;
 filter8_1dfunction aom_filter_block1d4_h8_ssse3;
 
+#define aom_filter_block1d16_h4_ssse3 aom_filter_block1d16_h8_ssse3
+#define aom_filter_block1d16_v4_ssse3 aom_filter_block1d16_v8_ssse3
+#define aom_filter_block1d8_h4_ssse3 aom_filter_block1d8_h8_ssse3
+#define aom_filter_block1d8_v4_ssse3 aom_filter_block1d8_v8_ssse3
+#define aom_filter_block1d4_h4_ssse3 aom_filter_block1d4_h8_ssse3
+#define aom_filter_block1d4_v4_ssse3 aom_filter_block1d4_v8_ssse3
+
 filter8_1dfunction aom_filter_block1d16_v2_ssse3;
 filter8_1dfunction aom_filter_block1d16_h2_ssse3;
 filter8_1dfunction aom_filter_block1d8_v2_ssse3;
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
new file mode 100644
index 000000000..67fb4d32b
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
@@ -0,0 +1,900 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>  // SSE4.1
+#include <immintrin.h>  // AVX2
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/blend_sse4.h"
+#include "aom_dsp/x86/blend_mask_sse4.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void blend_a64_d16_mask_w16_avx2(
+    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+    const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval,
+    int shift) {
+  const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
+  const __m256i s0_0 = yy_loadu_256(src0);
+  const __m256i s1_0 = yy_loadu_256(src1);
+  __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
+                                      _mm256_unpacklo_epi16(*m0, max_minus_m0));
+  __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
+                                      _mm256_unpackhi_epi16(*m0, max_minus_m0));
+  res0_lo =
+      _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
+  res0_hi =
+      _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
+  const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
+  __m256i res = _mm256_packus_epi16(res0, res0);
+  res = _mm256_permute4x64_epi64(res, 0xd8);
+  _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res));
+}
+
+static INLINE void blend_a64_d16_mask_w32_avx2(
+    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+    const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset,
+    const __m256i *v_maxval, int shift) {
+  const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
+  const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1);
+  const __m256i s0_0 = yy_loadu_256(src0);
+  const __m256i s0_1 = yy_loadu_256(src0 + 16);
+  const __m256i s1_0 = yy_loadu_256(src1);
+  const __m256i s1_1 = yy_loadu_256(src1 + 16);
+  __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
+                                      _mm256_unpacklo_epi16(*m0, max_minus_m0));
+  __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
+                                      _mm256_unpackhi_epi16(*m0, max_minus_m0));
+  __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1),
+                                      _mm256_unpacklo_epi16(*m1, max_minus_m1));
+  __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1),
+                                      _mm256_unpackhi_epi16(*m1, max_minus_m1));
+  res0_lo =
+      _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
+  res0_hi =
+      _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
+  res1_lo =
+      _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift);
+  res1_hi =
+      _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift);
+  const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
+  const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi);
+  __m256i res = _mm256_packus_epi16(res0, res1);
+  res = _mm256_permute4x64_epi64(res, 0xd8);
+  _mm256_storeu_si256((__m256i *)(dst), res);
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m256i *round_offset, int shift) {
+  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  for (int i = 0; i < h; ++i) {
+    const __m128i m = xx_loadu_128(mask);
+    const __m256i m0 = _mm256_cvtepu8_epi16(m);
+
+    blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
+                                shift);
+    mask += mask_stride;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const __m256i *round_offset, int shift) {
+  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < w; j += 32) {
+      const __m256i m = yy_loadu_256(mask + j);
+      const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m));
+      const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1));
+
+      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+                                  round_offset, &v_maxval, shift);
+    }
+    mask += mask_stride;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m256i *round_offset, int shift) {
+  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m256i one_b = _mm256_set1_epi8(1);
+  const __m256i two_w = _mm256_set1_epi16(2);
+  for (int i = 0; i < h; ++i) {
+    const __m256i m_i00 = yy_loadu_256(mask);
+    const __m256i m_i10 = yy_loadu_256(mask + mask_stride);
+
+    const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
+    const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
+    const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
+
+    blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
+                                shift);
+    mask += mask_stride << 1;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const __m256i *round_offset, int shift) {
+  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m256i one_b = _mm256_set1_epi8(1);
+  const __m256i two_w = _mm256_set1_epi16(2);
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < w; j += 32) {
+      const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
+      const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
+      const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j);
+      const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32);
+
+      const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
+      const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11);
+      const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
+      const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b);
+      const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
+      const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2);
+
+      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+                                  round_offset, &v_maxval, shift);
+    }
+    mask += mask_stride << 1;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const __m256i *round_offset, int shift) {
+  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m256i one_b = _mm256_set1_epi8(1);
+  const __m256i zeros = _mm256_setzero_si256();
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < w; j += 16) {
+      const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
+      const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
+      const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
+
+      blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
+                                  round_offset, &v_maxval, shift);
+    }
+    mask += mask_stride;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const __m256i *round_offset, int shift) {
+  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m256i one_b = _mm256_set1_epi8(1);
+  const __m256i zeros = _mm256_setzero_si256();
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < w; j += 32) {
+      const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
+      const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
+      const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
+      const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b);
+      const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
+      const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros);
+
+      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+                                  round_offset, &v_maxval, shift);
+    }
+    mask += mask_stride;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const __m256i *round_offset, int shift) {
+  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i zeros = _mm_setzero_si128();
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < w; j += 16) {
+      const __m128i m_i00 = xx_loadu_128(mask + j);
+      const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
+
+      const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
+      const __m256i m0 = _mm256_cvtepu8_epi16(m_ac);
+
+      blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
+                                  round_offset, &v_maxval, shift);
+    }
+    mask += mask_stride << 1;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const __m256i *round_offset, int shift) {
+  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m256i zeros = _mm256_setzero_si256();
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < w; j += 32) {
+      const __m256i m_i00 = yy_loadu_256(mask + j);
+      const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j);
+
+      const __m256i m_ac =
+          _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros);
+      const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac));
+      const __m256i m1 =
+          _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1));
+
+      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+                                  round_offset, &v_maxval, shift);
+    }
+    mask += mask_stride << 1;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  }
+}
+
+void aom_lowbd_blend_a64_d16_mask_avx2(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+    ConvolveParams *conv_params) {
+  const int bd = 8;
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
+  const int round_offset =
+      ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
+       (1 << (round_bits - 1)))
+      << AOM_BLEND_A64_ROUND_BITS;
+
+  const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
+  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 4);
+  assert(w >= 4);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+  const __m128i v_round_offset = _mm_set1_epi32(round_offset);
+  const __m256i y_round_offset = _mm256_set1_epi32(round_offset);
+
+  if (subw == 0 && subh == 0) {
+    switch (w) {
+      case 4:
+        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      case 8:
+        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      case 16:
+        lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &y_round_offset, shift);
+        break;
+      default:
+        lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, w, &y_round_offset, shift);
+        break;
+    }
+  } else if (subw == 1 && subh == 1) {
+    switch (w) {
+      case 4:
+        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      case 8:
+        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      case 16:
+        lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &y_round_offset, shift);
+        break;
+      default:
+        lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, w, &y_round_offset, shift);
+        break;
+    }
+  } else if (subw == 1 && subh == 0) {
+    switch (w) {
+      case 4:
+        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      case 8:
+        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      case 16:
+        lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, w, &y_round_offset, shift);
+        break;
+      default:
+        lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, w, &y_round_offset, shift);
+        break;
+    }
+  } else {
+    switch (w) {
+      case 4:
+        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      case 8:
+        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      case 16:
+        lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, w, &y_round_offset, shift);
+        break;
+      default:
+        lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, w, &y_round_offset, shift);
+        break;
+    }
+  }
+}
+
+static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1,
+                                       const __m256i *v_m0_b,
+                                       const __m256i *v_m1_b,
+                                       const int32_t bits) {
+  const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0));
+  const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1));
+  const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8);
+  const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8);
+
+  const __m256i v_p0_w =
+      _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b),
+                           _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
+
+  const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
+  const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w);
+  const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8);
+  return v_res;
+}
+
+static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1,
+                                       const __m256i *v_m0_b,
+                                       const __m256i *v_m1_b,
+                                       const int32_t bits) {
+  const __m256i v_s0_b = yy_loadu_256(src0);
+  const __m256i v_s1_b = yy_loadu_256(src1);
+
+  const __m256i v_p0_w =
+      _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b),
+                           _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
+  const __m256i v_p1_w =
+      _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b),
+                           _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b));
+
+  const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
+  const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits);
+  const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w);
+  return v_res;
+}
+
+static INLINE void blend_a64_mask_sx_sy_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h) {
+  const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
+  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  do {
+    const __m256i v_ral_b = yy_loadu_256(mask);
+    const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride);
+    const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
+    const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
+    const __m256i v_rvsbl_w =
+        _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
+    const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
+
+    const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2);
+    const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w);
+    const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+    const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
+                                             AOM_BLEND_A64_ROUND_BITS);
+
+    xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b));
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_sy_w32n_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
+  do {
+    int c;
+    for (c = 0; c < w; c += 32) {
+      const __m256i v_ral_b = yy_loadu_256(mask + 2 * c);
+      const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32);
+      const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c);
+      const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32);
+      const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
+      const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b);
+      const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
+      const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b);
+      const __m256i v_rvsbl_w =
+          _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
+      const __m256i v_rvsbh_w =
+          _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b);
+      const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
+      const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w);
+
+      const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2);
+      const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2);
+      const __m256i v_m0_b =
+          _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8);
+      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+      const __m256i v_res_b = blend_32_u8_avx2(
+          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+      yy_storeu_256(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_sy_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  switch (w) {
+    case 4:
+      do {
+        const __m128i v_ra_b = xx_loadl_64(mask);
+        const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+        const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+        const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+        const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+        const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+        const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
+        const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+        const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+        xx_storel_32(dst, v_res_b);
+
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += 2 * mask_stride;
+      } while (--h);
+      break;
+    case 8:
+      do {
+        const __m128i v_ra_b = xx_loadu_128(mask);
+        const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+        const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+        const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+        const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+        const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+        const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
+        const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+        const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+        xx_storel_64(dst, v_res_b);
+
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += 2 * mask_stride;
+      } while (--h);
+      break;
+    case 16:
+      blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+                                    src1_stride, mask, mask_stride, h);
+      break;
+    default:
+      blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+                                     src1_stride, mask, mask_stride, w, h);
+      break;
+  }
+}
+
+static INLINE void blend_a64_mask_sx_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h) {
+  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m256i v_zmask_b = _mm256_set1_epi16(0xff);
+  do {
+    const __m256i v_rl_b = yy_loadu_256(mask);
+    const __m256i v_al_b =
+        _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1));
+
+    const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b);
+    const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256());
+    const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+    const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
+                                             AOM_BLEND_A64_ROUND_BITS);
+
+    xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b));
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_w32n_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle);
+  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  do {
+    int c;
+    for (c = 0; c < w; c += 32) {
+      const __m256i v_r0_b = yy_loadu_256(mask + 2 * c);
+      const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32);
+      const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b);
+      const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b);
+      const __m256i v_al_b =
+          _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8));
+      const __m256i v_ah_b =
+          _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8));
+
+      const __m256i v_m0_b =
+          _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8);
+      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+      const __m256i v_res_b = blend_32_u8_avx2(
+          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+      yy_storeu_256(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  switch (w) {
+    case 4:
+      do {
+        const __m128i v_r_b = xx_loadl_64(mask);
+        const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+        const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+        const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+        const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+        xx_storel_32(dst, v_res_b);
+
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += mask_stride;
+      } while (--h);
+      break;
+    case 8:
+      do {
+        const __m128i v_r_b = xx_loadu_128(mask);
+        const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+        const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+        const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+        const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+        xx_storel_64(dst, v_res_b);
+
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += mask_stride;
+      } while (--h);
+      break;
+    case 16:
+      blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, mask_stride, h);
+      break;
+    default:
+      blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, w, h);
+      break;
+  }
+}
+
+static INLINE void blend_a64_mask_sy_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h) {
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  do {
+    const __m128i v_ra_b = xx_loadu_128(mask);
+    const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+    const __m128i v_m1_b = _mm_sub_epi16(v_maxval_b, v_m0_b);
+    const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+    xx_storeu_128(dst, v_res_b);
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static INLINE void blend_a64_mask_sy_w32n_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  do {
+    int c;
+    for (c = 0; c < w; c += 32) {
+      const __m256i v_ra_b = yy_loadu_256(mask + c);
+      const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride);
+      const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b);
+      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+      const __m256i v_res_b = blend_32_u8_avx2(
+          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+      yy_storeu_256(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static INLINE void blend_a64_mask_sy_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  switch (w) {
+    case 4:
+      do {
+        const __m128i v_ra_b = xx_loadl_32(mask);
+        const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
+        const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+        xx_storel_32(dst, v_res_b);
+
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += 2 * mask_stride;
+      } while (--h);
+      break;
+    case 8:
+      do {
+        const __m128i v_ra_b = xx_loadl_64(mask);
+        const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+        const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+        xx_storel_64(dst, v_res_b);
+
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += 2 * mask_stride;
+      } while (--h);
+      break;
+    case 16:
+      blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, mask_stride, h);
+      break;
+    default:
+      blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, w, h);
+  }
+}
+
+static INLINE void blend_a64_mask_w32n_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  do {
+    int c;
+    for (c = 0; c < w; c += 32) {
+      const __m256i v_m0_b = yy_loadu_256(mask + c);
+      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+      const __m256i v_res_b = blend_32_u8_avx2(
+          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+      yy_storeu_256(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static INLINE void blend_a64_mask_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  switch (w) {
+    case 4:
+      do {
+        const __m128i v_m0_b = xx_loadl_32(mask);
+        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+        xx_storel_32(dst, v_res_b);
+
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += mask_stride;
+      } while (--h);
+      break;
+    case 8:
+      do {
+        const __m128i v_m0_b = xx_loadl_64(mask);
+        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+        xx_storel_64(dst, v_res_b);
+
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += mask_stride;
+      } while (--h);
+      break;
+    case 16:
+      do {
+        const __m128i v_m0_b = xx_loadu_128(mask);
+        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+        const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+        xx_storeu_128(dst, v_res_b);
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += mask_stride;
+      } while (--h);
+      break;
+    default:
+      blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, mask_stride, w, h);
+  }
+}
+
+void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride,
+                             const uint8_t *src0, uint32_t src0_stride,
+                             const uint8_t *src1, uint32_t src1_stride,
+                             const uint8_t *mask, uint32_t mask_stride, int w,
+                             int h, int subx, int suby) {
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
+    aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+                         mask, mask_stride, w, h, subx, suby);
+  } else {
+    if (subx & suby) {
+      blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
+                                src1_stride, mask, mask_stride, w, h);
+    } else if (subx) {
+      blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1,
+                             src1_stride, mask, mask_stride, w, h);
+    } else if (suby) {
+      blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
+                             src1_stride, mask, mask_stride, w, h);
+    } else {
+      blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+                          mask, mask_stride, w, h);
+    }
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
index 49c20b467..9d6b4c2f7 100644
--- a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
+++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -20,6 +20,7 @@
 
 #include "aom_dsp/x86/synonyms.h"
 #include "aom_dsp/x86/blend_sse4.h"
+#include "aom_dsp/x86/blend_mask_sse4.h"
 
 #include "config/aom_dsp_rtcd.h"
 
@@ -32,19 +33,13 @@ static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                      const uint8_t *src1, uint32_t src1_stride,
                                      const uint8_t *mask, uint32_t mask_stride,
                                      int w, int h) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
   (void)w;
-
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
   do {
     const __m128i v_m0_b = xx_loadl_32(mask);
-    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
-
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
-
+    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     xx_storel_32(dst, v_res_b);
 
     dst += dst_stride;
@@ -59,19 +54,13 @@ static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                      const uint8_t *src1, uint32_t src1_stride,
                                      const uint8_t *mask, uint32_t mask_stride,
                                      int w, int h) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
   (void)w;
-
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
   do {
     const __m128i v_m0_b = xx_loadl_64(mask);
-    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
-
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
-
+    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     xx_storel_64(dst, v_res_b);
 
     dst += dst_stride;
@@ -85,23 +74,17 @@ static void blend_a64_mask_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
 
   do {
     int c;
     for (c = 0; c < w; c += 16) {
-      const __m128i v_m0l_b = xx_loadl_64(mask + c);
-      const __m128i v_m0h_b = xx_loadl_64(mask + c + 8);
-      const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b);
-      const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b);
-      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
-      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
-
-      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
-      const __m128i v_resh_w =
-          blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
+      const __m128i v_m0_b = xx_loadu_128(mask + c);
+      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
 
-      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+      const __m128i v_res_b =
+          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
 
       xx_storeu_128(dst + c, v_res_b);
     }
@@ -120,23 +103,20 @@ static void blend_a64_mask_sx_w4_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
   (void)w;
 
+  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
   do {
     const __m128i v_r_b = xx_loadl_64(mask);
-    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
-
-    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
-
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+    const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+    const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+    const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+    const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
 
+    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     xx_storel_32(dst, v_res_b);
 
     dst += dst_stride;
@@ -150,22 +130,20 @@ static void blend_a64_mask_sx_w8_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
   (void)w;
 
+  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
   do {
     const __m128i v_r_b = xx_loadu_128(mask);
-    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
-
-    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+    const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+    const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+    const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+    const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
 
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
 
     xx_storel_64(dst, v_res_b);
 
@@ -180,28 +158,24 @@ static void blend_a64_mask_sx_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
 
   do {
     int c;
     for (c = 0; c < w; c += 16) {
-      const __m128i v_rl_b = xx_loadu_128(mask + 2 * c);
-      const __m128i v_rh_b = xx_loadu_128(mask + 2 * c + 16);
-      const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1));
-      const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1));
-
-      const __m128i v_m0l_w = _mm_and_si128(v_al_b, v_zmask_b);
-      const __m128i v_m0h_w = _mm_and_si128(v_ah_b, v_zmask_b);
-      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
-      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
-
-      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
-      const __m128i v_resh_w =
-          blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
-
-      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+      const __m128i v_r0_b = xx_loadu_128(mask + 2 * c);
+      const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16);
+      const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b);
+      const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b);
+      const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b);
+      const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b);
+      const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+      const __m128i v_res_b =
+          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
 
       xx_storeu_128(dst + c, v_res_b);
     }
@@ -220,21 +194,18 @@ static void blend_a64_mask_sy_w4_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
   (void)w;
 
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
   do {
     const __m128i v_ra_b = xx_loadl_32(mask);
     const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
-    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
 
-    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
-
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
 
     xx_storel_32(dst, v_res_b);
 
@@ -249,21 +220,16 @@ static void blend_a64_mask_sy_w8_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
   (void)w;
 
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
   do {
     const __m128i v_ra_b = xx_loadl_64(mask);
     const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
-    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-
-    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
-
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
 
     xx_storel_64(dst, v_res_b);
 
@@ -278,26 +244,18 @@ static void blend_a64_mask_sy_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_zero = _mm_setzero_si128();
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
   do {
     int c;
     for (c = 0; c < w; c += 16) {
       const __m128i v_ra_b = xx_loadu_128(mask + c);
       const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
-      const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+      const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
 
-      const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b);
-      const __m128i v_m0h_w = _mm_unpackhi_epi8(v_a_b, v_zero);
-      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
-      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
-
-      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
-      const __m128i v_resh_w =
-          blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
-
-      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+      const __m128i v_res_b =
+          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
 
       xx_storeu_128(dst + c, v_res_b);
     }
@@ -316,27 +274,24 @@ static void blend_a64_mask_sx_sy_w4_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
+  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
   (void)w;
 
   do {
     const __m128i v_ra_b = xx_loadl_64(mask);
     const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
     const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
-    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
-    const __m128i v_rvsb_w =
-        _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
-    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
-
+    const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+    const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+    const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+    const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
     const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+    const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
 
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
 
     xx_storel_32(dst, v_res_b);
 
@@ -351,27 +306,25 @@ static void blend_a64_mask_sx_sy_w8_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
+  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
   (void)w;
 
   do {
     const __m128i v_ra_b = xx_loadu_128(mask);
     const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
-    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
-    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
-    const __m128i v_rvsb_w =
-        _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
-    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
 
+    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+    const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+    const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+    const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+    const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
     const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+    const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
 
-    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
-
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
 
     xx_storel_64(dst, v_res_b);
 
@@ -388,8 +341,8 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1(
     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
   do {
     int c;
     for (c = 0; c < w; c += 16) {
@@ -410,14 +363,11 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1(
 
       const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
       const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
-      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
-      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
-
-      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
-      const __m128i v_resh_w =
-          blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
+      const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w);
+      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
 
-      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+      const __m128i v_res_b =
+          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
 
       xx_storeu_128(dst + c, v_res_b);
     }
@@ -921,24 +871,140 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
   }
 }
 
-static INLINE void blend_a64_d16_mask(uint8_t *dst, const CONV_BUF_TYPE *src0,
-                                      const CONV_BUF_TYPE *src1,
-                                      const __m128i *m,
-                                      const __m128i *v_round_offset,
-                                      const __m128i *v_maxval, int round_bits) {
-  const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
-  const __m128i s0 = xx_loadl_64(src0);
-  const __m128i s1 = xx_loadl_64(src1);
-  const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1);
-  const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m);
-  const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m);
-  const __m128i res_b = _mm_srli_epi32(res_a, AOM_BLEND_A64_ROUND_BITS);
-  const __m128i res_c = _mm_sub_epi32(res_b, *v_round_offset);
-  const __m128i res_d = xx_roundn_epi32(res_c, round_bits);
-  const __m128i res_e = _mm_packs_epi32(res_d, res_d);
-  const __m128i res = _mm_packus_epi16(res_e, res_e);
-
-  xx_storel_32(dst, res);
+static INLINE void blend_a64_d16_mask_w16_sse41(
+    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+    const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset,
+    const __m128i *v_maxval, int shift) {
+  const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0);
+  const __m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1);
+  const __m128i s0_0 = xx_loadu_128(src0);
+  const __m128i s0_1 = xx_loadu_128(src0 + 8);
+  const __m128i s1_0 = xx_loadu_128(src1);
+  const __m128i s1_1 = xx_loadu_128(src1 + 8);
+  __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0),
+                                   _mm_unpacklo_epi16(*m0, max_minus_m0));
+  __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0),
+                                   _mm_unpackhi_epi16(*m0, max_minus_m0));
+  __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1),
+                                   _mm_unpacklo_epi16(*m1, max_minus_m1));
+  __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1),
+                                   _mm_unpackhi_epi16(*m1, max_minus_m1));
+  res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift);
+  res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift);
+  res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift);
+  res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift);
+  const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi);
+  const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi);
+  const __m128i res = _mm_packus_epi16(res0, res1);
+
+  _mm_storeu_si128((__m128i *)(dst), res);
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const __m128i *round_offset, int shift) {
+  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < w; j += 16) {
+      const __m128i m = xx_loadu_128(mask + j);
+      const __m128i m0 = _mm_cvtepu8_epi16(m);
+      const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8));
+
+      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+                                   round_offset, &v_maxval, shift);
+    }
+    mask += mask_stride;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const __m128i *round_offset, int shift) {
+  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i one_b = _mm_set1_epi8(1);
+  const __m128i two_w = _mm_set1_epi16(2);
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < w; j += 16) {
+      const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
+      const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
+      const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j);
+      const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16);
+
+      const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10);
+      const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11);
+      const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b);
+      const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b);
+      const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2);
+      const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2);
+
+      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+                                   round_offset, &v_maxval, shift);
+    }
+    mask += mask_stride << 1;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const __m128i *round_offset, int shift) {
+  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i one_b = _mm_set1_epi8(1);
+  const __m128i zeros = _mm_setzero_si128();
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < w; j += 16) {
+      const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
+      const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
+      const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b);
+      const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b);
+      const __m128i m0 = _mm_avg_epu16(m0_ac, zeros);
+      const __m128i m1 = _mm_avg_epu16(m1_ac, zeros);
+
+      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+                                   round_offset, &v_maxval, shift);
+    }
+    mask += mask_stride;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const __m128i *round_offset, int shift) {
+  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i zeros = _mm_setzero_si128();
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < w; j += 16) {
+      const __m128i m_i00 = xx_loadu_128(mask + j);
+      const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
+
+      const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
+      const __m128i m0 = _mm_cvtepu8_epi16(m_ac);
+      const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8));
+
+      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+                                   round_offset, &v_maxval, shift);
+    }
+    mask += mask_stride << 1;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  }
 }
 
 void aom_lowbd_blend_a64_d16_mask_sse4_1(
@@ -947,12 +1013,15 @@ void aom_lowbd_blend_a64_d16_mask_sse4_1(
     const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
     ConvolveParams *conv_params) {
   const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
-                           (1 << (offset_bits - conv_params->round_1 - 1));
   const int round_bits =
       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
 
+  const int round_offset =
+      ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
+       (1 << (round_bits - 1)))
+      << AOM_BLEND_A64_ROUND_BITS;
+
+  const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
   assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
   assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
 
@@ -961,69 +1030,80 @@ void aom_lowbd_blend_a64_d16_mask_sse4_1(
   assert(IS_POWER_OF_TWO(h));
   assert(IS_POWER_OF_TWO(w));
 
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i v_ro_a = xx_loadl_32(&round_offset);
-  const __m128i v_round_offset = _mm_shuffle_epi32(v_ro_a, 0);
-  const __m128i one_w = _mm_set1_epi16(1);
-  const __m128i one_b = _mm_set1_epi8(1);
-  const __m128i two_w = _mm_set1_epi16(2);
+  const __m128i v_round_offset = _mm_set1_epi32(round_offset);
 
   if (subw == 0 && subh == 0) {
-    for (int i = 0; i < h; ++i) {
-      for (int j = 0; j < w; j += 4) {
-        const __m128i m0 = xx_loadl_32(&mask[i * mask_stride + j]);
-        const __m128i m = _mm_cvtepu8_epi16(m0);
-
-        blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
-                           &src1[i * src1_stride + j], &m, &v_round_offset,
-                           &v_maxval, round_bits);
-      }
+    switch (w) {
+      case 4:
+        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      case 8:
+        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      default:
+        lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, w, &v_round_offset, shift);
+        break;
     }
+
   } else if (subw == 1 && subh == 1) {
-    for (int i = 0; i < h; ++i) {
-      for (int j = 0; j < w; j += 4) {
-        const __m128i m_i0 =
-            xx_loadl_64(&mask[(2 * i) * mask_stride + (2 * j)]);
-        const __m128i m_i1 =
-            xx_loadl_64(&mask[(2 * i + 1) * mask_stride + (2 * j)]);
-        const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
-        const __m128i m_bd = _mm_maddubs_epi16(m_i1, one_b);
-        const __m128i m_acbd = _mm_add_epi16(m_ac, m_bd);
-        const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
-        const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
-
-        blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
-                           &src1[i * src1_stride + j], &m, &v_round_offset,
-                           &v_maxval, round_bits);
-      }
+    switch (w) {
+      case 4:
+        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      case 8:
+        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      default:
+        lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, w, &v_round_offset, shift);
+        break;
     }
   } else if (subw == 1 && subh == 0) {
-    for (int i = 0; i < h; ++i) {
-      for (int j = 0; j < w; j += 4) {
-        const __m128i m_i0 = xx_loadl_64(&mask[i * mask_stride + (2 * j)]);
-        const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
-        const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w);
-        const __m128i m = _mm_srli_epi16(m_ac_1, 1);
-
-        blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
-                           &src1[i * src1_stride + j], &m, &v_round_offset,
-                           &v_maxval, round_bits);
-      }
+    switch (w) {
+      case 4:
+        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      case 8:
+        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      default:
+        lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, w, &v_round_offset, shift);
+        break;
     }
   } else {
-    for (int i = 0; i < h; ++i) {
-      for (int j = 0; j < w; j += 4) {
-        const __m128i m_i0 = xx_loadl_64(&mask[(2 * i) * mask_stride + j]);
-        const __m128i m_i1 = xx_loadl_64(&mask[(2 * i + 1) * mask_stride + j]);
-        const __m128i m_i01 = _mm_unpacklo_epi8(m_i0, m_i1);
-        const __m128i m_ac = _mm_maddubs_epi16(m_i01, one_b);
-        const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w);
-        const __m128i m = _mm_srli_epi16(m_ac_1, 1);
-
-        blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
-                           &src1[i * src1_stride + j], &m, &v_round_offset,
-                           &v_maxval, round_bits);
-      }
+    switch (w) {
+      case 4:
+        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      case 8:
+        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, &v_round_offset, shift);
+        break;
+      default:
+        lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
+            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+            mask_stride, h, w, &v_round_offset, shift);
+        break;
     }
   }
 }
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
index 59506bdfe..064910232 100644
--- a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
+++ b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
@@ -39,7 +39,7 @@ static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
     const __m128i v_m0_w = _mm_set1_epi16(*mask);
     const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
 
-    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+    const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w);
 
     const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
 
@@ -64,7 +64,7 @@ static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
     const __m128i v_m0_w = _mm_set1_epi16(*mask);
     const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
 
-    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+    const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w);
 
     const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
 
@@ -90,9 +90,9 @@ static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
     const __m128i v_m0_w = _mm_set1_epi16(*mask);
     const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
     for (c = 0; c < w; c += 16) {
-      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0_w, v_m1_w);
+      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w);
       const __m128i v_resh_w =
-          blend_8(src0 + c + 8, src1 + c + 8, v_m0_w, v_m1_w);
+          blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w);
 
       const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
 
diff --git a/third_party/aom/aom_dsp/x86/blend_mask_sse4.h b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h
new file mode 100644
index 000000000..c071fdcfc
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
+#define AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
+#include <smmintrin.h>  // SSE4.1
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void blend_a64_d16_mask_w4_sse41(
+    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+    const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
+    int shift) {
+  const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
+  const __m128i s0 = xx_loadl_64(src0);
+  const __m128i s1 = xx_loadl_64(src1);
+  const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1);
+  const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m);
+  const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m);
+  const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset);
+  const __m128i res_d = _mm_srai_epi32(res_c, shift);
+  const __m128i res_e = _mm_packs_epi32(res_d, res_d);
+  const __m128i res = _mm_packus_epi16(res_e, res_e);
+
+  xx_storel_32(dst, res);
+}
+
+static INLINE void blend_a64_d16_mask_w8_sse41(
+    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+    const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
+    int shift) {
+  const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
+  const __m128i s0 = xx_loadu_128(src0);
+  const __m128i s1 = xx_loadu_128(src1);
+  __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1),
+                                  _mm_unpacklo_epi16(*m, max_minus_m));
+  __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1),
+                                  _mm_unpackhi_epi16(*m, max_minus_m));
+  res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift);
+  res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift);
+  const __m128i res_e = _mm_packs_epi32(res_lo, res_hi);
+  const __m128i res = _mm_packus_epi16(res_e, res_e);
+
+  _mm_storel_epi64((__m128i *)(dst), res);
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m128i *round_offset, int shift) {
+  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  for (int i = 0; i < h; ++i) {
+    const __m128i m0 = xx_loadl_32(mask);
+    const __m128i m = _mm_cvtepu8_epi16(m0);
+
+    blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+                                shift);
+    mask += mask_stride;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m128i *round_offset, int shift) {
+  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  for (int i = 0; i < h; ++i) {
+    const __m128i m0 = xx_loadl_64(mask);
+    const __m128i m = _mm_cvtepu8_epi16(m0);
+    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+                                shift);
+    mask += mask_stride;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m128i *round_offset, int shift) {
+  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i one_b = _mm_set1_epi8(1);
+  const __m128i two_w = _mm_set1_epi16(2);
+  for (int i = 0; i < h; ++i) {
+    const __m128i m_i0 = xx_loadl_64(mask);
+    const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
+    const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
+    const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
+    const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
+    const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
+
+    blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+                                shift);
+    mask += mask_stride << 1;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m128i *round_offset, int shift) {
+  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i one_b = _mm_set1_epi8(1);
+  const __m128i two_w = _mm_set1_epi16(2);
+  for (int i = 0; i < h; ++i) {
+    const __m128i m_i0 = xx_loadu_128(mask);
+    const __m128i m_i1 = xx_loadu_128(mask + mask_stride);
+    const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
+    const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
+    const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
+    const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
+
+    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+                                shift);
+    mask += mask_stride << 1;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m128i *round_offset, int shift) {
+  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i one_b = _mm_set1_epi8(1);
+  const __m128i zeros = _mm_setzero_si128();
+  for (int i = 0; i < h; ++i) {
+    const __m128i m_i0 = xx_loadl_64(mask);
+    const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
+    const __m128i m = _mm_avg_epu16(m_ac, zeros);
+
+    blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+                                shift);
+    mask += mask_stride;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m128i *round_offset, int shift) {
+  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i one_b = _mm_set1_epi8(1);
+  const __m128i zeros = _mm_setzero_si128();
+  for (int i = 0; i < h; ++i) {
+    const __m128i m_i0 = xx_loadu_128(mask);
+    const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
+    const __m128i m = _mm_avg_epu16(m_ac, zeros);
+
+    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+                                shift);
+    mask += mask_stride;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  }
+}
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m128i *round_offset, int shift) {
+  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i zeros = _mm_setzero_si128();
+  for (int i = 0; i < h; ++i) {
+    const __m128i m_i0 = xx_loadl_64(mask);
+    const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
+    const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
+    const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));
+
+    blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+                                shift);
+    mask += mask_stride << 1;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m128i *round_offset, int shift) {
+  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i zeros = _mm_setzero_si128();
+  for (int i = 0; i < h; ++i) {
+    const __m128i m_i0 = xx_loadl_64(mask);
+    const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
+    const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
+    const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));
+
+    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+                                shift);
+    mask += mask_stride << 1;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  }
+}
+#endif  // AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h
index 4880438bc..8d9b32510 100644
--- a/third_party/aom/aom_dsp/x86/blend_sse4.h
+++ b/third_party/aom/aom_dsp/x86/blend_sse4.h
@@ -9,42 +9,44 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AOM_DSP_X86_BLEND_SSE4_H_
-#define AOM_DSP_X86_BLEND_SSE4_H_
+#ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_
+#define AOM_AOM_DSP_X86_BLEND_SSE4_H_
 
 #include "aom_dsp/blend.h"
 #include "aom_dsp/x86/synonyms.h"
+static const uint8_t g_blend_a64_mask_shuffle[32] = {
+  0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+  0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+};
 
 //////////////////////////////////////////////////////////////////////////////
 // Common kernels
 //////////////////////////////////////////////////////////////////////////////
 
 static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
-                              const __m128i v_m0_w, const __m128i v_m1_w) {
+                              const __m128i *v_m0_w, const __m128i *v_m1_w) {
   const __m128i v_s0_b = xx_loadl_32(src0);
   const __m128i v_s1_b = xx_loadl_32(src1);
   const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
   const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
 
-  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
-  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
   const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
   const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
 
   return v_res_w;
 }
 
 static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
-                              const __m128i v_m0_w, const __m128i v_m1_w) {
+                              const __m128i *v_m0_w, const __m128i *v_m1_w) {
   const __m128i v_s0_b = xx_loadl_64(src0);
   const __m128i v_s1_b = xx_loadl_64(src1);
   const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
   const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
 
-  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
-  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
 
   const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
 
@@ -53,6 +55,51 @@ static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
   return v_res_w;
 }
 
+static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1,
+                                 const __m128i *v_m0_b, const __m128i *v_m1_b,
+                                 const __m128i *rounding) {
+  const __m128i v_s0_b = xx_loadl_32(src0);
+  const __m128i v_s1_b = xx_loadl_32(src1);
+
+  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
+                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
+
+  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
+  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
+  return v_res;
+}
+
+static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1,
+                                 const __m128i *v_m0_b, const __m128i *v_m1_b,
+                                 const __m128i *rounding) {
+  const __m128i v_s0_b = xx_loadl_64(src0);
+  const __m128i v_s1_b = xx_loadl_64(src1);
+
+  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
+                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
+
+  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
+  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
+  return v_res;
+}
+
+static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1,
+                                  const __m128i *v_m0_b, const __m128i *v_m1_b,
+                                  const __m128i *rounding) {
+  const __m128i v_s0_b = xx_loadu_128(src0);
+  const __m128i v_s1_b = xx_loadu_128(src1);
+
+  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
+                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
+  const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b),
+                                           _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
+
+  const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
+  const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
+  const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w);
+  return v_res;
+}
+
 typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
                                  const __m128i v_m0_w, const __m128i v_m1_w);
 
@@ -141,4 +188,4 @@ static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
   return v_res_w;
 }
 
-#endif  // AOM_DSP_X86_BLEND_SSE4_H_
+#endif  // AOM_AOM_DSP_X86_BLEND_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h
index 3f46420dd..96fe4ebb6 100644
--- a/third_party/aom/aom_dsp/x86/common_avx2.h
+++ b/third_party/aom/aom_dsp/x86/common_avx2.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AOM_DSP_X86_COMMON_AVX2_H
-#define AOM_DSP_X86_COMMON_AVX2_H
+#ifndef AOM_AOM_DSP_X86_COMMON_AVX2_H_
+#define AOM_AOM_DSP_X86_COMMON_AVX2_H_
 
 #include <immintrin.h>
 
@@ -144,4 +144,4 @@ static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
   out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
   out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
 }
-#endif
+#endif  // AOM_AOM_DSP_X86_COMMON_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h
index 36fb1963a..3e19682cd 100644
--- a/third_party/aom/aom_dsp/x86/convolve.h
+++ b/third_party/aom/aom_dsp/x86/convolve.h
@@ -8,8 +8,8 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#ifndef AOM_DSP_X86_CONVOLVE_H_
-#define AOM_DSP_X86_CONVOLVE_H_
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_H_
 
 #include <assert.h>
 
@@ -17,7 +17,6 @@
 
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
-#include "aom_dsp/aom_convolve.h"
 
 typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
@@ -34,7 +33,30 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
     (void)y_step_q4;                                                         \
     assert((-128 <= filter[3]) && (filter[3] <= 127));                       \
     assert(step_q4 == 16);                                                   \
-    if (filter[0] | filter[1] | filter[2]) {                                 \
+    if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) &&            \
+        (filter[2] | filter[5])) {                                           \
+      while (w >= 16) {                                                      \
+        aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
+                                                 dst_stride, h, filter);     \
+        src += 16;                                                           \
+        dst += 16;                                                           \
+        w -= 16;                                                             \
+      }                                                                      \
+      while (w >= 8) {                                                       \
+        aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst,  \
+                                                dst_stride, h, filter);      \
+        src += 8;                                                            \
+        dst += 8;                                                            \
+        w -= 8;                                                              \
+      }                                                                      \
+      while (w >= 4) {                                                       \
+        aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst,  \
+                                                dst_stride, h, filter);      \
+        src += 4;                                                            \
+        dst += 4;                                                            \
+        w -= 4;                                                              \
+      }                                                                      \
+    } else if (filter[0] | filter[1] | filter[2]) {                          \
       while (w >= 16) {                                                      \
         aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
                                                  dst_stride, h, filter);     \
@@ -153,4 +175,4 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
     }                                                                      \
   }
 
-#endif  // AOM_DSP_X86_CONVOLVE_H_
+#endif  // AOM_AOM_DSP_X86_CONVOLVE_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h
index 72fabd236..30253f65c 100644
--- a/third_party/aom/aom_dsp/x86/convolve_avx2.h
+++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AOM_DSP_X86_CONVOLVE_AVX2_H_
-#define AOM_DSP_X86_CONVOLVE_AVX2_H_
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
 
 // filters for 16
 DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = {
@@ -29,6 +29,11 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = {
   7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
 };
 
+DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = {
+  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
+  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
+};
+
 static INLINE void prepare_coeffs_lowbd(
     const InterpFilterParams *const filter_params, const int subpel_q4,
     __m256i *const coeffs /* [4] */) {
@@ -191,4 +196,4 @@ static INLINE __m256i highbd_convolve_rounding(
   return res_round;
 }
 
-#endif
+#endif  // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
index e80c5872f..707bd2d78 100644
--- a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
+++ b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef _AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
-#define _AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
 
 // Note:
 //  This header file should be put below any x86 intrinsics head file
@@ -28,4 +28,4 @@ static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res,
   _mm_store_si128((__m128i *)dst, d);
 }
 
-#endif  // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
+#endif  // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h
index 399df5d6d..445d04b10 100644
--- a/third_party/aom/aom_dsp/x86/convolve_sse2.h
+++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AOM_DSP_X86_CONVOLVE_SSE2_H_
-#define AOM_DSP_X86_CONVOLVE_SSE2_H_
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
 
 // Note:
 //  This header file should be put below any x86 intrinsics head file
@@ -118,4 +118,4 @@ static INLINE __m128i highbd_convolve_rounding_sse2(
   return res_round;
 }
 
-#endif
+#endif  // AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
index d48c25667..6b8388d84 100644
--- a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
+++ b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef _AOM_DSP_X86_CONVOLVE_SSE4_1_INTRIN_H_
-#define _AOM_DSP_X86_CONVOLVE_SSE4_1_INTRIN_H_
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
 
 // Note:
 //  This header file should be put below any x86 intrinsics head file
@@ -50,4 +50,4 @@ static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0,
   return res;
 }
 
-#endif  // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
+#endif  // AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
index 12ccf7f26..260d8dd58 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AOM_DSP_X86_FWD_TXFM_SSE2_H_
-#define AOM_DSP_X86_FWD_TXFM_SSE2_H_
+#ifndef AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
+#define AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -152,4 +152,4 @@ static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
 }  // extern "C"
 #endif
 
-#endif  // AOM_DSP_X86_FWD_TXFM_SSE2_H_
+#endif  // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm
deleted file mode 100644
index 99f17ebdf..000000000
--- a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm
+++ /dev/null
@@ -1,351 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-SECTION .text
-
-;void aom_half_horiz_vert_variance16x_h_sse2(unsigned char *ref,
-;                                            int ref_stride,
-;                                            unsigned char *src,
-;                                            int src_stride,
-;                                            unsigned int height,
-;                                            int *sum,
-;                                            unsigned int *sumsquared)
-global sym(aom_half_horiz_vert_variance16x_h_sse2) PRIVATE
-sym(aom_half_horiz_vert_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0) ;ref
-
-        mov             rdi,            arg(2) ;src
-        movsxd          rcx,            dword ptr arg(4) ;height
-        movsxd          rax,            dword ptr arg(1) ;ref_stride
-        movsxd          rdx,            dword ptr arg(3)    ;src_stride
-
-        pxor            xmm0,           xmm0                ;
-
-        movdqu          xmm5,           XMMWORD PTR [rsi]
-        movdqu          xmm3,           XMMWORD PTR [rsi+1]
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3) horizontal line 1
-
-        lea             rsi,            [rsi + rax]
-
-aom_half_horiz_vert_variance16x_h_1:
-        movdqu          xmm1,           XMMWORD PTR [rsi]     ;
-        movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;
-        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm3) horizontal line i+1
-
-        pavgb           xmm5,           xmm1                ;  xmm = vertical average of the above
-
-        movdqa          xmm4,           xmm5
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-        punpckhbw       xmm4,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-
-        movq            xmm3,           QWORD PTR [rdi+8]
-        punpcklbw       xmm3,           xmm0
-        psubw           xmm4,           xmm3
-
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm4
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm4,           xmm4
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm4
-
-        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1                   ;
-        jnz             aom_half_horiz_vert_variance16x_h_1     ;
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void aom_half_vert_variance16x_h_sse2(unsigned char *ref,
-;                                      int ref_stride,
-;                                      unsigned char *src,
-;                                      int src_stride,
-;                                      unsigned int height,
-;                                      int *sum,
-;                                      unsigned int *sumsquared)
-global sym(aom_half_vert_variance16x_h_sse2) PRIVATE
-sym(aom_half_vert_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0)              ;ref
-
-        mov             rdi,            arg(2)              ;src
-        movsxd          rcx,            dword ptr arg(4)    ;height
-        movsxd          rax,            dword ptr arg(1)    ;ref_stride
-        movsxd          rdx,            dword ptr arg(3)    ;src_stride
-
-        movdqu          xmm5,           XMMWORD PTR [rsi]
-        lea             rsi,            [rsi + rax          ]
-        pxor            xmm0,           xmm0
-
-aom_half_vert_variance16x_h_1:
-        movdqu          xmm3,           XMMWORD PTR [rsi]
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
-        movdqa          xmm4,           xmm5
-        punpcklbw       xmm5,           xmm0
-        punpckhbw       xmm4,           xmm0
-
-        movq            xmm2,           QWORD PTR [rdi]
-        punpcklbw       xmm2,           xmm0
-        psubw           xmm5,           xmm2
-        movq            xmm2,           QWORD PTR [rdi+8]
-        punpcklbw       xmm2,           xmm0
-        psubw           xmm4,           xmm2
-
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm4
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm4,           xmm4
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm4
-
-        movdqa          xmm5,           xmm3
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1
-        jnz             aom_half_vert_variance16x_h_1
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void aom_half_horiz_variance16x_h_sse2(unsigned char *ref,
-;                                       int ref_stride
-;                                       unsigned char *src,
-;                                       int src_stride,
-;                                       unsigned int height,
-;                                       int *sum,
-;                                       unsigned int *sumsquared)
-global sym(aom_half_horiz_variance16x_h_sse2) PRIVATE
-sym(aom_half_horiz_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0) ;ref
-
-        mov             rdi,            arg(2) ;src
-        movsxd          rcx,            dword ptr arg(4) ;height
-        movsxd          rax,            dword ptr arg(1) ;ref_stride
-        movsxd          rdx,            dword ptr arg(3)    ;src_stride
-
-        pxor            xmm0,           xmm0                ;
-
-aom_half_horiz_variance16x_h_1:
-        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
-        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
-        movdqa          xmm1,           xmm5
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-        punpckhbw       xmm1,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-        movq            xmm2,           QWORD PTR [rdi+8]
-        punpcklbw       xmm2,           xmm0
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        psubw           xmm1,           xmm2
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm1
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm1,           xmm1
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm1
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1                   ;
-        jnz             aom_half_horiz_variance16x_h_1        ;
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-;    short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
-align 16
-xmm_bi_rd:
-    times 8 dw 64
-align 16
-aom_bilinear_filters_sse2:
-    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
-    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
-    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
-    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
-    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
-    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
-    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c b/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c
deleted file mode 100644
index 2a018c1cf..000000000
--- a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-
-void aom_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref,
-                                            int ref_stride,
-                                            const unsigned char *src,
-                                            int src_stride, unsigned int height,
-                                            int *sum, unsigned int *sumsquared);
-void aom_half_horiz_variance16x_h_sse2(const unsigned char *ref, int ref_stride,
-                                       const unsigned char *src, int src_stride,
-                                       unsigned int height, int *sum,
-                                       unsigned int *sumsquared);
-void aom_half_vert_variance16x_h_sse2(const unsigned char *ref, int ref_stride,
-                                      const unsigned char *src, int src_stride,
-                                      unsigned int height, int *sum,
-                                      unsigned int *sumsquared);
-
-uint32_t aom_variance_halfpixvar16x16_h_sse2(const unsigned char *src,
-                                             int src_stride,
-                                             const unsigned char *dst,
-                                             int dst_stride, uint32_t *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  aom_half_horiz_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
-                                    &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  assert(xsum0 <= 255 * 16 * 16);
-  assert(xsum0 >= -255 * 16 * 16);
-  return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
-}
-
-uint32_t aom_variance_halfpixvar16x16_v_sse2(const unsigned char *src,
-                                             int src_stride,
-                                             const unsigned char *dst,
-                                             int dst_stride, uint32_t *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-  aom_half_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, &xsum0,
-                                   &xxsum0);
-
-  *sse = xxsum0;
-  assert(xsum0 <= 255 * 16 * 16);
-  assert(xsum0 >= -255 * 16 * 16);
-  return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
-}
-
-uint32_t aom_variance_halfpixvar16x16_hv_sse2(const unsigned char *src,
-                                              int src_stride,
-                                              const unsigned char *dst,
-                                              int dst_stride, uint32_t *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  aom_half_horiz_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
-                                         &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  assert(xsum0 <= 255 * 16 * 16);
-  assert(xsum0 >= -255 * 16 * 16);
-  return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
index 83e0098ba..097e0778f 100644
--- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -327,6 +327,7 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
     __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt,
     const unsigned char *lt, const unsigned char *thr, int bd) {
   int i;
+  const __m128i zero = _mm_setzero_si128();
   __m128i blimit, limit, thresh;
   __m128i t80;
   get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80);
@@ -355,13 +356,18 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
   flat2 = _mm_unpacklo_epi64(flat2, flat2);
 
   // flat and wide flat calculations
-  __m128i flat_p[3], flat_q[3], flat_pq[3];
-  __m128i flat2_p[6], flat2_q[6];
-  __m128i flat2_pq[6];
-  {
-    __m128i work0;
+
+  // if flat ==0 then flat2 is zero as well and we don't need any calc below
+  // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
+  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+    __m128i flat_p[3], flat_q[3], flat_pq[3];
+    __m128i flat2_p[6], flat2_q[6];
+    __m128i flat2_pq[6];
+    __m128i sum_p6, sum_p3;
     const __m128i eight = _mm_set1_epi16(8);
     const __m128i four = _mm_set1_epi16(4);
+
+    __m128i work0, work0_0, work0_1, sum_p_0;
     __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3]));
     __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1]));
     sum_p = _mm_add_epi16(sum_p, sum_lp);
@@ -369,30 +375,23 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
     __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
     __m128i sum_q = _mm_srli_si128(sum_p, 8);
 
-    sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+    sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
     sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
 
-    work0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
-    flat2_p[0] = _mm_add_epi16(sum_p, _mm_add_epi16(work0, q[0]));
-    flat2_q[0] =
-        _mm_add_epi16(sum_p, _mm_add_epi16(_mm_srli_si128(work0, 8), p[0]));
-
-    flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0]));
+    flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0]));
     flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0]));
 
-    __m128i sum_p6, sum_p3;
     sum_p6 = _mm_add_epi16(pq[6], pq[6]);
     sum_p3 = _mm_add_epi16(pq[3], pq[3]);
 
-    sum_q = _mm_sub_epi16(sum_p, p[5]);
-    sum_p = _mm_sub_epi16(sum_p, q[5]);
+    sum_q = _mm_sub_epi16(sum_p_0, pq[5]);
+    sum_p = _mm_sub_epi16(sum_p_0, q[5]);
 
-    work0 = _mm_add_epi16(sum_p6,
-                          _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
-    flat2_p[1] = _mm_add_epi16(sum_p, work0);
-    flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+    work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
+    work0_1 = _mm_add_epi16(sum_p6,
+                            _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
 
-    sum_lq = _mm_sub_epi16(sum_lp, p[2]);
+    sum_lq = _mm_sub_epi16(sum_lp, pq[2]);
     sum_lp = _mm_sub_epi16(sum_lp, q[2]);
 
     work0 = _mm_add_epi16(sum_p3, pq[1]);
@@ -402,21 +401,8 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
     flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
     flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
 
-    flat2_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
-    flat2_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
-
-    sum_p = _mm_sub_epi16(sum_p, q[4]);
-    sum_q = _mm_sub_epi16(sum_q, p[4]);
-
-    sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
-    work0 = _mm_add_epi16(sum_p6,
-                          _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
-    flat2_p[2] = _mm_add_epi16(sum_p, work0);
-    flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-    flat2_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
-
     sum_lp = _mm_sub_epi16(sum_lp, q[1]);
-    sum_lq = _mm_sub_epi16(sum_lq, p[1]);
+    sum_lq = _mm_sub_epi16(sum_lq, pq[1]);
 
     sum_p3 = _mm_add_epi16(sum_p3, pq[3]);
     work0 = _mm_add_epi16(sum_p3, pq[2]);
@@ -425,54 +411,88 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
     flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
     flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
 
-    sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
-    sum_p = _mm_sub_epi16(sum_p, q[3]);
-    sum_q = _mm_sub_epi16(sum_q, p[3]);
-
-    work0 = _mm_add_epi16(sum_p6,
-                          _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
-    flat2_p[3] = _mm_add_epi16(sum_p, work0);
-    flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-    flat2_pq[3] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
-
-    sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
-    sum_p = _mm_sub_epi16(sum_p, q[2]);
-    sum_q = _mm_sub_epi16(sum_q, p[2]);
-
-    work0 = _mm_add_epi16(sum_p6,
-                          _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
-    flat2_p[4] = _mm_add_epi16(sum_p, work0);
-    flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-    flat2_pq[4] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
-
-    sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
-    sum_p = _mm_sub_epi16(sum_p, q[1]);
-    sum_q = _mm_sub_epi16(sum_q, p[1]);
-
-    work0 = _mm_add_epi16(sum_p6,
-                          _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
-    flat2_p[5] = _mm_add_epi16(sum_p, work0);
-    flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-    flat2_pq[5] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
-  }
-
-  // highbd_filter8
-  pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
-  pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
-
-  for (i = 0; i < 3; i++) {
-    pq[i] = _mm_andnot_si128(flat, pq[i]);
-    flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
-    pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
-  }
-
-  // highbd_filter16
-  for (i = 5; i >= 0; i--) {
-    //  p[i] remains unchanged if !(flat2 && flat && mask)
-    pq[i] = _mm_andnot_si128(flat2, pq[i]);
-    flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
-    //  get values for when (flat2 && flat && mask)
-    pq[i] = _mm_or_si128(pq[i], flat2_pq[i]);  // full list of pq values
+    int flat2_mask =
+        (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
+    if (flat2_mask) {
+      flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0]));
+      flat2_q[0] = _mm_add_epi16(
+          sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0]));
+
+      flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
+      flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
+
+      flat2_pq[0] =
+          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
+      flat2_pq[1] =
+          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
+
+      sum_p = _mm_sub_epi16(sum_p, q[4]);
+      sum_q = _mm_sub_epi16(sum_q, pq[4]);
+
+      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+      work0 = _mm_add_epi16(sum_p6,
+                            _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
+      flat2_p[2] = _mm_add_epi16(sum_p, work0);
+      flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+      flat2_pq[2] =
+          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
+
+      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+      sum_p = _mm_sub_epi16(sum_p, q[3]);
+      sum_q = _mm_sub_epi16(sum_q, pq[3]);
+
+      work0 = _mm_add_epi16(sum_p6,
+                            _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
+      flat2_p[3] = _mm_add_epi16(sum_p, work0);
+      flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+      flat2_pq[3] =
+          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
+
+      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+      sum_p = _mm_sub_epi16(sum_p, q[2]);
+      sum_q = _mm_sub_epi16(sum_q, pq[2]);
+
+      work0 = _mm_add_epi16(sum_p6,
+                            _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
+      flat2_p[4] = _mm_add_epi16(sum_p, work0);
+      flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+      flat2_pq[4] =
+          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
+
+      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+      sum_p = _mm_sub_epi16(sum_p, q[1]);
+      sum_q = _mm_sub_epi16(sum_q, pq[1]);
+
+      work0 = _mm_add_epi16(sum_p6,
+                            _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
+      flat2_p[5] = _mm_add_epi16(sum_p, work0);
+      flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+      flat2_pq[5] =
+          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
+    }  // flat2
+       // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // highbd_filter8
+    pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
+    pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
+
+    for (i = 0; i < 3; i++) {
+      pq[i] = _mm_andnot_si128(flat, pq[i]);
+      flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
+      pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
+    }
+
+    // wide flat
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    if (flat2_mask) {
+      for (i = 0; i < 6; i++) {
+        pq[i] = _mm_andnot_si128(flat2, pq[i]);
+        flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
+        pq[i] = _mm_or_si128(pq[i], flat2_pq[i]);  // full list of pq values
+      }
+    }
+  } else {
+    pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
+    pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
   }
 }
 
@@ -500,6 +520,8 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
     const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1,
     const uint8_t *thr1, int bd) {
   __m128i blimit, limit, thresh, t80;
+  const __m128i zero = _mm_setzero_si128();
+
   get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh,
                  &t80);
   __m128i mask;
@@ -512,27 +534,22 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
   __m128i ps[2], qs[2];
   highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80);
   // flat and wide flat calculations
-  __m128i flat_p[3], flat_q[3];
-  __m128i flat2_p[6], flat2_q[6];
-  {
+
+  // if flat ==0 then flat2 is zero as well and we don't need any calc below
+  // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
+  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+    __m128i flat_p[3], flat_q[3];
+    __m128i flat2_p[6], flat2_q[6];
     const __m128i eight = _mm_set1_epi16(8);
     const __m128i four = _mm_set1_epi16(4);
-    __m128i sum_p = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
+    __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
     __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3]));
     __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1]));
-    sum_p = _mm_add_epi16(sum_p, sum_lp);
+    sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp);
     __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1]));
     sum_q = _mm_add_epi16(sum_q, sum_lq);
-    sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+    sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q));
     sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
-    flat2_p[0] = _mm_srli_epi16(
-        _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
-                                           _mm_add_epi16(p[1], q[0]))),
-        4);
-    flat2_q[0] = _mm_srli_epi16(
-        _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
-                                           _mm_add_epi16(p[0], q[1]))),
-        4);
     flat_p[0] =
         _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3);
     flat_q[0] =
@@ -541,117 +558,160 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
     __m128i sum_q6 = _mm_add_epi16(q[6], q[6]);
     __m128i sum_p3 = _mm_add_epi16(p[3], p[3]);
     __m128i sum_q3 = _mm_add_epi16(q[3], q[3]);
-    sum_q = _mm_sub_epi16(sum_p, p[5]);
-    sum_p = _mm_sub_epi16(sum_p, q[5]);
-    flat2_p[1] = _mm_srli_epi16(
-        _mm_add_epi16(
-            sum_p, _mm_add_epi16(
-                       sum_p6, _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
-        4);
-    flat2_q[1] = _mm_srli_epi16(
-        _mm_add_epi16(
-            sum_q, _mm_add_epi16(
-                       sum_q6, _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
-        4);
+
+    sum_q = _mm_sub_epi16(sum_p_0, p[5]);
+    __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]);
+
     sum_lq = _mm_sub_epi16(sum_lp, p[2]);
     sum_lp = _mm_sub_epi16(sum_lp, q[2]);
     flat_p[1] =
         _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3);
     flat_q[1] =
         _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3);
-    sum_p6 = _mm_add_epi16(sum_p6, p[6]);
-    sum_q6 = _mm_add_epi16(sum_q6, q[6]);
-    sum_p3 = _mm_add_epi16(sum_p3, p[3]);
-    sum_q3 = _mm_add_epi16(sum_q3, q[3]);
-    sum_p = _mm_sub_epi16(sum_p, q[4]);
-    sum_q = _mm_sub_epi16(sum_q, p[4]);
-    flat2_p[2] = _mm_srli_epi16(
-        _mm_add_epi16(
-            sum_p, _mm_add_epi16(
-                       sum_p6, _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
-        4);
-    flat2_q[2] = _mm_srli_epi16(
-        _mm_add_epi16(
-            sum_q, _mm_add_epi16(
-                       sum_q6, _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
-        4);
+
     sum_lp = _mm_sub_epi16(sum_lp, q[1]);
     sum_lq = _mm_sub_epi16(sum_lq, p[1]);
+    sum_p3 = _mm_add_epi16(sum_p3, p[3]);
+    sum_q3 = _mm_add_epi16(sum_q3, q[3]);
     flat_p[2] =
         _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3);
     flat_q[2] =
         _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3);
-    sum_p6 = _mm_add_epi16(sum_p6, p[6]);
-    sum_q6 = _mm_add_epi16(sum_q6, q[6]);
-    sum_p = _mm_sub_epi16(sum_p, q[3]);
-    sum_q = _mm_sub_epi16(sum_q, p[3]);
-    flat2_p[3] = _mm_srli_epi16(
-        _mm_add_epi16(
-            sum_p, _mm_add_epi16(
-                       sum_p6, _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
-        4);
-    flat2_q[3] = _mm_srli_epi16(
-        _mm_add_epi16(
-            sum_q, _mm_add_epi16(
-                       sum_q6, _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
-        4);
-    sum_p6 = _mm_add_epi16(sum_p6, p[6]);
-    sum_q6 = _mm_add_epi16(sum_q6, q[6]);
-    sum_p = _mm_sub_epi16(sum_p, q[2]);
-    sum_q = _mm_sub_epi16(sum_q, p[2]);
-    flat2_p[4] = _mm_srli_epi16(
-        _mm_add_epi16(
-            sum_p, _mm_add_epi16(
-                       sum_p6, _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
-        4);
-    flat2_q[4] = _mm_srli_epi16(
-        _mm_add_epi16(
-            sum_q, _mm_add_epi16(
-                       sum_q6, _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
-        4);
-    sum_p6 = _mm_add_epi16(sum_p6, p[6]);
-    sum_q6 = _mm_add_epi16(sum_q6, q[6]);
-    sum_p = _mm_sub_epi16(sum_p, q[1]);
-    sum_q = _mm_sub_epi16(sum_q, p[1]);
-    flat2_p[5] = _mm_srli_epi16(
-        _mm_add_epi16(
-            sum_p, _mm_add_epi16(
-                       sum_p6, _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
-        4);
-    flat2_q[5] = _mm_srli_epi16(
-        _mm_add_epi16(
-            sum_q, _mm_add_epi16(
-                       sum_q6, _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
-        4);
-  }
-  // highbd_filter8
-  p[2] = _mm_andnot_si128(flat, p[2]);
-  //  p2 remains unchanged if !(flat && mask)
-  flat_p[2] = _mm_and_si128(flat, flat_p[2]);
-  //  when (flat && mask)
-  p[2] = _mm_or_si128(p[2], flat_p[2]);  // full list of p2 values
-  q[2] = _mm_andnot_si128(flat, q[2]);
-  flat_q[2] = _mm_and_si128(flat, flat_q[2]);
-  q[2] = _mm_or_si128(q[2], flat_q[2]);  // full list of q2 values
-  int i;
-  for (i = 1; i >= 0; i--) {
-    ps[i] = _mm_andnot_si128(flat, ps[i]);
-    flat_p[i] = _mm_and_si128(flat, flat_p[i]);
-    p[i] = _mm_or_si128(ps[i], flat_p[i]);
-    qs[i] = _mm_andnot_si128(flat, qs[i]);
-    flat_q[i] = _mm_and_si128(flat, flat_q[i]);
-    q[i] = _mm_or_si128(qs[i], flat_q[i]);
-  }
-  // highbd_filter16
-  for (i = 5; i >= 0; i--) {
-    //  p[i] remains unchanged if !(flat2 && flat && mask)
-    p[i] = _mm_andnot_si128(flat2, p[i]);
-    flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
-    //  get values for when (flat2 && flat && mask)
-    p[i] = _mm_or_si128(p[i], flat2_p[i]);  // full list of p values
-    q[i] = _mm_andnot_si128(flat2, q[i]);
-    flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
-    q[i] = _mm_or_si128(q[i], flat2_q[i]);
+
+    int flat2_mask =
+        (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
+    if (flat2_mask) {
+      flat2_p[0] = _mm_srli_epi16(
+          _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
+                                               _mm_add_epi16(p[1], q[0]))),
+          4);
+      flat2_q[0] = _mm_srli_epi16(
+          _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
+                                               _mm_add_epi16(p[0], q[1]))),
+          4);
+
+      flat2_p[1] = _mm_srli_epi16(
+          _mm_add_epi16(
+              sum_p,
+              _mm_add_epi16(sum_p6,
+                            _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
+          4);
+      flat2_q[1] = _mm_srli_epi16(
+          _mm_add_epi16(
+              sum_q,
+              _mm_add_epi16(sum_q6,
+                            _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
+          4);
+      sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+      sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+      sum_p = _mm_sub_epi16(sum_p, q[4]);
+      sum_q = _mm_sub_epi16(sum_q, p[4]);
+      flat2_p[2] = _mm_srli_epi16(
+          _mm_add_epi16(
+              sum_p,
+              _mm_add_epi16(sum_p6,
+                            _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
+          4);
+      flat2_q[2] = _mm_srli_epi16(
+          _mm_add_epi16(
+              sum_q,
+              _mm_add_epi16(sum_q6,
+                            _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
+          4);
+      sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+      sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+      sum_p = _mm_sub_epi16(sum_p, q[3]);
+      sum_q = _mm_sub_epi16(sum_q, p[3]);
+      flat2_p[3] = _mm_srli_epi16(
+          _mm_add_epi16(
+              sum_p,
+              _mm_add_epi16(sum_p6,
+                            _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
+          4);
+      flat2_q[3] = _mm_srli_epi16(
+          _mm_add_epi16(
+              sum_q,
+              _mm_add_epi16(sum_q6,
+                            _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
+          4);
+      sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+      sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+      sum_p = _mm_sub_epi16(sum_p, q[2]);
+      sum_q = _mm_sub_epi16(sum_q, p[2]);
+      flat2_p[4] = _mm_srli_epi16(
+          _mm_add_epi16(
+              sum_p,
+              _mm_add_epi16(sum_p6,
+                            _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
+          4);
+      flat2_q[4] = _mm_srli_epi16(
+          _mm_add_epi16(
+              sum_q,
+              _mm_add_epi16(sum_q6,
+                            _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
+          4);
+      sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+      sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+      sum_p = _mm_sub_epi16(sum_p, q[1]);
+      sum_q = _mm_sub_epi16(sum_q, p[1]);
+      flat2_p[5] = _mm_srli_epi16(
+          _mm_add_epi16(
+              sum_p,
+              _mm_add_epi16(sum_p6,
+                            _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
+          4);
+      flat2_q[5] = _mm_srli_epi16(
+          _mm_add_epi16(
+              sum_q,
+              _mm_add_epi16(sum_q6,
+                            _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
+          4);
+    }
+    // highbd_filter8
+    int i;
+    for (i = 0; i < 2; i++) {
+      ps[i] = _mm_andnot_si128(flat, ps[i]);
+      flat_p[i] = _mm_and_si128(flat, flat_p[i]);
+      p[i] = _mm_or_si128(ps[i], flat_p[i]);
+      qs[i] = _mm_andnot_si128(flat, qs[i]);
+      flat_q[i] = _mm_and_si128(flat, flat_q[i]);
+      q[i] = _mm_or_si128(qs[i], flat_q[i]);
+    }
+    p[2] = _mm_andnot_si128(flat, p[2]);
+    //  p2 remains unchanged if !(flat && mask)
+    flat_p[2] = _mm_and_si128(flat, flat_p[2]);
+    //  when (flat && mask)
+    p[2] = _mm_or_si128(p[2], flat_p[2]);  // full list of p2 values
+    q[2] = _mm_andnot_si128(flat, q[2]);
+    flat_q[2] = _mm_and_si128(flat, flat_q[2]);
+    q[2] = _mm_or_si128(q[2], flat_q[2]);  // full list of q2 values
+
+    for (i = 0; i < 2; i++) {
+      ps[i] = _mm_andnot_si128(flat, ps[i]);
+      flat_p[i] = _mm_and_si128(flat, flat_p[i]);
+      p[i] = _mm_or_si128(ps[i], flat_p[i]);
+      qs[i] = _mm_andnot_si128(flat, qs[i]);
+      flat_q[i] = _mm_and_si128(flat, flat_q[i]);
+      q[i] = _mm_or_si128(qs[i], flat_q[i]);
+    }
+    // highbd_filter16
+    if (flat2_mask) {
+      for (i = 0; i < 6; i++) {
+        //  p[i] remains unchanged if !(flat2 && flat && mask)
+        p[i] = _mm_andnot_si128(flat2, p[i]);
+        flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
+        //  get values for when (flat2 && flat && mask)
+        p[i] = _mm_or_si128(p[i], flat2_p[i]);  // full list of p values
+        q[i] = _mm_andnot_si128(flat2, q[i]);
+        flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
+        q[i] = _mm_or_si128(q[i], flat2_q[i]);
+      }
+    }
+  } else {
+    p[0] = ps[0];
+    q[0] = qs[0];
+    p[1] = ps[1];
+    q[1] = qs[1];
   }
 }
 
@@ -696,6 +756,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
   highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
                                 &thresh, &hev, &mask);
 
+  // lp filter
+  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
+
   // flat_mask
   flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0);
   flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
@@ -707,53 +770,56 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
   // replicate for the further "merged variables" usage
   flat = _mm_unpacklo_epi64(flat, flat);
 
-  {
-    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+  // 5 tap filter
+  // need it only if flat !=0
+  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+    __m128i workp_a, workp_b, workp_c;
+    __m128i pq0x2_pq1, pq1_pq2;
 
     // op1
-    workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0),
-                            _mm_add_epi16(*p1, *p1));  // *p0 *2 + *p1 * 2
-    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
-                            *p2);  // *p2 + *p0 * 2 + *p1 * 2 + 4
+    pq0x2_pq1 =
+        _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]);  // p0 *2 + p1
+    pq1_pq2 = _mm_add_epi16(pq[1], pq[2]);                  // p1 + p2
+    workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
+                            pq1_pq2);  // p2 + p0 * 2 + p1 * 2 + 4
 
-    workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0);
-    workp_shft0 = _mm_add_epi16(
-        workp_a, workp_b);  // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4
+    workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0);
+    workp_b =
+        _mm_add_epi16(workp_a, workp_b);  // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
 
     // op0
-    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1);  // *q0 * 2 + *q1
-    workp_a =
-        _mm_add_epi16(workp_a,
-                      workp_b);  // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4
-
-    flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_a, workp_shft0), 3);
+    workp_c = _mm_srli_si128(pq0x2_pq1, 8);  // q0 * 2 + q1
+    workp_a = _mm_add_epi16(workp_a,
+                            workp_c);  // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+    workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
+    flat_p1p0 = _mm_srli_epi16(workp_b, 3);
 
     // oq0
-    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2),
-                            *p1);  // *p0 * 2 + *p1  + *q0 * 2 + *q1 + 4
-    workp_b = _mm_add_epi16(*q1, *q2);
-    workp_shft0 = _mm_add_epi16(
-        workp_a, workp_b);  // *p0 * 2 + *p1  + *q0 * 2 + *q1 * 2 + *q2 + 4
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]),
+                            pq[1]);  // p0 * 2 + p1  + q0 * 2 + q1 + 4
+    workp_b = _mm_srli_si128(pq1_pq2, 8);
+    workp_a = _mm_add_epi16(
+        workp_a, workp_b);  // p0 * 2 + p1  + q0 * 2 + q1 * 2 + q2 + 4
+    // workp_shft0 = _mm_srli_epi16(workp_a, 3);
 
     // oq1
-    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1),
-                            *p0);  // *p0   + *q0 * 2 + *q1 * 2 + *q2 + 4
+    workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]),
+                            pq[0]);  // p0   + q0 * 2 + q1 * 2 + q2 + 4
     workp_b = _mm_add_epi16(*q2, *q2);
-    workp_shft1 = _mm_add_epi16(
-        workp_a, workp_b);  // *p0  + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
+    workp_b =
+        _mm_add_epi16(workp_c, workp_b);  // p0  + q0 * 2 + q1 * 2 + q2 * 3 + 4
 
-    flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
-  }
-  // lp filter
-  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd);
+    workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
+    flat_q0q1 = _mm_srli_epi16(workp_a, 3);
 
-  qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
-  q1q0 = _mm_and_si128(flat, flat_q0q1);
-  *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+    q1q0 = _mm_and_si128(flat, flat_q0q1);
+    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
 
-  ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
-  p1p0 = _mm_and_si128(flat, flat_p1p0);
-  *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+    p1p0 = _mm_and_si128(flat, flat_p1p0);
+    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+  }
 }
 
 static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
@@ -797,6 +863,17 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
   mask = _mm_subs_epu16(mask, limit0);
   mask = _mm_cmpeq_epi16(mask, zero);
 
+  // lp filter
+  __m128i ps[2], qs[2], p[2], q[2];
+  {
+    p[0] = *p0;
+    p[1] = *p1;
+    q[0] = *q0;
+    q[1] = *q1;
+    // filter_mask and hev_mask
+    highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+  }
+
   // flat_mask
   flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0));
   flat = _mm_max_epi16(flat, work);
@@ -806,7 +883,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
   flat = _mm_cmpeq_epi16(flat, zero);
   flat = _mm_and_si128(flat, mask);  // flat & mask
 
-  {
+  // 5 tap filter
+  // need it only if flat !=0
+  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
     __m128i workp_a, workp_b, workp_shft0, workp_shft1;
 
     // op1
@@ -842,33 +921,28 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
     workp_shft1 = _mm_add_epi16(
         workp_a, workp_b);  // *p0  + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
     oq1 = _mm_srli_epi16(workp_shft1, 3);
-  }
-  // lp filter
-  __m128i ps[2], qs[2], p[2], q[2];
-  {
-    p[0] = *p0;
-    p[1] = *p1;
-    q[0] = *q0;
-    q[1] = *q1;
-    // filter_mask and hev_mask
-    highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
-  }
-
-  qs[0] = _mm_andnot_si128(flat, qs[0]);
-  oq0 = _mm_and_si128(flat, oq0);
-  *q0 = _mm_or_si128(qs[0], oq0);
 
-  qs[1] = _mm_andnot_si128(flat, qs[1]);
-  oq1 = _mm_and_si128(flat, oq1);
-  *q1 = _mm_or_si128(qs[1], oq1);
-
-  ps[0] = _mm_andnot_si128(flat, ps[0]);
-  op0 = _mm_and_si128(flat, op0);
-  *p0 = _mm_or_si128(ps[0], op0);
-
-  ps[1] = _mm_andnot_si128(flat, ps[1]);
-  op1 = _mm_and_si128(flat, op1);
-  *p1 = _mm_or_si128(ps[1], op1);
+    qs[0] = _mm_andnot_si128(flat, qs[0]);
+    oq0 = _mm_and_si128(flat, oq0);
+    *q0 = _mm_or_si128(qs[0], oq0);
+
+    qs[1] = _mm_andnot_si128(flat, qs[1]);
+    oq1 = _mm_and_si128(flat, oq1);
+    *q1 = _mm_or_si128(qs[1], oq1);
+
+    ps[0] = _mm_andnot_si128(flat, ps[0]);
+    op0 = _mm_and_si128(flat, op0);
+    *p0 = _mm_or_si128(ps[0], op0);
+
+    ps[1] = _mm_andnot_si128(flat, ps[1]);
+    op1 = _mm_and_si128(flat, op1);
+    *p1 = _mm_or_si128(ps[1], op1);
+  } else {
+    *q0 = qs[0];
+    *q1 = qs[1];
+    *p0 = ps[0];
+    *p1 = ps[1];
+  }
 }
 
 void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p,
@@ -926,7 +1000,7 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
   __m128i mask, hev, flat;
   __m128i pq[4];
   __m128i p1p0, q1q0, ps1ps0, qs1qs0;
-  __m128i work_a, op2, oq2, flat_p1p0, flat_q0q1;
+  __m128i work_a, opq2, flat_p1p0, flat_q0q1;
 
   pq[0] = _mm_unpacklo_epi64(*p0, *q0);
   pq[1] = _mm_unpacklo_epi64(*p1, *q1);
@@ -944,6 +1018,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
   highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
                                 &thresh, &hev, &mask);
 
+  // lp filter
+  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
+
   // flat_mask4
   flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0]));
   flat = _mm_max_epi16(abs_p1p0, flat);
@@ -956,15 +1033,15 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
   // replicate for the further "merged variables" usage
   flat = _mm_unpacklo_epi64(flat, flat);
 
-  {
-    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+    __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1;
     // Added before shift for rounding part of ROUND_POWER_OF_TWO
 
     // o*p2
     workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
     workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
-    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
-    op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+    workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
+    workp_c = _mm_add_epi16(workp_a, workp_c);
 
     // o*p1
     workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
@@ -992,27 +1069,22 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
     // oq2
     workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
     workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
-    oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  }
+    workp_a = _mm_add_epi16(workp_a, workp_b);
+    opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3);
 
-  // lp filter
-  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd);
-
-  qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
-  q1q0 = _mm_and_si128(flat, flat_q0q1);
-  *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-
-  ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
-  p1p0 = _mm_and_si128(flat, flat_p1p0);
-  *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+    q1q0 = _mm_and_si128(flat, flat_q0q1);
+    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
 
-  work_a = _mm_andnot_si128(flat, *q2);
-  *q2 = _mm_and_si128(flat, oq2);
-  *q2 = _mm_or_si128(work_a, *q2);
+    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+    p1p0 = _mm_and_si128(flat, flat_p1p0);
+    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
 
-  work_a = _mm_andnot_si128(flat, *p2);
-  *p2 = _mm_and_si128(flat, op2);
-  *p2 = _mm_or_si128(work_a, *p2);
+    work_a = _mm_andnot_si128(flat, pq[2]);
+    *p2 = _mm_and_si128(flat, opq2);
+    *p2 = _mm_or_si128(work_a, *p2);
+    *q2 = _mm_srli_si128(*p2, 8);
+  }
 }
 
 static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
@@ -1058,17 +1130,28 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
   mask = _mm_subs_epu16(mask, limit0);
   mask = _mm_cmpeq_epi16(mask, zero);
 
+  // lp filter
+  __m128i ps[2], qs[2], p[2], q[2];
+  {
+    p[0] = *p0;
+    p[1] = *p1;
+    q[0] = *q0;
+    q[1] = *q1;
+    // filter_mask and hev_mask
+    highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+  }
+
   flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0));
   flat = _mm_max_epi16(work1, flat);
   work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0));
   flat = _mm_max_epi16(work0, flat);
 
   flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
-
   flat = _mm_cmpeq_epi16(flat, zero);
   flat = _mm_and_si128(flat, mask);  // flat & mask
 
-  {
+  // filter8 need it only if flat !=0
+  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
     __m128i workp_a, workp_b;
     // Added before shift for rounding part of ROUND_POWER_OF_TWO
 
@@ -1101,42 +1184,36 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
     workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
     workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
     oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  }
 
-  // lp filter
-  __m128i ps[2], qs[2], p[2], q[2];
-  {
-    p[0] = *p0;
-    p[1] = *p1;
-    q[0] = *q0;
-    q[1] = *q1;
-    // filter_mask and hev_mask
-    highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+    qs[0] = _mm_andnot_si128(flat, qs[0]);
+    oq0 = _mm_and_si128(flat, oq0);
+    *q0 = _mm_or_si128(qs[0], oq0);
+
+    qs[1] = _mm_andnot_si128(flat, qs[1]);
+    oq1 = _mm_and_si128(flat, oq1);
+    *q1 = _mm_or_si128(qs[1], oq1);
+
+    ps[0] = _mm_andnot_si128(flat, ps[0]);
+    op0 = _mm_and_si128(flat, op0);
+    *p0 = _mm_or_si128(ps[0], op0);
+
+    ps[1] = _mm_andnot_si128(flat, ps[1]);
+    op1 = _mm_and_si128(flat, op1);
+    *p1 = _mm_or_si128(ps[1], op1);
+
+    work_a = _mm_andnot_si128(flat, *q2);
+    *q2 = _mm_and_si128(flat, oq2);
+    *q2 = _mm_or_si128(work_a, *q2);
+
+    work_a = _mm_andnot_si128(flat, *p2);
+    *p2 = _mm_and_si128(flat, op2);
+    *p2 = _mm_or_si128(work_a, *p2);
+  } else {
+    *q0 = qs[0];
+    *q1 = qs[1];
+    *p0 = ps[0];
+    *p1 = ps[1];
   }
-
-  qs[0] = _mm_andnot_si128(flat, qs[0]);
-  oq0 = _mm_and_si128(flat, oq0);
-  *q0 = _mm_or_si128(qs[0], oq0);
-
-  qs[1] = _mm_andnot_si128(flat, qs[1]);
-  oq1 = _mm_and_si128(flat, oq1);
-  *q1 = _mm_or_si128(qs[1], oq1);
-
-  ps[0] = _mm_andnot_si128(flat, ps[0]);
-  op0 = _mm_and_si128(flat, op0);
-  *p0 = _mm_or_si128(ps[0], op0);
-
-  ps[1] = _mm_andnot_si128(flat, ps[1]);
-  op1 = _mm_and_si128(flat, op1);
-  *p1 = _mm_or_si128(ps[1], op1);
-
-  work_a = _mm_andnot_si128(flat, *q2);
-  *q2 = _mm_and_si128(flat, oq2);
-  *q2 = _mm_or_si128(work_a, *q2);
-
-  work_a = _mm_andnot_si128(flat, *p2);
-  *p2 = _mm_and_si128(flat, op2);
-  *p2 = _mm_or_si128(work_a, *p2);
 }
 
 void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
index dea113a29..b9689202a 100644
--- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -110,7 +110,7 @@ static INLINE void quantize(const __m256i *qp, __m256i *c,
 }
 
 void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                                int skip_block, const int16_t *zbin_ptr,
+                                const int16_t *zbin_ptr,
                                 const int16_t *round_ptr,
                                 const int16_t *quant_ptr,
                                 const int16_t *quant_shift_ptr,
@@ -120,12 +120,23 @@ void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   (void)scan;
   const unsigned int step = 8;
 
-  if (LIKELY(!skip_block)) {
-    __m256i qp[5], coeff;
-    init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp);
-    coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+  __m256i qp[5], coeff;
+  init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp);
+  coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+
+  __m256i eob = _mm256_setzero_si256();
+  quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+  coeff_ptr += step;
+  qcoeff_ptr += step;
+  dqcoeff_ptr += step;
+  iscan += step;
+  n_coeffs -= step;
+
+  update_qp(qp);
 
-    __m256i eob = _mm256_setzero_si256();
+  while (n_coeffs > 0) {
+    coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
     quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
 
     coeff_ptr += step;
@@ -133,40 +144,17 @@ void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     dqcoeff_ptr += step;
     iscan += step;
     n_coeffs -= step;
-
-    update_qp(qp);
-
-    while (n_coeffs > 0) {
-      coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
-      quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
-
-      coeff_ptr += step;
-      qcoeff_ptr += step;
-      dqcoeff_ptr += step;
-      iscan += step;
-      n_coeffs -= step;
-    }
-    {
-      __m256i eob_s;
-      eob_s = _mm256_shuffle_epi32(eob, 0xe);
-      eob = _mm256_max_epi16(eob, eob_s);
-      eob_s = _mm256_shufflelo_epi16(eob, 0xe);
-      eob = _mm256_max_epi16(eob, eob_s);
-      eob_s = _mm256_shufflelo_epi16(eob, 1);
-      eob = _mm256_max_epi16(eob, eob_s);
-      const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
-                                              _mm256_extractf128_si256(eob, 1));
-      *eob_ptr = _mm_extract_epi16(final_eob, 0);
-    }
-  } else {
-    do {
-      const __m256i zero = _mm256_setzero_si256();
-      _mm256_storeu_si256((__m256i *)qcoeff_ptr, zero);
-      _mm256_storeu_si256((__m256i *)dqcoeff_ptr, zero);
-      qcoeff_ptr += step;
-      dqcoeff_ptr += step;
-      n_coeffs -= step;
-    } while (n_coeffs > 0);
-    *eob_ptr = 0;
+  }
+  {
+    __m256i eob_s;
+    eob_s = _mm256_shuffle_epi32(eob, 0xe);
+    eob = _mm256_max_epi16(eob, eob_s);
+    eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+    eob = _mm256_max_epi16(eob, eob_s);
+    eob_s = _mm256_shufflelo_epi16(eob, 1);
+    eob = _mm256_max_epi16(eob, eob_s);
+    const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
+                                            _mm256_extractf128_si256(eob, 1));
+    *eob_ptr = _mm_extract_epi16(final_eob, 0);
   }
 }
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
index 5570ca5b7..58e5f98e5 100644
--- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -16,7 +16,7 @@
 #include "aom_ports/mem.h"
 
 void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
-                                int skip_block, const int16_t *zbin_ptr,
+                                const int16_t *zbin_ptr,
                                 const int16_t *round_ptr,
                                 const int16_t *quant_ptr,
                                 const int16_t *quant_shift_ptr,
@@ -41,50 +41,48 @@ void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
   memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = ((int)count / 4) - 1; i >= 0; i--) {
-      __m128i coeffs, cmp1, cmp2;
-      int test;
-      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
-      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
-      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
-      cmp1 = _mm_and_si128(cmp1, cmp2);
-      test = _mm_movemask_epi8(cmp1);
-      if (test == 0xffff)
-        non_zero_regs--;
-      else
-        break;
-    }
+  // Pre-scan pass
+  for (i = ((int)count / 4) - 1; i >= 0; i--) {
+    __m128i coeffs, cmp1, cmp2;
+    int test;
+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+    cmp1 = _mm_and_si128(cmp1, cmp2);
+    test = _mm_movemask_epi8(cmp1);
+    if (test == 0xffff)
+      non_zero_regs--;
+    else
+      break;
+  }
 
-    // Quantization pass:
-    for (i = 0; i < non_zero_regs; i++) {
-      __m128i coeffs, coeffs_sign, tmp1, tmp2;
-      int test;
-      int abs_coeff[4];
-      int coeff_sign[4];
-
-      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
-      coeffs_sign = _mm_srai_epi32(coeffs, 31);
-      coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
-      tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
-      tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
-      tmp1 = _mm_or_si128(tmp1, tmp2);
-      test = _mm_movemask_epi8(tmp1);
-      _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
-      _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
-
-      for (j = 0; j < 4; j++) {
-        if (test & (1 << (4 * j))) {
-          int k = 4 * i + j;
-          const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
-          const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
-          const uint32_t abs_qcoeff =
-              (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
-          qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
-          dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
-          if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
-        }
+  // Quantization pass:
+  for (i = 0; i < non_zero_regs; i++) {
+    __m128i coeffs, coeffs_sign, tmp1, tmp2;
+    int test;
+    int abs_coeff[4];
+    int coeff_sign[4];
+
+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+    coeffs_sign = _mm_srai_epi32(coeffs, 31);
+    coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+    tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+    tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+    tmp1 = _mm_or_si128(tmp1, tmp2);
+    test = _mm_movemask_epi8(tmp1);
+    _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
+    _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
+
+    for (j = 0; j < 4; j++) {
+      if (test & (1 << (4 * j))) {
+        int k = 4 * i + j;
+        const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
+        const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
+        const uint32_t abs_qcoeff =
+            (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
+        qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
+        dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
+        if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
       }
     }
   }
@@ -92,8 +90,8 @@ void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
 }
 
 void aom_highbd_quantize_b_32x32_sse2(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan) {
@@ -116,38 +114,35 @@ void aom_highbd_quantize_b_32x32_sse2(
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = 0; i < n_coeffs / 4; i++) {
-      __m128i coeffs, cmp1, cmp2;
-      int test;
-      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
-      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
-      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
-      cmp1 = _mm_and_si128(cmp1, cmp2);
-      test = _mm_movemask_epi8(cmp1);
-      if (!(test & 0xf)) idx_arr[idx++] = i * 4;
-      if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
-      if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
-      if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
-    }
+  // Pre-scan pass
+  for (i = 0; i < n_coeffs / 4; i++) {
+    __m128i coeffs, cmp1, cmp2;
+    int test;
+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+    cmp1 = _mm_and_si128(cmp1, cmp2);
+    test = _mm_movemask_epi8(cmp1);
+    if (!(test & 0xf)) idx_arr[idx++] = i * 4;
+    if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
+    if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
+    if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
+  }
 
-    // Quantization pass: only process the coefficients selected in
-    // pre-scan pass. Note: idx can be zero.
-    for (i = 0; i < idx; i++) {
-      const int rc = idx_arr[i];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      const int64_t tmp1 =
-          abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
-      const uint32_t abs_qcoeff =
-          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
-      qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-      if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
-    }
+  // Quantization pass: only process the coefficients selected in
+  // pre-scan pass. Note: idx can be zero.
+  for (i = 0; i < idx; i++) {
+    const int rc = idx_arr[i];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+    const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+    const uint32_t abs_qcoeff =
+        (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+    qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+    if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
   }
   *eob_ptr = eob + 1;
 }
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
new file mode 100644
index 000000000..9b1b4c9de
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>  // AVX2
+
+#include "config/aom_dsp_rtcd.h"
+
+typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride,
+                                   const uint16_t *ref, int ref_stride,
+                                   uint32_t *sse, int *sum);
+
+void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
+                                const uint16_t *ref, int ref_stride,
+                                uint32_t *sse, int *sum) {
+  __m256i v_sum_d = _mm256_setzero_si256();
+  __m256i v_sse_d = _mm256_setzero_si256();
+  for (int i = 0; i < 8; i += 2) {
+    const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src);
+    const __m128i v_p_a1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
+    const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref);
+    const __m128i v_p_b1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride));
+    __m256i v_p_a = _mm256_castsi128_si256(v_p_a0);
+    __m256i v_p_b = _mm256_castsi128_si256(v_p_b0);
+    v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1);
+    v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1);
+    const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
+    const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
+    v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
+    v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
+    src += src_stride * 2;
+    ref += ref_stride * 2;
+  }
+  __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d));
+  __m256i v_sum01 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1));
+  __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01);
+  __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
+  __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
+  __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+  const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+  const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+  __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+  v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+  *sum = _mm_extract_epi32(v_d, 0);
+  *sse = _mm_extract_epi32(v_d, 1);
+}
+
+void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride,
+                                  const uint16_t *ref, int ref_stride,
+                                  uint32_t *sse, int *sum) {
+  __m256i v_sum_d = _mm256_setzero_si256();
+  __m256i v_sse_d = _mm256_setzero_si256();
+  const __m256i one = _mm256_set1_epi16(1);
+  for (int i = 0; i < 16; ++i) {
+    const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src);
+    const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref);
+    const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
+    const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
+    v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
+    v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
+    src += src_stride;
+    ref += ref_stride;
+  }
+  __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one);
+  __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
+  __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
+  __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+  const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+  const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+  __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+  v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+  *sum = _mm_extract_epi32(v_d, 0);
+  *sse = _mm_extract_epi32(v_d, 1);
+}
+
+static void highbd_10_variance_avx2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride, int w,
+                                    int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+  uint64_t sse_long = 0;
+  int32_t sum_long = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+             ref_stride, &sse0, &sum0);
+      sse_long += sse0;
+      sum_long += sum0;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+#define VAR_FN(w, h, block_size, shift)                                    \
+  uint32_t aom_highbd_10_variance##w##x##h##_avx2(                         \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
+      int ref_stride, uint32_t *sse) {                                     \
+    int sum;                                                               \
+    int64_t var;                                                           \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
+    highbd_10_variance_avx2(                                               \
+        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
+        aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
+    return (var >= 0) ? (uint32_t)var : 0;                                 \
+  }
+
+VAR_FN(128, 128, 16, 14);
+VAR_FN(128, 64, 16, 13);
+VAR_FN(64, 128, 16, 13);
+VAR_FN(64, 64, 16, 12);
+VAR_FN(64, 32, 16, 11);
+VAR_FN(32, 64, 16, 11);
+VAR_FN(32, 32, 16, 10);
+VAR_FN(32, 16, 16, 9);
+VAR_FN(16, 32, 16, 9);
+VAR_FN(16, 16, 16, 8);
+VAR_FN(16, 8, 8, 7);
+VAR_FN(8, 16, 8, 7);
+VAR_FN(8, 8, 8, 6);
+VAR_FN(16, 4, 16, 6);
+VAR_FN(8, 32, 8, 8);
+VAR_FN(32, 8, 8, 8);
+VAR_FN(16, 64, 16, 10);
+VAR_FN(64, 16, 16, 10);
+
+#undef VAR_FN
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
index 131c16aa9..47b052abc 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
@@ -179,6 +179,9 @@ HIGH_GET_VAR(8);
     return (var >= 0) ? (uint32_t)var : 0;                                 \
   }
 
+VAR_FN(128, 128, 16, 14);
+VAR_FN(128, 64, 16, 13);
+VAR_FN(64, 128, 16, 13);
 VAR_FN(64, 64, 16, 12);
 VAR_FN(64, 32, 16, 11);
 VAR_FN(32, 64, 16, 11);
@@ -590,10 +593,10 @@ FNS(sse2);
 void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
                                     const struct AV1Common *const cm,
                                     int mi_row, int mi_col, const MV *const mv,
-                                    uint16_t *comp_pred, int width, int height,
+                                    uint8_t *comp_pred8, int width, int height,
                                     int subpel_x_q3, int subpel_y_q3,
-                                    const uint8_t *ref8, int ref_stride,
-                                    int bd) {
+                                    const uint8_t *ref8, int ref_stride, int bd,
+                                    int subpel_search) {
   // expect xd == NULL only in tests
   if (xd != NULL) {
     const MB_MODE_INFO *mi = xd->mi[0];
@@ -606,8 +609,6 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
     if (is_scaled) {
       // Note: This is mostly a copy from the >=8X8 case in
       // build_inter_predictors() function, with some small tweaks.
-      uint8_t *comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred);
-
       // Some assumptions.
       const int plane = 0;
 
@@ -661,7 +662,7 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
       warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
 
       // Get convolve parameters.
-      ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd);
+      ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
       const InterpFilters filters =
           av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
 
@@ -677,10 +678,13 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
   }
 
   const InterpFilterParams *filter =
-      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+      (subpel_search == 1)
+          ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
+          : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
 
   if (!subpel_x_q3 && !subpel_y_q3) {
     uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
     if (width >= 8) {
       int i;
       assert(!(width & 7));
@@ -711,13 +715,13 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
   } else if (!subpel_y_q3) {
     const int16_t *const kernel =
         av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    aom_highbd_convolve8_horiz(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
-                               width, kernel, 16, NULL, -1, width, height, bd);
+    aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16,
+                               NULL, -1, width, height, bd);
   } else if (!subpel_x_q3) {
     const int16_t *const kernel =
         av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
-                              width, NULL, -1, kernel, 16, width, height, bd);
+    aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
+                              kernel, 16, width, height, bd);
   } else {
     DECLARE_ALIGNED(16, uint16_t,
                     temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
@@ -734,30 +738,29 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
                                intermediate_height, bd);
     aom_highbd_convolve8_vert(
         CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
-        MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
-        16, width, height, bd);
+        MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
+        bd);
   }
 }
 
 void aom_highbd_comp_avg_upsampled_pred_sse2(
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd) {
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  int n;
-  int i;
-  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
+    int ref_stride, int bd, int subpel_search) {
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
                             height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd);
+                            bd, subpel_search);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
   /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/
   assert(!(width * height & 7));
-  n = width * height >> 3;
-  for (i = 0; i < n; i++) {
-    __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred);
+  int n = width * height >> 3;
+  for (int i = 0; i < n; i++) {
+    __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16);
     __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
-    _mm_storeu_si128((__m128i *)comp_pred, _mm_avg_epu16(s0, p0));
-    comp_pred += 8;
+    _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0));
+    comp_pred16 += 8;
     pred += 8;
   }
 }
@@ -777,7 +780,7 @@ static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
   xx_storeu_128(result, shift);
 }
 
-void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred,
+void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8,
                                        const uint8_t *pred8, int width,
                                        int height, const uint8_t *ref8,
                                        int ref_stride,
@@ -792,6 +795,7 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred,
       _mm_set_epi16(round, round, round, round, round, round, round, round);
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
 
   if (width >= 8) {
     // Read 8 pixels one row at a time
@@ -830,15 +834,16 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred,
 
 void aom_highbd_jnt_comp_avg_upsampled_pred_sse2(
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param) {
+    int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
+    int subpel_search) {
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
   int n;
   int i;
-  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
                             height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd);
+                            bd, subpel_search);
   assert(!(width * height & 7));
   n = width * height >> 3;
 
@@ -850,13 +855,14 @@ void aom_highbd_jnt_comp_avg_upsampled_pred_sse2(
   const __m128i r =
       _mm_set_epi16(round, round, round, round, round, round, round, round);
 
+  uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
   for (i = 0; i < n; i++) {
-    __m128i p0 = xx_loadu_128(comp_pred);
+    __m128i p0 = xx_loadu_128(comp_pred16);
     __m128i p1 = xx_loadu_128(pred);
 
-    highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+    highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16);
 
-    comp_pred += 8;
+    comp_pred16 += 8;
     pred += 8;
   }
 }
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
index 6c247a91b..df5449a9d 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
@@ -168,8 +168,8 @@ uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
   aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
                                                 bilinear_filters_2t[yoffset]);
 
-  aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
-                           4);
+  aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
+                           CONVERT_TO_BYTEPTR(temp2), 4);
 
   return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride,
                                   sse);
@@ -188,8 +188,8 @@ uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
   aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
                                                 bilinear_filters_2t[yoffset]);
 
-  aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
-                           4);
+  aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
+                           CONVERT_TO_BYTEPTR(temp2), 4);
 
   return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
                                    dst_stride, sse);
@@ -208,8 +208,8 @@ uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
   aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
                                                 bilinear_filters_2t[yoffset]);
 
-  aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
-                           4);
+  aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
+                           CONVERT_TO_BYTEPTR(temp2), 4);
 
   return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
                                    dst_stride, sse);
diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
index eaf1f347b..f9a41a210 100644
--- a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
@@ -120,11 +120,11 @@ void aom_jnt_comp_avg_upsampled_pred_ssse3(
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
     const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const JNT_COMP_PARAMS *jcp_param) {
+    int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) {
   int n;
   int i;
   aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                     subpel_x_q3, subpel_y_q3, ref, ref_stride);
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
   /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
   assert(!(width * height & 15));
   n = width * height >> 4;
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
deleted file mode 100644
index 18862dd3e..000000000
--- a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
+++ /dev/null
@@ -1,916 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h> /* AVX2 */
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_ports/mem.h"
-
-void aom_lpf_horizontal_16_avx2(unsigned char *s, int p,
-                                const unsigned char *_blimit,
-                                const unsigned char *_limit,
-                                const unsigned char *_thresh) {
-  __m128i mask, hev, flat, flat2;
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i one = _mm_set1_epi8(1);
-  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
-  __m128i abs_p1p0;
-
-  const __m128i thresh =
-      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
-  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
-  const __m128i blimit =
-      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));
-
-  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
-  q4p4 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
-  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
-  q3p3 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
-  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
-  q2p2 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
-  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  q1p1 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
-  p1q1 = _mm_shuffle_epi32(q1p1, 78);
-  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0p0 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
-  p0q0 = _mm_shuffle_epi32(q0p0, 78);
-
-  {
-    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
-    abs_p1p0 =
-        _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1));
-    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
-    fe = _mm_set1_epi8(0xfe);
-    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
-    abs_p0q0 =
-        _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0));
-    abs_p1q1 =
-        _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(abs_p1p0, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-
-    work = _mm_max_epu8(
-        _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)),
-        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-  }
-
-  // lp filter
-  {
-    const __m128i t4 = _mm_set1_epi8(4);
-    const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i t1 = _mm_set1_epi16(0x1);
-    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
-    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
-    __m128i qs0 = _mm_xor_si128(p0q0, t80);
-    __m128i qs1 = _mm_xor_si128(p1q1, t80);
-    __m128i filt;
-    __m128i work_a;
-    __m128i filter1, filter2;
-    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
-    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
-
-    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
-    work_a = _mm_subs_epi8(qs0, qs0ps0);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    /* (aom_filter + 3 * (qs0 - ps0)) & mask */
-    filt = _mm_and_si128(filt, mask);
-
-    filter1 = _mm_adds_epi8(filt, t4);
-    filter2 = _mm_adds_epi8(filt, t3);
-
-    filter1 = _mm_unpacklo_epi8(zero, filter1);
-    filter1 = _mm_srai_epi16(filter1, 0xB);
-    filter2 = _mm_unpacklo_epi8(zero, filter2);
-    filter2 = _mm_srai_epi16(filter2, 0xB);
-
-    /* Filter1 >> 3 */
-    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
-    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
-
-    /* filt >> 1 */
-    filt = _mm_adds_epi16(filter1, t1);
-    filt = _mm_srai_epi16(filt, 1);
-    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
-                            filt);
-    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
-    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
-    // loopfilter done
-
-    {
-      __m128i work;
-      flat = _mm_max_epu8(
-          _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)),
-          _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3)));
-      flat = _mm_max_epu8(abs_p1p0, flat);
-      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
-      flat = _mm_subs_epu8(flat, one);
-      flat = _mm_cmpeq_epi8(flat, zero);
-      flat = _mm_and_si128(flat, mask);
-
-      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
-      q5p5 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));
-
-      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
-      q6p6 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
-
-      flat2 = _mm_max_epu8(
-          _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
-          _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));
-
-      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
-      q7p7 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
-
-      work = _mm_max_epu8(
-          _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
-          _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7)));
-
-      flat2 = _mm_max_epu8(work, flat2);
-      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
-      flat2 = _mm_subs_epu8(flat2, one);
-      flat2 = _mm_cmpeq_epi8(flat2, zero);
-      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
-    }
-
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // flat and wide flat calculations
-    {
-      const __m128i eight = _mm_set1_epi16(8);
-      const __m128i four = _mm_set1_epi16(4);
-      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
-      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
-      __m128i pixelFilter_p, pixelFilter_q;
-      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
-      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
-
-      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
-      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
-      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
-      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
-      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
-      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
-      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
-      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
-      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
-      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
-      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
-      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
-      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
-      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
-      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
-      q7_16 = _mm_unpackhi_epi8(q7p7, zero);
-
-      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
-                                    _mm_add_epi16(p4_16, p3_16));
-      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
-                                    _mm_add_epi16(q4_16, q3_16));
-
-      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
-      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
-
-      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
-      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
-      pixelFilter_p =
-          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
-      pixetFilter_p2p1p0 = _mm_add_epi16(
-          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
-      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
-
-      flat_q0p0 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(p7_16, p7_16);
-      sum_q7 = _mm_add_epi16(q7_16, q7_16);
-      sum_p3 = _mm_add_epi16(p3_16, p3_16);
-      sum_q3 = _mm_add_epi16(q3_16, q3_16);
-
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
-      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
-
-      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
-      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
-      flat_q1p1 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
-      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
-      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
-      sum_q3 = _mm_add_epi16(sum_q3, q3_16);
-
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
-      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
-
-      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
-      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
-
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
-      flat_q2p2 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
-      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
-      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
-      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
-      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
-      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
-      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
-      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
-      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
-    }
-    // wide flat
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-    flat = _mm_shuffle_epi32(flat, 68);
-    flat2 = _mm_shuffle_epi32(flat2, 68);
-
-    q2p2 = _mm_andnot_si128(flat, q2p2);
-    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
-    q2p2 = _mm_or_si128(q2p2, flat_q2p2);
-
-    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
-    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
-    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
-
-    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
-    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
-    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
-
-    q6p6 = _mm_andnot_si128(flat2, q6p6);
-    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
-    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
-    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
-    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
-
-    q5p5 = _mm_andnot_si128(flat2, q5p5);
-    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
-    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
-    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
-    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
-
-    q4p4 = _mm_andnot_si128(flat2, q4p4);
-    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
-    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
-    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
-    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
-
-    q3p3 = _mm_andnot_si128(flat2, q3p3);
-    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
-    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
-    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
-    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
-
-    q2p2 = _mm_andnot_si128(flat2, q2p2);
-    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
-    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
-    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
-    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
-
-    q1p1 = _mm_andnot_si128(flat2, q1p1);
-    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
-    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
-    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
-    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
-
-    q0p0 = _mm_andnot_si128(flat2, q0p0);
-    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
-    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
-    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
-    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
-  }
-}
-
-DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
-  0, 128, 1, 128, 2,  128, 3,  128, 4,  128, 5,  128, 6,  128, 7,  128,
-  8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
-};
-
-void aom_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
-                                     const unsigned char *_blimit,
-                                     const unsigned char *_limit,
-                                     const unsigned char *_thresh) {
-  __m128i mask, hev, flat, flat2;
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i one = _mm_set1_epi8(1);
-  __m128i p7, p6, p5;
-  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
-  __m128i q5, q6, q7;
-  __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4,
-      p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
-
-  const __m128i thresh =
-      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
-  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
-  const __m128i blimit =
-      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));
-
-  p256_4 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
-  p256_3 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
-  p256_2 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
-  p256_1 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
-  p256_0 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
-  q256_0 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
-  q256_1 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
-  q256_2 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
-  q256_3 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
-  q256_4 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p)));
-
-  p4 = _mm256_castsi256_si128(p256_4);
-  p3 = _mm256_castsi256_si128(p256_3);
-  p2 = _mm256_castsi256_si128(p256_2);
-  p1 = _mm256_castsi256_si128(p256_1);
-  p0 = _mm256_castsi256_si128(p256_0);
-  q0 = _mm256_castsi256_si128(q256_0);
-  q1 = _mm256_castsi256_si128(q256_1);
-  q2 = _mm256_castsi256_si128(q256_2);
-  q3 = _mm256_castsi256_si128(q256_3);
-  q4 = _mm256_castsi256_si128(q256_4);
-
-  {
-    const __m128i abs_p1p0 =
-        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
-    const __m128i abs_q1q0 =
-        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
-    const __m128i fe = _mm_set1_epi8(0xfe);
-    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
-    __m128i abs_p0q0 =
-        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
-    __m128i abs_p1q1 =
-        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
-    __m128i work;
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(flat, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-    work = _mm_max_epu8(
-        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
-        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
-    mask = _mm_max_epu8(work, mask);
-    work = _mm_max_epu8(
-        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
-        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_subs_epu8(mask, limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-  }
-
-  // lp filter
-  {
-    const __m128i t4 = _mm_set1_epi8(4);
-    const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i te0 = _mm_set1_epi8(0xe0);
-    const __m128i t1f = _mm_set1_epi8(0x1f);
-    const __m128i t1 = _mm_set1_epi8(0x1);
-    const __m128i t7f = _mm_set1_epi8(0x7f);
-
-    __m128i ps1 = _mm_xor_si128(p1, t80);
-    __m128i ps0 = _mm_xor_si128(p0, t80);
-    __m128i qs0 = _mm_xor_si128(q0, t80);
-    __m128i qs1 = _mm_xor_si128(q1, t80);
-    __m128i filt;
-    __m128i work_a;
-    __m128i filter1, filter2;
-    __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
-        flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5,
-        flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;
-
-    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
-    work_a = _mm_subs_epi8(qs0, ps0);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    /* (aom_filter + 3 * (qs0 - ps0)) & mask */
-    filt = _mm_and_si128(filt, mask);
-
-    filter1 = _mm_adds_epi8(filt, t4);
-    filter2 = _mm_adds_epi8(filt, t3);
-
-    /* Filter1 >> 3 */
-    work_a = _mm_cmpgt_epi8(zero, filter1);
-    filter1 = _mm_srli_epi16(filter1, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter1 = _mm_and_si128(filter1, t1f);
-    filter1 = _mm_or_si128(filter1, work_a);
-    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
-
-    /* Filter2 >> 3 */
-    work_a = _mm_cmpgt_epi8(zero, filter2);
-    filter2 = _mm_srli_epi16(filter2, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter2 = _mm_and_si128(filter2, t1f);
-    filter2 = _mm_or_si128(filter2, work_a);
-    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
-
-    /* filt >> 1 */
-    filt = _mm_adds_epi8(filter1, t1);
-    work_a = _mm_cmpgt_epi8(zero, filt);
-    filt = _mm_srli_epi16(filt, 1);
-    work_a = _mm_and_si128(work_a, t80);
-    filt = _mm_and_si128(filt, t7f);
-    filt = _mm_or_si128(filt, work_a);
-    filt = _mm_andnot_si128(hev, filt);
-    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
-    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
-    // loopfilter done
-
-    {
-      __m128i work;
-      work = _mm_max_epu8(
-          _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
-          _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
-      flat = _mm_max_epu8(work, flat);
-      work = _mm_max_epu8(
-          _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
-          _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
-      flat = _mm_max_epu8(work, flat);
-      work = _mm_max_epu8(
-          _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
-          _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
-      flat = _mm_subs_epu8(flat, one);
-      flat = _mm_cmpeq_epi8(flat, zero);
-      flat = _mm_and_si128(flat, mask);
-
-      p256_5 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
-      q256_5 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
-      p5 = _mm256_castsi256_si128(p256_5);
-      q5 = _mm256_castsi256_si128(q256_5);
-      flat2 = _mm_max_epu8(
-          _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
-          _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
-
-      flat2 = _mm_max_epu8(work, flat2);
-      p256_6 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
-      q256_6 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));
-      p6 = _mm256_castsi256_si128(p256_6);
-      q6 = _mm256_castsi256_si128(q256_6);
-      work = _mm_max_epu8(
-          _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
-          _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
-
-      flat2 = _mm_max_epu8(work, flat2);
-
-      p256_7 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s - 8 * p)));
-      q256_7 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s + 7 * p)));
-      p7 = _mm256_castsi256_si128(p256_7);
-      q7 = _mm256_castsi256_si128(q256_7);
-      work = _mm_max_epu8(
-          _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
-          _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
-
-      flat2 = _mm_max_epu8(work, flat2);
-      flat2 = _mm_subs_epu8(flat2, one);
-      flat2 = _mm_cmpeq_epi8(flat2, zero);
-      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
-    }
-
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // flat and wide flat calculations
-    {
-      const __m256i eight = _mm256_set1_epi16(8);
-      const __m256i four = _mm256_set1_epi16(4);
-      __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
-          pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
-
-      const __m256i filter =
-          _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
-      p256_7 = _mm256_shuffle_epi8(p256_7, filter);
-      p256_6 = _mm256_shuffle_epi8(p256_6, filter);
-      p256_5 = _mm256_shuffle_epi8(p256_5, filter);
-      p256_4 = _mm256_shuffle_epi8(p256_4, filter);
-      p256_3 = _mm256_shuffle_epi8(p256_3, filter);
-      p256_2 = _mm256_shuffle_epi8(p256_2, filter);
-      p256_1 = _mm256_shuffle_epi8(p256_1, filter);
-      p256_0 = _mm256_shuffle_epi8(p256_0, filter);
-      q256_0 = _mm256_shuffle_epi8(q256_0, filter);
-      q256_1 = _mm256_shuffle_epi8(q256_1, filter);
-      q256_2 = _mm256_shuffle_epi8(q256_2, filter);
-      q256_3 = _mm256_shuffle_epi8(q256_3, filter);
-      q256_4 = _mm256_shuffle_epi8(q256_4, filter);
-      q256_5 = _mm256_shuffle_epi8(q256_5, filter);
-      q256_6 = _mm256_shuffle_epi8(q256_6, filter);
-      q256_7 = _mm256_shuffle_epi8(q256_7, filter);
-
-      pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
-                                       _mm256_add_epi16(p256_4, p256_3));
-      pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
-                                       _mm256_add_epi16(q256_4, q256_3));
-
-      pixetFilter_p2p1p0 =
-          _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
-      pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
-
-      pixetFilter_q2q1q0 =
-          _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
-      pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
-
-      pixelFilter_p = _mm256_add_epi16(
-          eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
-
-      pixetFilter_p2p1p0 = _mm256_add_epi16(
-          four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
-
-      res_p = _mm256_srli_epi16(
-          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4);
-
-      flat2_p0 = _mm256_castsi256_si128(
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
-
-      res_q = _mm256_srli_epi16(
-          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4);
-
-      flat2_q0 = _mm256_castsi256_si128(
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
-
-      res_p =
-          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
-                                             _mm256_add_epi16(p256_3, p256_0)),
-                            3);
-
-      flat_p0 = _mm256_castsi256_si128(
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
-
-      res_q =
-          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
-                                             _mm256_add_epi16(q256_3, q256_0)),
-                            3);
-
-      flat_q0 = _mm256_castsi256_si128(
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
-
-      sum_p7 = _mm256_add_epi16(p256_7, p256_7);
-
-      sum_q7 = _mm256_add_epi16(q256_7, q256_7);
-
-      sum_p3 = _mm256_add_epi16(p256_3, p256_3);
-
-      sum_q3 = _mm256_add_epi16(q256_3, q256_3);
-
-      pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
-
-      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
-
-      res_p = _mm256_srli_epi16(
-          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4);
-
-      flat2_p1 = _mm256_castsi256_si128(
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
-
-      res_q = _mm256_srli_epi16(
-          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4);
-
-      flat2_q1 = _mm256_castsi256_si128(
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
-
-      pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
-
-      pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);
-
-      res_p =
-          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
-                                             _mm256_add_epi16(sum_p3, p256_1)),
-                            3);
-
-      flat_p1 = _mm256_castsi256_si128(
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
-
-      res_q =
-          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
-                                             _mm256_add_epi16(sum_q3, q256_1)),
-                            3);
-
-      flat_q1 = _mm256_castsi256_si128(
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
-
-      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
-
-      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
-
-      sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
-
-      sum_q3 = _mm256_add_epi16(sum_q3, q256_3);
-
-      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
-
-      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
-
-      res_p = _mm256_srli_epi16(
-          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4);
-
-      flat2_p2 = _mm256_castsi256_si128(
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
-
-      res_q = _mm256_srli_epi16(
-          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4);
-
-      flat2_q2 = _mm256_castsi256_si128(
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
-
-      pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
-
-      pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);
-
-      res_p =
-          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
-                                             _mm256_add_epi16(sum_p3, p256_2)),
-                            3);
-
-      flat_p2 = _mm256_castsi256_si128(
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
-
-      res_q =
-          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
-                                             _mm256_add_epi16(sum_q3, q256_2)),
-                            3);
-
-      flat_q2 = _mm256_castsi256_si128(
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
-
-      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
-
-      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
-
-      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
-
-      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
-
-      res_p = _mm256_srli_epi16(
-          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4);
-
-      flat2_p3 = _mm256_castsi256_si128(
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
-
-      res_q = _mm256_srli_epi16(
-          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4);
-
-      flat2_q3 = _mm256_castsi256_si128(
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
-
-      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
-
-      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
-
-      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
-
-      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
-
-      res_p = _mm256_srli_epi16(
-          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4);
-
-      flat2_p4 = _mm256_castsi256_si128(
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
-
-      res_q = _mm256_srli_epi16(
-          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4);
-
-      flat2_q4 = _mm256_castsi256_si128(
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
-
-      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
-
-      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
-
-      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
-
-      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
-
-      res_p = _mm256_srli_epi16(
-          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4);
-
-      flat2_p5 = _mm256_castsi256_si128(
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
-
-      res_q = _mm256_srli_epi16(
-          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4);
-
-      flat2_q5 = _mm256_castsi256_si128(
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
-
-      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
-
-      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
-
-      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
-
-      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
-
-      res_p = _mm256_srli_epi16(
-          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4);
-
-      flat2_p6 = _mm256_castsi256_si128(
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
-
-      res_q = _mm256_srli_epi16(
-          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4);
-
-      flat2_q6 = _mm256_castsi256_si128(
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
-    }
-
-    // wide flat
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-    p2 = _mm_andnot_si128(flat, p2);
-    flat_p2 = _mm_and_si128(flat, flat_p2);
-    p2 = _mm_or_si128(flat_p2, p2);
-
-    p1 = _mm_andnot_si128(flat, ps1);
-    flat_p1 = _mm_and_si128(flat, flat_p1);
-    p1 = _mm_or_si128(flat_p1, p1);
-
-    p0 = _mm_andnot_si128(flat, ps0);
-    flat_p0 = _mm_and_si128(flat, flat_p0);
-    p0 = _mm_or_si128(flat_p0, p0);
-
-    q0 = _mm_andnot_si128(flat, qs0);
-    flat_q0 = _mm_and_si128(flat, flat_q0);
-    q0 = _mm_or_si128(flat_q0, q0);
-
-    q1 = _mm_andnot_si128(flat, qs1);
-    flat_q1 = _mm_and_si128(flat, flat_q1);
-    q1 = _mm_or_si128(flat_q1, q1);
-
-    q2 = _mm_andnot_si128(flat, q2);
-    flat_q2 = _mm_and_si128(flat, flat_q2);
-    q2 = _mm_or_si128(flat_q2, q2);
-
-    p6 = _mm_andnot_si128(flat2, p6);
-    flat2_p6 = _mm_and_si128(flat2, flat2_p6);
-    p6 = _mm_or_si128(flat2_p6, p6);
-    _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
-
-    p5 = _mm_andnot_si128(flat2, p5);
-    flat2_p5 = _mm_and_si128(flat2, flat2_p5);
-    p5 = _mm_or_si128(flat2_p5, p5);
-    _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
-
-    p4 = _mm_andnot_si128(flat2, p4);
-    flat2_p4 = _mm_and_si128(flat2, flat2_p4);
-    p4 = _mm_or_si128(flat2_p4, p4);
-    _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
-
-    p3 = _mm_andnot_si128(flat2, p3);
-    flat2_p3 = _mm_and_si128(flat2, flat2_p3);
-    p3 = _mm_or_si128(flat2_p3, p3);
-    _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
-
-    p2 = _mm_andnot_si128(flat2, p2);
-    flat2_p2 = _mm_and_si128(flat2, flat2_p2);
-    p2 = _mm_or_si128(flat2_p2, p2);
-    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
-
-    p1 = _mm_andnot_si128(flat2, p1);
-    flat2_p1 = _mm_and_si128(flat2, flat2_p1);
-    p1 = _mm_or_si128(flat2_p1, p1);
-    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-
-    p0 = _mm_andnot_si128(flat2, p0);
-    flat2_p0 = _mm_and_si128(flat2, flat2_p0);
-    p0 = _mm_or_si128(flat2_p0, p0);
-    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-
-    q0 = _mm_andnot_si128(flat2, q0);
-    flat2_q0 = _mm_and_si128(flat2, flat2_q0);
-    q0 = _mm_or_si128(flat2_q0, q0);
-    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
-
-    q1 = _mm_andnot_si128(flat2, q1);
-    flat2_q1 = _mm_and_si128(flat2, flat2_q1);
-    q1 = _mm_or_si128(flat2_q1, q1);
-    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-
-    q2 = _mm_andnot_si128(flat2, q2);
-    flat2_q2 = _mm_and_si128(flat2, flat2_q2);
-    q2 = _mm_or_si128(flat2_q2, q2);
-    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
-
-    q3 = _mm_andnot_si128(flat2, q3);
-    flat2_q3 = _mm_and_si128(flat2, flat2_q3);
-    q3 = _mm_or_si128(flat2_q3, q3);
-    _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
-
-    q4 = _mm_andnot_si128(flat2, q4);
-    flat2_q4 = _mm_and_si128(flat2, flat2_q4);
-    q4 = _mm_or_si128(flat2_q4, q4);
-    _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
-
-    q5 = _mm_andnot_si128(flat2, q5);
-    flat2_q5 = _mm_and_si128(flat2, flat2_q5);
-    q5 = _mm_or_si128(flat2_q5, q5);
-    _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
-
-    q6 = _mm_andnot_si128(flat2, q6);
-    flat2_q6 = _mm_and_si128(flat2, flat2_q6);
-    q6 = _mm_or_si128(flat2_q6, q6);
-    _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
-  }
-  _mm256_zeroupper();
-}
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
index f1eac233b..9d88b5e49 100644
--- a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
+++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
@@ -249,6 +249,63 @@ static INLINE void transpose16x8_8x16_sse2(
   *d7 = _mm_unpackhi_epi64(w7, w15);
 }
 
+// this function treats its input as 2 parallel 8x4 matrices, transposes each of
+// them  independently while flipping the second matrix horizontaly  Used for 14
+// taps filter pq pairs inverse
+static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1,
+                                            __m128i *x2, __m128i *x3,
+                                            __m128i *x4, __m128i *x5,
+                                            __m128i *x6, __m128i *x7,
+                                            __m128i *pq0, __m128i *pq1,
+                                            __m128i *pq2, __m128i *pq3) {
+  __m128i w10, w11, w12, w13;
+  __m128i w0, w1, w2, w3, w4, w5;
+  __m128i d0, d1, d2, d3;
+
+  w0 = _mm_unpacklo_epi8(
+      *x0, *x1);  // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  w1 = _mm_unpacklo_epi8(
+      *x2, *x3);  // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+  w2 = _mm_unpacklo_epi8(
+      *x4, *x5);  // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+  w3 = _mm_unpacklo_epi8(
+      *x6, *x7);  // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+  w4 = _mm_unpacklo_epi16(
+      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  w5 = _mm_unpacklo_epi16(
+      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+  d0 = _mm_unpacklo_epi32(
+      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  d2 = _mm_unpackhi_epi32(
+      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+
+  w10 = _mm_unpacklo_epi8(
+      *x7, *x6);  // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13
+  w11 = _mm_unpacklo_epi8(
+      *x5, *x4);  // q  xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33
+  w12 = _mm_unpacklo_epi8(
+      *x3, *x2);  // q  xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53
+  w13 = _mm_unpacklo_epi8(
+      *x1, *x0);  // q  xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73
+
+  w4 = _mm_unpackhi_epi16(
+      w10, w11);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  w5 = _mm_unpackhi_epi16(
+      w12, w13);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+  d1 = _mm_unpacklo_epi32(
+      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  d3 = _mm_unpackhi_epi32(
+      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+
+  *pq0 = _mm_unpacklo_epi64(d0, d1);  // pq
+  *pq1 = _mm_unpackhi_epi64(d0, d1);  // pq
+  *pq2 = _mm_unpacklo_epi64(d2, d3);  // pq
+  *pq3 = _mm_unpackhi_epi64(d2, d3);  // pq
+}
+
 static INLINE void transpose8x16_16x8_sse2(
     __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
     __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
@@ -300,9 +357,120 @@ static INLINE void transpose8x16_16x8_sse2(
   *d14d15 = _mm_unpackhi_epi64(w7, w15);
 }
 
+// this function treats its input as 2 parallel 8x4 matrices, transposes each of
+// them to 4x8  independently while flipping the second matrix horizontaly. Used
+// for 14 taps pq pairs creation
+static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+                                        __m128i *x3, __m128i *q0p0,
+                                        __m128i *q1p1, __m128i *q2p2,
+                                        __m128i *q3p3, __m128i *q4p4,
+                                        __m128i *q5p5, __m128i *q6p6,
+                                        __m128i *q7p7) {
+  __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3;
+  w0 = _mm_unpacklo_epi8(
+      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  w1 = _mm_unpacklo_epi8(
+      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+  w2 = _mm_unpackhi_epi8(
+      *x0, *x1);  // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115
+  w3 = _mm_unpackhi_epi8(
+      *x2, *x3);  // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315
+
+  ww0 = _mm_unpacklo_epi16(
+      w0, w1);  // 00 10 20 30 01 11 21 31        02 12 22 32 03 13 23 33
+  ww1 = _mm_unpackhi_epi16(
+      w0, w1);  // 04 14 24 34 05 15 25 35        06 16 26 36 07 17 27 37
+  ww2 = _mm_unpacklo_epi16(
+      w2, w3);  // 08 18 28 38 09 19 29 39       010 110 210 310 011 111 211 311
+  ww3 = _mm_unpackhi_epi16(
+      w2,
+      w3);  // 012 112 212 312 013 113 213 313  014 114 214 314 015 115 215 315
+
+  *q7p7 = _mm_unpacklo_epi32(
+      ww0,
+      _mm_srli_si128(
+          ww3, 12));  // 00 10 20 30  015 115 215 315  xx xx xx xx xx xx xx xx
+  *q6p6 = _mm_unpackhi_epi32(
+      _mm_slli_si128(ww0, 4),
+      ww3);  // 01 11 21 31  014 114 214 314  xx xx xx xxxx xx xx xx
+  *q5p5 = _mm_unpackhi_epi32(
+      ww0,
+      _mm_slli_si128(
+          ww3, 4));  // 02 12 22 32  013 113 213 313  xx xx xx x xx xx xx xxx
+  *q4p4 = _mm_unpacklo_epi32(
+      _mm_srli_si128(ww0, 12),
+      ww3);  // 03 13 23 33  012 112 212 312 xx xx xx xx xx xx xx xx
+  *q3p3 = _mm_unpacklo_epi32(
+      ww1,
+      _mm_srli_si128(
+          ww2, 12));  // 04 14 24 34  011 111 211 311 xx xx xx xx xx xx xx xx
+  *q2p2 = _mm_unpackhi_epi32(
+      _mm_slli_si128(ww1, 4),
+      ww2);  // 05 15 25 35   010 110 210 310 xx xx xx xx xx xx xx xx
+  *q1p1 = _mm_unpackhi_epi32(
+      ww1,
+      _mm_slli_si128(
+          ww2, 4));  // 06 16 26 36   09 19 29 39     xx xx xx xx xx xx xx xx
+  *q0p0 = _mm_unpacklo_epi32(
+      _mm_srli_si128(ww1, 12),
+      ww2);  // 07 17 27 37  08 18 28 38     xx xx xx xx xx xx xx xx
+}
+
 static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0,
                                           __m128i *hev, __m128i *mask,
                                           __m128i *qs1qs0, __m128i *ps1ps0) {
+  __m128i filter, filter2filter1, work;
+  __m128i ps1ps0_work, qs1qs0_work;
+  __m128i hev1;
+  const __m128i t3t4 =
+      _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4);
+  const __m128i t80 = _mm_set1_epi8(0x80);
+  const __m128i ff = _mm_cmpeq_epi8(t80, t80);
+
+  ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
+  qs1qs0_work = _mm_xor_si128(*q1q0, t80);
+
+  /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
+  work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
+  filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev);
+  /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
+  filter = _mm_subs_epi8(filter, work);
+  filter = _mm_subs_epi8(filter, work);
+  filter = _mm_subs_epi8(filter, work);  /* + 3 * (qs0 - ps0) */
+  filter = _mm_and_si128(filter, *mask); /* & mask */
+  filter = _mm_unpacklo_epi32(filter, filter);
+
+  /* filter1 = signed_char_clamp(filter + 4) >> 3; */
+  /* filter2 = signed_char_clamp(filter + 3) >> 3; */
+  filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
+  filter2filter1 =
+      _mm_unpacklo_epi8(filter2filter1, filter2filter1);  // goto 16 bit
+  filter2filter1 = _mm_srai_epi16(filter2filter1, 11);    /* >> 3 */
+  filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1);
+
+  /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
+  filter = _mm_subs_epi8(filter2filter1, ff);  /* + 1 */
+  filter = _mm_unpacklo_epi8(filter, filter);  // goto 16 bit
+  filter = _mm_srai_epi16(filter, 9);          /* round */
+  filter = _mm_packs_epi16(filter, filter);
+  filter = _mm_andnot_si128(*hev, filter);
+  filter = _mm_unpacklo_epi32(filter, filter);
+
+  filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter);
+  hev1 = _mm_srli_si128(filter2filter1, 8);
+  /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
+  qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
+  /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
+  ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
+
+  *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
+  *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
+}
+
+static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0,
+                                               __m128i *hev, __m128i *mask,
+                                               __m128i *qs1qs0,
+                                               __m128i *ps1ps0) {
   const __m128i t3t4 =
       _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);
   const __m128i t80 = _mm_set1_epi8(0x80);
@@ -356,6 +524,49 @@ static AOM_FORCE_INLINE void lpf_internal_4_sse2(
     __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
   __m128i q1p1, q0p0, p1p0, q1q0;
   __m128i abs_p0q0, abs_p1q1;
+  __m128i mask, flat, hev;
+  const __m128i zero = _mm_setzero_si128();
+
+  q1p1 = _mm_unpacklo_epi32(*p1, *q1);
+  q0p0 = _mm_unpacklo_epi32(*p0, *q0);
+
+  p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);
+  q1q0 = _mm_srli_si128(p1p0, 8);
+
+  /* (abs(q1 - q0), abs(p1 - p0) */
+  flat = abs_diff(q1p1, q0p0);
+  /* abs(p1 - q1), abs(p0 - q0) */
+  __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
+
+  /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
+  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+  hev = _mm_unpacklo_epi8(flat, zero);
+
+  hev = _mm_cmpgt_epi16(hev, *thresh);
+  hev = _mm_packs_epi16(hev, hev);
+  hev = _mm_unpacklo_epi32(hev, hev);
+
+  abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
+  abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4);           /* abs(p1 - q1) */
+  abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1);
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
+  abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
+  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
+
+  mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
+  mask = _mm_unpacklo_epi32(mask, flat);
+  mask = _mm_subs_epu8(mask, *limit);
+  mask = _mm_cmpeq_epi8(mask, zero);
+  mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4));
+
+  filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+}
+
+static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2(
+    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
+    __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
+  __m128i q1p1, q0p0, p1p0, q1q0;
+  __m128i abs_p0q0, abs_p1q1;
   __m128i mask, hev;
   const __m128i zero = _mm_setzero_si128();
 
@@ -390,14 +601,14 @@ static AOM_FORCE_INLINE void lpf_internal_4_sse2(
   mask = _mm_cmpeq_epi8(mask, zero);
   mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));
 
-  filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+  filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
 }
 
 void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
                                const uint8_t *_blimit, const uint8_t *_limit,
                                const uint8_t *_thresh) {
   const __m128i zero = _mm_setzero_si128();
-  __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+  __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
                                      _mm_loadl_epi64((const __m128i *)_limit));
   __m128i thresh =
       _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
@@ -413,9 +624,9 @@ void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
   lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0);
 
   xx_storel_32(s - 1 * p, ps1ps0);
-  xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 8));
+  xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4));
   xx_storel_32(s + 0 * p, qs1qs0);
-  xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 8));
+  xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4));
 }
 
 void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
@@ -425,7 +636,7 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
   __m128i p1, p0, q0, q1;
 
   const __m128i zero = _mm_setzero_si128();
-  __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+  __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
                                      _mm_loadl_epi64((const __m128i *)_limit));
   __m128i thresh =
       _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
@@ -442,8 +653,8 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
   lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0);
 
   // Transpose 8x4 to 4x8
-  p1 = _mm_srli_si128(p1p0, 8);
-  q1 = _mm_srli_si128(q1q0, 8);
+  p1 = _mm_srli_si128(p1p0, 4);
+  q1 = _mm_srli_si128(q1q0, 4);
 
   transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
 
@@ -455,10 +666,10 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
 
 static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) {
   xx_storel_32(s - (num + 1) * p, x);
-  xx_storel_32(s + num * p, _mm_srli_si128(x, 8));
+  xx_storel_32(s + num * p, _mm_srli_si128(x, 4));
 }
 
-static AOM_FORCE_INLINE void lpf_internal_14_sse2(
+static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2(
     __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
     __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
     __m128i *thresh) {
@@ -503,38 +714,31 @@ static AOM_FORCE_INLINE void lpf_internal_14_sse2(
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
     mask = _mm_subs_epu8(mask, *limit);
     mask = _mm_cmpeq_epi8(mask, zero);
-    // replicate for the further "merged variables" usage
-    mask = _mm_unpacklo_epi64(mask, mask);
   }
 
   // lp filter - the same for 6, 8 and 14 versions
-  filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+  filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
   qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0);
   qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0);
   // loopfilter done
 
   __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
   __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
-  {
-    __m128i work;
-    flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
-    flat = _mm_max_epu8(abs_p1p0, flat);
-    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
-    flat = _mm_subs_epu8(flat, one);
-    flat = _mm_cmpeq_epi8(flat, zero);
-    flat = _mm_and_si128(flat, mask);
 
-    flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
-    work = abs_diff(*q6p6, *q0p0);
-    flat2 = _mm_max_epu8(work, flat2);
-    flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
-    flat2 = _mm_subs_epu8(flat2, one);
-    flat2 = _mm_cmpeq_epi8(flat2, zero);
-    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
-  }
-  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  // flat and wide flat calculations
-  {
+  __m128i work;
+  flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
+  flat = _mm_max_epu8(abs_p1p0, flat);
+  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+  flat = _mm_subs_epu8(flat, one);
+  flat = _mm_cmpeq_epi8(flat, zero);
+  flat = _mm_and_si128(flat, mask);
+
+  // if flat ==0 then flat2 is zero as well and we don't need any calc below
+  // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
+  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // flat and wide flat calculations
+
     const __m128i eight = _mm_set1_epi16(8);
     const __m128i four = _mm_set1_epi16(4);
     __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
@@ -619,137 +823,413 @@ static AOM_FORCE_INLINE void lpf_internal_14_sse2(
         _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
     flat_q1p1 = _mm_packus_epi16(res_p, res_q);
 
-    sum_p6 = _mm_add_epi16(sum_p6, p6_16);
-    sum_q6 = _mm_add_epi16(sum_q6, q6_16);
-    sum_p3 = _mm_add_epi16(sum_p3, p3_16);
-    sum_q3 = _mm_add_epi16(sum_q3, q3_16);
-
-    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
-    pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
-
-    res_p = _mm_srli_epi16(
-        _mm_add_epi16(
-            pixelFilter_p,
-            _mm_add_epi16(sum_p6,
-                          _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))),
-        4);
-    res_q = _mm_srli_epi16(
-        _mm_add_epi16(
-            pixelFilter_q,
-            _mm_add_epi16(sum_q6,
-                          _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))),
-        4);
-    flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
-
     pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
     pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
 
+    sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+    sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
     res_p = _mm_srli_epi16(
         _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
     res_q = _mm_srli_epi16(
         _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
     flat_q2p2 = _mm_packus_epi16(res_p, res_q);
 
-    sum_p6 = _mm_add_epi16(sum_p6, p6_16);
-    sum_q6 = _mm_add_epi16(sum_q6, q6_16);
-
-    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
-    pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
-
-    res_p = _mm_srli_epi16(
-        _mm_add_epi16(
-            pixelFilter_p,
-            _mm_add_epi16(sum_p6,
-                          _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))),
-        4);
-    res_q = _mm_srli_epi16(
-        _mm_add_epi16(
-            pixelFilter_q,
-            _mm_add_epi16(sum_q6,
-                          _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))),
-        4);
-    flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
-
-    sum_p6 = _mm_add_epi16(sum_p6, p6_16);
-    sum_q6 = _mm_add_epi16(sum_q6, q6_16);
-
-    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
-    pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
-
-    res_p = _mm_srli_epi16(
-        _mm_add_epi16(
-            pixelFilter_p,
-            _mm_add_epi16(sum_p6,
-                          _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))),
-        4);
-    res_q = _mm_srli_epi16(
-        _mm_add_epi16(
-            pixelFilter_q,
-            _mm_add_epi16(sum_q6,
-                          _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))),
-        4);
-    flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
-
-    sum_p6 = _mm_add_epi16(sum_p6, p6_16);
-    sum_q6 = _mm_add_epi16(sum_q6, q6_16);
-    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
-    pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+    // work with flat2
+    flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
+    work = abs_diff(*q6p6, *q0p0);
+    flat2 = _mm_max_epu8(work, flat2);
+    flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+    flat2 = _mm_subs_epu8(flat2, one);
+    flat2 = _mm_cmpeq_epi8(flat2, zero);
+    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
 
-    res_p = _mm_srli_epi16(
-        _mm_add_epi16(
-            pixelFilter_p,
-            _mm_add_epi16(sum_p6,
-                          _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))),
-        4);
-    res_q = _mm_srli_epi16(
-        _mm_add_epi16(
-            pixelFilter_q,
-            _mm_add_epi16(sum_q6,
-                          _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))),
-        4);
-    flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+    // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    flat = _mm_unpacklo_epi64(flat, flat);
+    *q2p2 = _mm_andnot_si128(flat, *q2p2);
+    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+    *q2p2 = _mm_or_si128(*q2p2, flat_q2p2);
+
+    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+    *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+    *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+
+      sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+      sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+      res_p = _mm_srli_epi16(
+          _mm_add_epi16(
+              pixelFilter_p,
+              _mm_add_epi16(sum_p6,
+                            _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))),
+          4);
+      res_q = _mm_srli_epi16(
+          _mm_add_epi16(
+              pixelFilter_q,
+              _mm_add_epi16(sum_q6,
+                            _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))),
+          4);
+      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+      sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+      sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+
+      res_p = _mm_srli_epi16(
+          _mm_add_epi16(
+              pixelFilter_p,
+              _mm_add_epi16(sum_p6,
+                            _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))),
+          4);
+      res_q = _mm_srli_epi16(
+          _mm_add_epi16(
+              pixelFilter_q,
+              _mm_add_epi16(sum_q6,
+                            _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))),
+          4);
+      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+      sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+      sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+
+      res_p = _mm_srli_epi16(
+          _mm_add_epi16(
+              pixelFilter_p,
+              _mm_add_epi16(sum_p6,
+                            _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))),
+          4);
+      res_q = _mm_srli_epi16(
+          _mm_add_epi16(
+              pixelFilter_q,
+              _mm_add_epi16(sum_q6,
+                            _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))),
+          4);
+      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+      sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+      sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+
+      res_p = _mm_srli_epi16(
+          _mm_add_epi16(
+              pixelFilter_p,
+              _mm_add_epi16(sum_p6,
+                            _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))),
+          4);
+      res_q = _mm_srli_epi16(
+          _mm_add_epi16(
+              pixelFilter_q,
+              _mm_add_epi16(sum_q6,
+                            _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))),
+          4);
+      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+      // wide flat
+      // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+      flat2 = _mm_unpacklo_epi64(flat2, flat2);
+
+      *q5p5 = _mm_andnot_si128(flat2, *q5p5);
+      flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+      *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5);
+
+      *q4p4 = _mm_andnot_si128(flat2, *q4p4);
+      flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+      *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4);
+
+      *q3p3 = _mm_andnot_si128(flat2, *q3p3);
+      flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+      *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3);
+
+      *q2p2 = _mm_andnot_si128(flat2, *q2p2);
+      flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+      *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2);
+
+      *q1p1 = _mm_andnot_si128(flat2, *q1p1);
+      flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+      *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1);
+
+      *q0p0 = _mm_andnot_si128(flat2, *q0p0);
+      flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+      *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0);
+    }
+  } else {
+    *q0p0 = qs0ps0;
+    *q1p1 = qs1ps1;
   }
-  // wide flat
-  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+}
 
-  flat = _mm_shuffle_epi32(flat, 68);
-  flat2 = _mm_shuffle_epi32(flat2, 68);
+static AOM_FORCE_INLINE void lpf_internal_14_sse2(
+    __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
+    __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
+    __m128i *thresh) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi8(1);
+  __m128i mask, hev, flat, flat2;
+  __m128i flat2_pq[6], flat_pq[3];
+  __m128i qs0ps0, qs1ps1;
+  __m128i p1p0, q1q0, qs1qs0, ps1ps0;
+  __m128i abs_p1p0;
 
-  *q2p2 = _mm_andnot_si128(flat, *q2p2);
-  flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
-  *q2p2 = _mm_or_si128(*q2p2, flat_q2p2);
+  p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1);
+  q1q0 = _mm_srli_si128(p1p0, 8);
 
-  qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
-  flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
-  *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+  __m128i fe, ff, work;
+  {
+    __m128i abs_p1q1, abs_p0q0, abs_q1q0;
+    abs_p1p0 = abs_diff(*q1p1, *q0p0);
+    abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
+    fe = _mm_set1_epi8(0xfe);
+    ff = _mm_cmpeq_epi8(fe, fe);
+    abs_p0q0 = abs_diff(p1p0, q1q0);
+    abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
 
-  qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
-  flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
-  *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
 
-  *q5p5 = _mm_andnot_si128(flat2, *q5p5);
-  flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
-  *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5);
+    hev = _mm_subs_epu8(flat, *thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    // replicate for the further "merged variables" usage
+    hev = _mm_unpacklo_epi32(hev, hev);
 
-  *q4p4 = _mm_andnot_si128(flat2, *q4p4);
-  flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
-  *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4);
+    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+    mask = _mm_unpacklo_epi32(mask, zero);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(abs_p1p0, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
 
-  *q3p3 = _mm_andnot_si128(flat2, *q3p3);
-  flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
-  *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3);
+    work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
+    mask = _mm_subs_epu8(mask, *limit);
+    mask = _mm_cmpeq_epi8(mask, zero);
+  }
 
-  *q2p2 = _mm_andnot_si128(flat2, *q2p2);
-  flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
-  *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2);
+  // lp filter - the same for 6, 8 and 14 versions
+  filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+  qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0);
+  qs1ps1 = _mm_srli_si128(qs0ps0, 8);
+  // loopfilter done
 
-  *q1p1 = _mm_andnot_si128(flat2, *q1p1);
-  flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
-  *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1);
+  flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
+  flat = _mm_max_epu8(abs_p1p0, flat);
+  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+  flat = _mm_subs_epu8(flat, one);
+  flat = _mm_cmpeq_epi8(flat, zero);
+  flat = _mm_and_si128(flat, mask);
+  flat = _mm_unpacklo_epi32(flat, flat);
+  flat = _mm_unpacklo_epi64(flat, flat);
+
+  // if flat ==0 then flat2 is zero as well and we don't need any calc below
+  // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
+  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // flat and wide flat calculations
+    __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+    __m128i pq_16[7];
+    const __m128i eight = _mm_set1_epi16(8);
+    const __m128i four = _mm_set1_epi16(4);
+    __m128i sum_p6;
+    __m128i sum_p3;
+
+    pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero);
+    pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero);
+    pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero);
+    pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero);
+    pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero);
+    pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero);
+    pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero);
+    q0_16 = _mm_srli_si128(pq_16[0], 8);
+    q1_16 = _mm_srli_si128(pq_16[1], 8);
+    q2_16 = _mm_srli_si128(pq_16[2], 8);
+    q3_16 = _mm_srli_si128(pq_16[3], 8);
+    q4_16 = _mm_srli_si128(pq_16[4], 8);
+    q5_16 = _mm_srli_si128(pq_16[5], 8);
+
+    __m128i flat_p[3], flat_q[3];
+    __m128i flat2_p[6], flat2_q[6];
+
+    __m128i work0, work0_0, work0_1, sum_p_0;
+    __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3]));
+    __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1]));
+    sum_p = _mm_add_epi16(sum_p, sum_lp);
+
+    __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
+    __m128i sum_q = _mm_srli_si128(sum_p, 8);
+
+    sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+    sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
+
+    flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0]));
+    flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16));
+
+    sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]);
+    sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]);
+
+    sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]);
+    sum_p = _mm_sub_epi16(sum_p_0, q5_16);
+
+    work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]);
+    work0_1 = _mm_add_epi16(
+        sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0])));
+
+    sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]);
+    sum_lp = _mm_sub_epi16(sum_lp, q2_16);
+
+    work0 = _mm_add_epi16(sum_p3, pq_16[1]);
+    flat_p[1] = _mm_add_epi16(sum_lp, work0);
+    flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+
+    flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
+    flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
+    flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]);
+    flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]);
+
+    sum_lp = _mm_sub_epi16(sum_lp, q1_16);
+    sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]);
+
+    sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]);
+    work0 = _mm_add_epi16(sum_p3, pq_16[2]);
+
+    flat_p[2] = _mm_add_epi16(sum_lp, work0);
+    flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+    flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
+    flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]);
+
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
 
-  *q0p0 = _mm_andnot_si128(flat2, *q0p0);
-  flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
-  *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0);
+    work = abs_diff(*q6p6, *q0p0);
+    flat2 = _mm_max_epu8(work, flat2);
+    flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4));
+    flat2 = _mm_subs_epu8(flat2, one);
+    flat2 = _mm_cmpeq_epi8(flat2, zero);
+    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+    flat2 = _mm_unpacklo_epi32(flat2, flat2);
+
+    // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+    flat_pq[0] = _mm_and_si128(flat, flat_pq[0]);
+    *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]);
+
+    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+    flat_pq[1] = _mm_and_si128(flat, flat_pq[1]);
+    *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]);
+
+    *q2p2 = _mm_andnot_si128(flat, *q2p2);
+    flat_pq[2] = _mm_and_si128(flat, flat_pq[2]);
+    *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]);
+
+    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
+      flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16));
+      flat2_q[0] = _mm_add_epi16(
+          sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0]));
+
+      flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
+      flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
+
+      flat2_pq[0] =
+          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
+      flat2_pq[1] =
+          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
+      flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]);
+      flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]);
+
+      sum_p = _mm_sub_epi16(sum_p, q4_16);
+      sum_q = _mm_sub_epi16(sum_q, pq_16[4]);
+
+      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+      work0 = _mm_add_epi16(
+          sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1])));
+      flat2_p[2] = _mm_add_epi16(sum_p, work0);
+      flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+      flat2_pq[2] =
+          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
+      flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]);
+
+      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+      sum_p = _mm_sub_epi16(sum_p, q3_16);
+      sum_q = _mm_sub_epi16(sum_q, pq_16[3]);
+
+      work0 = _mm_add_epi16(
+          sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2])));
+      flat2_p[3] = _mm_add_epi16(sum_p, work0);
+      flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+      flat2_pq[3] =
+          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
+      flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]);
+
+      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+      sum_p = _mm_sub_epi16(sum_p, q2_16);
+      sum_q = _mm_sub_epi16(sum_q, pq_16[2]);
+
+      work0 = _mm_add_epi16(
+          sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3])));
+      flat2_p[4] = _mm_add_epi16(sum_p, work0);
+      flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+      flat2_pq[4] =
+          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
+      flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]);
+
+      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+      sum_p = _mm_sub_epi16(sum_p, q1_16);
+      sum_q = _mm_sub_epi16(sum_q, pq_16[1]);
+
+      work0 = _mm_add_epi16(
+          sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4])));
+      flat2_p[5] = _mm_add_epi16(sum_p, work0);
+      flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+      flat2_pq[5] =
+          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
+      flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]);
+
+      // wide flat
+      // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+      *q0p0 = _mm_andnot_si128(flat2, *q0p0);
+      flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]);
+      *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]);
+
+      *q1p1 = _mm_andnot_si128(flat2, *q1p1);
+      flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]);
+      *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]);
+
+      *q2p2 = _mm_andnot_si128(flat2, *q2p2);
+      flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]);
+      *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]);
+
+      *q3p3 = _mm_andnot_si128(flat2, *q3p3);
+      flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]);
+      *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]);
+
+      *q4p4 = _mm_andnot_si128(flat2, *q4p4);
+      flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]);
+      *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]);
+
+      *q5p5 = _mm_andnot_si128(flat2, *q5p5);
+      flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]);
+      *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]);
+    }
+  } else {
+    *q0p0 = qs0ps0;
+    *q1p1 = qs1ps1;
+  }
 }
 
 void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,
@@ -761,22 +1241,22 @@ void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,
   __m128i limit = _mm_load_si128((const __m128i *)_limit);
   __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
 
-  q4p4 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 5 * p)),
+  q4p4 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 5 * p)),
                             _mm_cvtsi32_si128(*(int *)(s + 4 * p)));
-  q3p3 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 4 * p)),
+  q3p3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 4 * p)),
                             _mm_cvtsi32_si128(*(int *)(s + 3 * p)));
-  q2p2 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 3 * p)),
+  q2p2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 3 * p)),
                             _mm_cvtsi32_si128(*(int *)(s + 2 * p)));
-  q1p1 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 2 * p)),
+  q1p1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 2 * p)),
                             _mm_cvtsi32_si128(*(int *)(s + 1 * p)));
 
-  q0p0 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 1 * p)),
+  q0p0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 1 * p)),
                             _mm_cvtsi32_si128(*(int *)(s - 0 * p)));
 
-  q5p5 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 6 * p)),
+  q5p5 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 6 * p)),
                             _mm_cvtsi32_si128(*(int *)(s + 5 * p)));
 
-  q6p6 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 7 * p)),
+  q6p6 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 7 * p)),
                             _mm_cvtsi32_si128(*(int *)(s + 6 * p)));
 
   lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
@@ -790,7 +1270,7 @@ void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,
   store_buffer_horz_8(q5p5, p, 5, s);
 }
 
-static AOM_FORCE_INLINE void lpf_internal_6_sse2(
+static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2(
     __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
     __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
     __m128i *thresh) {
@@ -810,6 +1290,7 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2(
   const __m128i one = _mm_set1_epi8(1);
   const __m128i fe = _mm_set1_epi8(0xfe);
   const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+
   {
     // filter_mask and hev_mask
     __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
@@ -847,8 +1328,9 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2(
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
     mask = _mm_subs_epu8(mask, *limit);
     mask = _mm_cmpeq_epi8(mask, zero);
-    // replicate for the further "merged variables" usage
-    mask = _mm_unpacklo_epi64(mask, mask);
+
+    // lp filter - the same for 6, 8 and 14 versions
+    filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
 
     // flat_mask
     flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
@@ -861,9 +1343,9 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2(
   }
 
   // 5 tap filter
-  {
+  // need it only if flat !=0
+  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
     const __m128i four = _mm_set1_epi16(4);
-
     __m128i workp_a, workp_b, workp_shft0, workp_shft1;
     p2_16 = _mm_unpacklo_epi8(*p2, zero);
     p1_16 = _mm_unpacklo_epi8(*p1, zero);
@@ -906,18 +1388,149 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2(
                                  3);  // p0  + q0 * 2 + q1 * 2 + q2 * 3 + 4
 
     flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
+
+    qs1qs0 = _mm_andnot_si128(flat, *q1q0);
+    *q1q0 = _mm_and_si128(flat, flat_q0q1);
+    *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
+
+    ps1ps0 = _mm_andnot_si128(flat, *p1p0);
+    *p1p0 = _mm_and_si128(flat, flat_p1p0);
+    *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
   }
+}
 
-  // lp filter - the same for 6, 8 and 14 versions
-  filter4_sse2(p1p0, q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+static AOM_FORCE_INLINE void lpf_internal_6_sse2(
+    __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
+    __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
+    __m128i *thresh) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i mask, hev, flat;
+  __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
+  __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16;
+  __m128i ps1ps0, qs1qs0;
+
+  q2p2 = _mm_unpacklo_epi32(*p2, *q2);
+  q1p1 = _mm_unpacklo_epi32(*p1, *q1);
+  q0p0 = _mm_unpacklo_epi32(*p0, *q0);
+
+  *p1p0 = _mm_unpacklo_epi32(*p0, *p1);
+  *q1q0 = _mm_unpacklo_epi32(*q0, *q1);
+
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i fe = _mm_set1_epi8(0xfe);
+  const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+  {
+    // filter_mask and hev_mask
+    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+    abs_p1p0 = abs_diff(q1p1, q0p0);
+    abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
+
+    abs_p0q0 = abs_diff(*p1p0, *q1q0);
+    abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
+
+    // considering sse doesn't have unsigned elements comparison the idea is
+    // to find at least one case when X > limit, it means the corresponding
+    // mask bit is set.
+    // to achieve that we find global max value of all inputs of abs(x-y) or
+    // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
+    // otherwise - not
+
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+    hev = _mm_subs_epu8(flat, *thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    // replicate for the further "merged variables" usage
+    hev = _mm_unpacklo_epi32(hev, hev);
 
-  qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
-  *q1q0 = _mm_and_si128(flat, flat_q0q1);
-  *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
+    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+    mask = _mm_unpacklo_epi32(mask, zero);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(abs_p1p0, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
 
-  ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
-  *p1p0 = _mm_and_si128(flat, flat_p1p0);
-  *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
+    work = abs_diff(q2p2, q1p1);
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
+    mask = _mm_subs_epu8(mask, *limit);
+    mask = _mm_cmpeq_epi8(mask, zero);
+
+    // lp filter - the same for 6, 8 and 14 versions
+    filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
+
+    // flat_mask
+    flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
+    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+    // replicate for the further "merged variables" usage
+    flat = _mm_unpacklo_epi32(flat, flat);
+    flat = _mm_unpacklo_epi64(flat, flat);
+  }
+
+  // 5 tap filter
+  // need it only if flat !=0
+  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+    const __m128i four = _mm_set1_epi16(4);
+    __m128i workp_a, workp_b, workp_c;
+    __m128i pq0x2_pq1, pq1_pq2;
+    pq2_16 = _mm_unpacklo_epi8(q2p2, zero);
+    pq1_16 = _mm_unpacklo_epi8(q1p1, zero);
+    pq0_16 = _mm_unpacklo_epi8(q0p0, zero);
+    q0_16 = _mm_srli_si128(pq0_16, 8);
+    q2_16 = _mm_srli_si128(pq2_16, 8);
+
+    // op1
+    pq0x2_pq1 =
+        _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16);  // p0 *2 + p1
+    pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16);                   // p1 + p2
+    workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
+                            pq1_pq2);  // p2 + p0 * 2 + p1 * 2 + 4
+
+    workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16);
+    workp_b =
+        _mm_add_epi16(workp_a, workp_b);  // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
+
+    // op0
+    workp_c = _mm_srli_si128(pq0x2_pq1, 8);  // q0 * 2 + q1
+    workp_a = _mm_add_epi16(workp_a,
+                            workp_c);  // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+    workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
+    workp_b = _mm_srli_epi16(workp_b, 3);
+
+    flat_p1p0 = _mm_packus_epi16(workp_b, workp_b);
+
+    // oq0
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16),
+                            pq1_16);  // p0 * 2 + p1  + q0 * 2 + q1 + 4
+    workp_b = _mm_srli_si128(pq1_pq2, 8);
+    workp_a = _mm_add_epi16(
+        workp_a, workp_b);  // p0 * 2 + p1  + q0 * 2 + q1 * 2 + q2 + 4
+    // workp_shft0 = _mm_srli_epi16(workp_a, 3);
+
+    // oq1
+    workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16),
+                            pq0_16);  // p0   + q0 * 2 + q1 * 2 + q2 + 4
+    workp_b = _mm_add_epi16(q2_16, q2_16);
+    workp_b =
+        _mm_add_epi16(workp_c, workp_b);  // p0  + q0 * 2 + q1 * 2 + q2 * 3 + 4
+
+    workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
+    workp_a = _mm_srli_epi16(workp_a, 3);
+
+    flat_q0q1 = _mm_packus_epi16(workp_a, workp_a);
+
+    qs1qs0 = _mm_andnot_si128(flat, *q1q0);
+    *q1q0 = _mm_and_si128(flat, flat_q0q1);
+    *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
+
+    ps1ps0 = _mm_andnot_si128(flat, *p1p0);
+    *p1p0 = _mm_and_si128(flat, flat_p1p0);
+    *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
+  }
 }
 
 void aom_lpf_horizontal_6_sse2(unsigned char *s, int p,
@@ -941,9 +1554,9 @@ void aom_lpf_horizontal_6_sse2(unsigned char *s, int p,
                       &limit, &thresh);
 
   xx_storel_32(s - 1 * p, p1p0);
-  xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8));
+  xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
   xx_storel_32(s + 0 * p, q1q0);
-  xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8));
+  xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
 }
 
 void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
@@ -970,8 +1583,8 @@ void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
   q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
   q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
 
-  lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
-                      &limit, &thresh);
+  lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
+                           &limit, &thresh);
 
   _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
   _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
@@ -982,15 +1595,168 @@ void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
 static AOM_FORCE_INLINE void lpf_internal_8_sse2(
     __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
     __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
-    __m128i *p2_out, __m128i *q2_out, __m128i *blimit, __m128i *limit,
-    __m128i *thresh) {
+    __m128i *blimit, __m128i *limit, __m128i *thresh) {
   const __m128i zero = _mm_setzero_si128();
   __m128i mask, hev, flat;
   __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
       flat_p1p0, flat_q0q1;
   __m128i q2p2, q1p1, q0p0;
   __m128i q1q0, p1p0, ps1ps0, qs1qs0;
-  __m128i work_a, op2, oq2;
+  __m128i work_pq, opq2, pq2;
+
+  q3p3 = _mm_unpacklo_epi32(*p3, *q3);
+  q2p2 = _mm_unpacklo_epi32(*p2, *q2);
+  q1p1 = _mm_unpacklo_epi32(*p1, *q1);
+  q0p0 = _mm_unpacklo_epi32(*p0, *q0);
+
+  p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);  // p1p0 q1q0
+  q1q0 = _mm_srli_si128(p1p0, 8);
+
+  // filter_mask and hev_mask
+
+  // considering sse doesn't have unsigned elements comparison the idea is to
+  // find at least one case when X > limit, it means the corresponding  mask
+  // bit is set.
+  // to achieve that we find global max value of all inputs of abs(x-y) or
+  // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
+  // otherwise - not
+
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i fe = _mm_set1_epi8(0xfe);
+  const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+
+  abs_p1p0 = abs_diff(q1p1, q0p0);
+  abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
+
+  abs_p0q0 = abs_diff(p1p0, q1q0);
+  abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
+
+  flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+  hev = _mm_subs_epu8(flat, *thresh);
+  hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+  // replicate for the further "merged variables" usage
+  hev = _mm_unpacklo_epi32(hev, hev);
+
+  abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+  abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+  mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+  mask = _mm_unpacklo_epi32(mask, zero);
+  mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+  mask = _mm_max_epu8(abs_p1p0, mask);
+  // mask |= (abs(p1 - p0) > limit) * -1;
+  // mask |= (abs(q1 - q0) > limit) * -1;
+
+  work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+
+  mask = _mm_max_epu8(work, mask);
+  mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
+  mask = _mm_subs_epu8(mask, *limit);
+  mask = _mm_cmpeq_epi8(mask, zero);
+
+  // lp filter - the same for 6, 8 and 14 versions
+  filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+
+  // flat_mask4
+  flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
+  flat = _mm_max_epu8(abs_p1p0, flat);
+
+  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+  flat = _mm_subs_epu8(flat, one);
+  flat = _mm_cmpeq_epi8(flat, zero);
+  flat = _mm_and_si128(flat, mask);
+  // replicate for the further "merged variables" usage
+  flat = _mm_unpacklo_epi32(flat, flat);
+  flat = _mm_unpacklo_epi64(flat, flat);
+
+  // filter8 need it only if flat !=0
+  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+    const __m128i four = _mm_set1_epi16(4);
+    __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2;
+    p2_16 = _mm_unpacklo_epi8(*p2, zero);
+    p1_16 = _mm_unpacklo_epi8(*p1, zero);
+    p0_16 = _mm_unpacklo_epi8(*p0, zero);
+    q0_16 = _mm_unpacklo_epi8(*q0, zero);
+    q1_16 = _mm_unpacklo_epi8(*q1, zero);
+    q2_16 = _mm_unpacklo_epi8(*q2, zero);
+    p3_16 = _mm_unpacklo_epi8(*p3, zero);
+    q3_16 = _mm_unpacklo_epi8(*q3, zero);
+
+    // op2
+    workp_a =
+        _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
+    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
+    workp_shft2 = _mm_add_epi16(workp_a, workp_b);
+
+    // op1
+    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
+    workp_c = _mm_add_epi16(workp_a, workp_b);
+    // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // op0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
+    workp_d = _mm_add_epi16(workp_a, workp_b);
+    // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    workp_c = _mm_unpacklo_epi64(workp_d, workp_c);
+    workp_c = _mm_srli_epi16(workp_c, 3);
+    flat_p1p0 = _mm_packus_epi16(workp_c, workp_c);
+
+    // oq0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
+    // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+    workp_c = _mm_add_epi16(workp_a, workp_b);
+
+    // oq1
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
+    workp_d = _mm_add_epi16(workp_a, workp_b);
+    // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    workp_c = _mm_unpacklo_epi64(workp_c, workp_d);
+    workp_c = _mm_srli_epi16(workp_c, 3);
+    flat_q0q1 = _mm_packus_epi16(workp_c, workp_c);
+
+    // oq2
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
+    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+
+    workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1);
+    workp_c = _mm_srli_epi16(workp_c, 3);
+
+    opq2 = _mm_packus_epi16(workp_c, workp_c);
+
+    work_pq = _mm_andnot_si128(flat, q2p2);
+    pq2 = _mm_and_si128(flat, opq2);
+    *p2 = _mm_or_si128(work_pq, pq2);
+    *q2 = _mm_srli_si128(*p2, 4);
+
+    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+    q1q0 = _mm_and_si128(flat, flat_q0q1);
+    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+
+    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+    p1p0 = _mm_and_si128(flat, flat_p1p0);
+    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+  }
+}
+
+static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2(
+    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+    __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+    __m128i *blimit, __m128i *limit, __m128i *thresh) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i mask, hev, flat;
+  __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
+      flat_p1p0, flat_q0q1;
+  __m128i q2p2, q1p1, q0p0;
+  __m128i q1q0, p1p0, ps1ps0, qs1qs0;
+  __m128i work_pq, opq2, pq2;
 
   q3p3 = _mm_unpacklo_epi64(*p3, *q3);
   q2p2 = _mm_unpacklo_epi64(*p2, *q2);
@@ -1043,11 +1809,11 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2(
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
     mask = _mm_subs_epu8(mask, *limit);
     mask = _mm_cmpeq_epi8(mask, zero);
-    // replicate for the further "merged variables" usage
-    mask = _mm_unpacklo_epi64(mask, mask);
 
-    // flat_mask4
+    // lp filter - the same for 6, 8 and 14 versions
+    filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
 
+    // flat_mask4
     flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
     flat = _mm_max_epu8(abs_p1p0, flat);
 
@@ -1059,11 +1825,11 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2(
     flat = _mm_unpacklo_epi64(flat, flat);
   }
 
-  // filter8
-  {
+  // filter8 need it only if flat !=0
+  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
     const __m128i four = _mm_set1_epi16(4);
 
-    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+    __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2;
     p2_16 = _mm_unpacklo_epi8(*p2, zero);
     p1_16 = _mm_unpacklo_epi8(*p1, zero);
     p0_16 = _mm_unpacklo_epi8(*p0, zero);
@@ -1078,8 +1844,7 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2(
         _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
     workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
     workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
-    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-    op2 = _mm_packus_epi16(workp_shft0, workp_shft0);
+    workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
 
     // op1
     workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
@@ -1108,27 +1873,22 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2(
     workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
     workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
     workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-    oq2 = _mm_packus_epi16(workp_shft1, workp_shft1);
-  }
 
-  // lp filter - the same for 6, 8 and 14 versions
-  filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+    opq2 = _mm_packus_epi16(workp_shft2, workp_shft1);
 
-  qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
-  q1q0 = _mm_and_si128(flat, flat_q0q1);
-  *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+    work_pq = _mm_andnot_si128(flat, q2p2);
+    pq2 = _mm_and_si128(flat, opq2);
+    *p2 = _mm_or_si128(work_pq, pq2);
+    *q2 = _mm_srli_si128(*p2, 8);
 
-  ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
-  p1p0 = _mm_and_si128(flat, flat_p1p0);
-  *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+    q1q0 = _mm_and_si128(flat, flat_q0q1);
+    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
 
-  work_a = _mm_andnot_si128(flat, *q2);
-  q2_16 = _mm_and_si128(flat, oq2);
-  *q2_out = _mm_or_si128(work_a, q2_16);
-
-  work_a = _mm_andnot_si128(flat, *p2);
-  p2_16 = _mm_and_si128(flat, op2);
-  *p2_out = _mm_or_si128(work_a, p2_16);
+    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+    p1p0 = _mm_and_si128(flat, flat_p1p0);
+    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+  }
 }
 
 void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
@@ -1136,7 +1896,7 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
                                const unsigned char *_limit,
                                const unsigned char *_thresh) {
   __m128i p2, p1, p0, q0, q1, q2, p3, q3;
-  __m128i q1q0, p1p0, p2_out, q2_out;
+  __m128i q1q0, p1p0;
   __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
   __m128i limit = _mm_load_si128((const __m128i *)_limit);
   __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
@@ -1151,14 +1911,14 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
   q3 = _mm_cvtsi32_si128(*(int *)(s + 3 * p));
 
   lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
-                      &p2_out, &q2_out, &blimit, &limit, &thresh);
+                      &blimit, &limit, &thresh);
 
   xx_storel_32(s - 1 * p, p1p0);
-  xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8));
+  xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
   xx_storel_32(s + 0 * p, q1q0);
-  xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8));
-  xx_storel_32(s - 3 * p, p2_out);
-  xx_storel_32(s + 2 * p, q2_out);
+  xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
+  xx_storel_32(s - 3 * p, p2);
+  xx_storel_32(s + 2 * p, q2);
 }
 
 void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
@@ -1196,8 +1956,8 @@ void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
   q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)),
                             _mm_loadl_epi64((__m128i *)(s + 6 * p)));
 
-  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
-                       &limit, &thresh);
+  lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
+                            &blimit, &limit, &thresh);
 
   _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
   _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8));
@@ -1227,7 +1987,7 @@ void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
                                       _mm_load_si128((__m128i *)_thresh1));
 
   __m128i p2, p1, p0, q0, q1, q2, p3, q3;
-  __m128i q1q0, p1p0, p2_out, q2_out;
+  __m128i q1q0, p1p0;
 
   p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
   p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
@@ -1238,15 +1998,15 @@ void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
   q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
   q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
 
-  lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
-                      &p2_out, &q2_out, &blimit, &limit, &thresh);
+  lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
+                           &blimit, &limit, &thresh);
 
   _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
   _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
   _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
   _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
-  _mm_storel_epi64((__m128i *)(s - 3 * p), p2_out);
-  _mm_storel_epi64((__m128i *)(s + 2 * p), q2_out);
+  _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+  _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
 }
 
 void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
@@ -1282,7 +2042,7 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
 
   __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
 
-  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
+  lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
 
   _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);
   _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8));
@@ -1331,7 +2091,7 @@ void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
   transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0,
                         &q1);
 
-  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
+  lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
 
   p1 = _mm_srli_si128(ps1ps0, 8);
   q1 = _mm_srli_si128(qs1qs0, 8);
@@ -1372,8 +2132,8 @@ void aom_lpf_vertical_6_sse2(unsigned char *s, int p,
   lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit,
                       &limit, &thresh);
 
-  p0 = _mm_srli_si128(p1p0, 8);
-  q0 = _mm_srli_si128(q1q0, 8);
+  p0 = _mm_srli_si128(p1p0, 4);
+  q0 = _mm_srli_si128(q1q0, 4);
 
   transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
 
@@ -1419,8 +2179,8 @@ void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
   d5 = _mm_srli_si128(d4d5, 8);
   d7 = _mm_srli_si128(d6d7, 8);
 
-  lpf_internal_6_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0, &blimit,
-                      &limit, &thresh);
+  lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0,
+                           &blimit, &limit, &thresh);
 
   p0 = _mm_srli_si128(p1p0, 8);
   q0 = _mm_srli_si128(q1q0, 8);
@@ -1444,7 +2204,7 @@ void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
                              const unsigned char *_thresh) {
   __m128i d0, d1, d2, d3, d4, d5, d6, d7;
 
-  __m128i p2, p0, q0, q2;
+  __m128i p0, q0;
   __m128i x2, x1, x0, x3;
   __m128i q1q0, p1p0;
   __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
@@ -1459,13 +2219,13 @@ void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
   transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
                         &d7);
   // Loop filtering
-  lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, &p2,
-                      &q2, &blimit, &limit, &thresh);
+  lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0,
+                      &blimit, &limit, &thresh);
 
-  p0 = _mm_srli_si128(p1p0, 8);
-  q0 = _mm_srli_si128(q1q0, 8);
+  p0 = _mm_srli_si128(p1p0, 4);
+  q0 = _mm_srli_si128(q1q0, 4);
 
-  transpose8x8_low_sse2(&d0, &p2, &p0, &p1p0, &q1q0, &q0, &q2, &d7, &d0, &d1,
+  transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1,
                         &d2, &d3);
 
   _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0);
@@ -1490,7 +2250,7 @@ void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
   __m128i d1, d3, d5, d7;
   __m128i q1q0, p1p0;
-  __m128i p2, p1, q1, q2;
+  __m128i p1, q1;
   __m128i d0d1, d2d3, d4d5, d6d7;
 
   x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p));
@@ -1510,14 +2270,14 @@ void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
   d5 = _mm_srli_si128(d4d5, 8);
   d7 = _mm_srli_si128(d6d7, 8);
 
-  lpf_internal_8_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5, &q1q0,
-                      &p1p0, &p2, &q2, &blimit, &limit, &thresh);
+  lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5,
+                           &q1q0, &p1p0, &blimit, &limit, &thresh);
 
   p1 = _mm_srli_si128(p1p0, 8);
   q1 = _mm_srli_si128(q1q0, 8);
 
-  transpose8x8_sse2(&d0d1, &p2, &p1, &p1p0, &q1q0, &q1, &q2, &d7, &d0d1, &d2d3,
-                    &d4d5, &d6d7);
+  transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1,
+                    &d2d3, &d4d5, &d6d7);
 
   _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1);
   _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8));
@@ -1533,65 +2293,30 @@ void aom_lpf_vertical_14_sse2(unsigned char *s, int p,
                               const unsigned char *_blimit,
                               const unsigned char *_limit,
                               const unsigned char *_thresh) {
-  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
-  __m128i x6, x5, x4, x3, x2, x1, x0;
-  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
-  __m128i q0, q1, q2, q3, q4, q5, q6, q7;
-  __m128i p0_out, p1_out, p2_out, p3_out;
+  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+  __m128i x6, x5, x4, x3;
+  __m128i pq0, pq1, pq2, pq3;
   __m128i blimit = _mm_load_si128((__m128i *)_blimit);
   __m128i limit = _mm_load_si128((__m128i *)_limit);
   __m128i thresh = _mm_load_si128((__m128i *)_thresh);
 
-  x6 = _mm_loadl_epi64((__m128i *)((s - 8) + 0 * p));
-  x5 = _mm_loadl_epi64((__m128i *)((s - 8) + 1 * p));
-  x4 = _mm_loadl_epi64((__m128i *)((s - 8) + 2 * p));
-  x3 = _mm_loadl_epi64((__m128i *)((s - 8) + 3 * p));
-
-  transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &p0, &p1, &p2, &p3, &p4, &p5, &p6,
-                        &p7);
+  x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
+  x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
+  x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
+  x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
 
-  x6 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
-  x5 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-  x4 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
-  x3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
-
-  transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &q0, &q1, &q2, &q3, &q4, &q5, &q6,
-                        &q7);
-
-  q6p6 = _mm_unpacklo_epi64(p1, q6);
-  q5p5 = _mm_unpacklo_epi64(p2, q5);
-  q4p4 = _mm_unpacklo_epi64(p3, q4);
-  q3p3 = _mm_unpacklo_epi64(p4, q3);
-  q2p2 = _mm_unpacklo_epi64(p5, q2);
-  q1p1 = _mm_unpacklo_epi64(p6, q1);
-  q0p0 = _mm_unpacklo_epi64(p7, q0);
+  transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4,
+                       &q5p5, &q6p6, &q7p7);
 
   lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
                        &limit, &thresh);
 
-  transpose8x8_low_sse2(&p0, &p1, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
-                        &p0_out, &p1_out, &p2_out, &p3_out);
-
-  x0 = _mm_srli_si128(q0p0, 8);
-  x1 = _mm_srli_si128(q1p1, 8);
-  x2 = _mm_srli_si128(q2p2, 8);
-  x3 = _mm_srli_si128(q3p3, 8);
-  x4 = _mm_srli_si128(q4p4, 8);
-  x5 = _mm_srli_si128(q5p5, 8);
-  x6 = _mm_srli_si128(q6p6, 8);
-
-  transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &q0, &q1, &q2,
-                        &q3);
-
-  _mm_storel_epi64((__m128i *)(s - 8 + 0 * p), p0_out);
-  _mm_storel_epi64((__m128i *)(s - 8 + 1 * p), p1_out);
-  _mm_storel_epi64((__m128i *)(s - 8 + 2 * p), p2_out);
-  _mm_storel_epi64((__m128i *)(s - 8 + 3 * p), p3_out);
-
-  _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
-  _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
-  _mm_storel_epi64((__m128i *)(s + 3 * p), q3);
+  transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
+                           &q0p0, &pq0, &pq1, &pq2, &pq3);
+  _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0);
+  _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1);
+  _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2);
+  _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3);
 }
 
 void aom_lpf_vertical_14_dual_sse2(
@@ -1634,8 +2359,8 @@ void aom_lpf_vertical_14_dual_sse2(
   q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8));
   q7 = _mm_srli_si128(d14d15, 8);
 
-  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
-                       &limit, &thresh);
+  lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
+                            &blimit, &limit, &thresh);
 
   x0 = _mm_srli_si128(q0p0, 8);
   x1 = _mm_srli_si128(q1p1, 8);
diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
index c6b6469b4..8970fe7dd 100644
--- a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
+++ b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef _AOM_DSP_X86_LPF_COMMON_X86_H
-#define _AOM_DSP_X86_LPF_COMMON_X86_H
+#ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
+#define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
 
 #include <emmintrin.h>  // SSE2
 
@@ -212,4 +212,4 @@ static INLINE void highbd_transpose8x16_sse2(
                            d4 + 1, d5 + 1, d6 + 1, d7 + 1);
 }
 
-#endif  // _AOM_DSP_X86_LPF_COMMON_X86_H
+#endif  // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
index 6538e4d5e..584b5e7e3 100644
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
@@ -9,7 +9,6 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include <stdio.h>
 #include <tmmintrin.h>
 
 #include "config/aom_config.h"
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
index 19b429d91..cffbd9672 100644
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef _AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H
-#define _AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H
+#ifndef AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
+#define AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
 
 unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
                                      const uint8_t *a_ptr, int a_stride,
@@ -30,4 +30,4 @@ unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
                                             const uint8_t *m_ptr, int m_stride,
                                             int height);
 
-#endif
+#endif  // AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
index dc41a8342..4faa098ac 100644
--- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
+++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef _AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H
-#define _AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H
+#ifndef AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
+#define AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
 
 #include <stdlib.h>
 #include <string.h>
@@ -89,4 +89,4 @@ static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height,
   } while (i < height);
 }
 
-#endif
+#endif  // AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/mem_sse2.h b/third_party/aom/aom_dsp/x86/mem_sse2.h
index 8b69606dd..6c821673e 100644
--- a/third_party/aom/aom_dsp/x86/mem_sse2.h
+++ b/third_party/aom/aom_dsp/x86/mem_sse2.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AOM_DSP_X86_MEM_SSE2_H_
-#define AOM_DSP_X86_MEM_SSE2_H_
+#ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_
+#define AOM_AOM_DSP_X86_MEM_SSE2_H_
 
 #include <emmintrin.h>  // SSE2
 
@@ -39,4 +39,4 @@ static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
   return dst;
 }
 
-#endif  // AOM_DSP_X86_MEM_SSE2_H_
+#endif  // AOM_AOM_DSP_X86_MEM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
new file mode 100644
index 000000000..5181e444c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
+#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
+
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+
+static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
+                                    const int32_t *wsrc, const int32_t *mask,
+                                    unsigned int *const sse, int *const sum,
+                                    const int h) {
+  const int pre_step = pre_stride - 4;
+  int n = 0;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_d = _mm_setzero_si128();
+
+  assert(IS_POWER_OF_TWO(h));
+
+  do {
+    const __m128i v_p_b = _mm_cvtsi32_si128(*(const uint32_t *)(pre + n));
+    const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n));
+    const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n));
+
+    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+    const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
+    const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
+
+    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+    n += 4;
+
+    if (n % 4 == 0) pre += pre_step;
+  } while (n < 4 * h);
+
+  *sum = xx_hsum_epi32_si32(v_sum_d);
+  *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+#endif  // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
index a3535f985..48486c6c4 100644
--- a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
+++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
-#define AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
+#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
+#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
 
 #include <immintrin.h>
 
@@ -42,4 +42,13 @@ static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) {
   return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
 }
 
-#endif  // AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
+// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
+static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
+  const __m128i v_tmp_d =
+      _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
+  return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+#endif  // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
new file mode 100644
index 000000000..bfec0e8a8
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
+                                     const int32_t *wsrc, const int32_t *mask,
+                                     unsigned int *const sse, int *const sum,
+                                     const int w, const int h) {
+  int n = 0, width, height = h;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_d = _mm_setzero_si128();
+  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+  __m128i v_d;
+  const uint8_t *pre_temp;
+  assert(w >= 8);
+  assert(IS_POWER_OF_TWO(w));
+  assert(IS_POWER_OF_TWO(h));
+  do {
+    width = w;
+    pre_temp = pre;
+    do {
+      const __m128i v_p_b = _mm_loadl_epi64((const __m128i *)pre_temp);
+      const __m256i v_m_d = _mm256_loadu_si256((__m256i const *)(mask + n));
+      const __m256i v_w_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
+      const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
+
+      // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+      // boundaries. We use pmaddwd, as it has lower latency on Haswell
+      // than pmulld but produces the same result with these inputs.
+      const __m256i v_pm_d = _mm256_madd_epi16(v_p0_d, v_m_d);
+      const __m256i v_diff0_d = _mm256_sub_epi32(v_w_d, v_pm_d);
+
+      const __m256i v_sign_d = _mm256_srai_epi32(v_diff0_d, 31);
+      const __m256i v_tmp_d =
+          _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign_d);
+      const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp_d, 12);
+      const __m128i v_rdiff_d = _mm256_castsi256_si128(v_rdiff0_d);
+      const __m128i v_rdiff1_d = _mm256_extracti128_si256(v_rdiff0_d, 1);
+
+      const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff_d, v_rdiff1_d);
+      const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+      v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+      v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
+      v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+      pre_temp += 8;
+      n += 8;
+      width -= 8;
+    } while (width > 0);
+    pre += pre_stride;
+    height -= 1;
+  } while (height > 0);
+  v_d = _mm_hadd_epi32(v_sum_d, v_sse_d);
+  v_d = _mm_hadd_epi32(v_d, v_d);
+  *sum = _mm_cvtsi128_si32(v_d);
+  *sse = _mm_cvtsi128_si32(_mm_srli_si128(v_d, 4));
+}
+
+static INLINE void obmc_variance_w16n(const uint8_t *pre, const int pre_stride,
+                                      const int32_t *wsrc, const int32_t *mask,
+                                      unsigned int *const sse, int *const sum,
+                                      const int w, const int h) {
+  int n = 0, width, height = h;
+  __m256i v_d;
+  __m128i res0;
+  const uint8_t *pre_temp;
+  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+  __m256i v_sum_d = _mm256_setzero_si256();
+  __m256i v_sse_d = _mm256_setzero_si256();
+
+  assert(w >= 16);
+  assert(IS_POWER_OF_TWO(w));
+  assert(IS_POWER_OF_TWO(h));
+  do {
+    width = w;
+    pre_temp = pre;
+    do {
+      const __m128i v_p_b = _mm_loadu_si128((__m128i *)pre_temp);
+      const __m256i v_m0_d = _mm256_loadu_si256((__m256i const *)(mask + n));
+      const __m256i v_w0_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
+      const __m256i v_m1_d =
+          _mm256_loadu_si256((__m256i const *)(mask + n + 8));
+      const __m256i v_w1_d =
+          _mm256_loadu_si256((__m256i const *)(wsrc + n + 8));
+
+      const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
+      const __m256i v_p1_d = _mm256_cvtepu8_epi32(_mm_srli_si128(v_p_b, 8));
+
+      const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+      const __m256i v_pm1_d = _mm256_madd_epi16(v_p1_d, v_m1_d);
+
+      const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+      const __m256i v_diff1_d = _mm256_sub_epi32(v_w1_d, v_pm1_d);
+
+      const __m256i v_sign0_d = _mm256_srai_epi32(v_diff0_d, 31);
+      const __m256i v_sign1_d = _mm256_srai_epi32(v_diff1_d, 31);
+
+      const __m256i v_tmp0_d =
+          _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign0_d);
+      const __m256i v_tmp1_d =
+          _mm256_add_epi32(_mm256_add_epi32(v_diff1_d, v_bias_d), v_sign1_d);
+
+      const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp0_d, 12);
+      const __m256i v_rdiff2_d = _mm256_srai_epi32(v_tmp1_d, 12);
+
+      const __m256i v_rdiff1_d = _mm256_add_epi32(v_rdiff0_d, v_rdiff2_d);
+      const __m256i v_rdiff01_w = _mm256_packs_epi32(v_rdiff0_d, v_rdiff2_d);
+      const __m256i v_sqrdiff_d = _mm256_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+      v_sum_d = _mm256_add_epi32(v_sum_d, v_rdiff1_d);
+      v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff_d);
+
+      pre_temp += 16;
+      n += 16;
+      width -= 16;
+    } while (width > 0);
+    pre += pre_stride;
+    height -= 1;
+  } while (height > 0);
+
+  v_d = _mm256_hadd_epi32(v_sum_d, v_sse_d);
+  v_d = _mm256_hadd_epi32(v_d, v_d);
+  res0 = _mm256_castsi256_si128(v_d);
+  res0 = _mm_add_epi32(res0, _mm256_extractf128_si256(v_d, 1));
+  *sum = _mm_cvtsi128_si32(res0);
+  *sse = _mm_cvtsi128_si32(_mm_srli_si128(res0, 4));
+}
+
+#define OBMCVARWXH(W, H)                                                \
+  unsigned int aom_obmc_variance##W##x##H##_avx2(                       \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc,          \
+      const int32_t *mask, unsigned int *sse) {                         \
+    int sum;                                                            \
+    if (W == 4) {                                                       \
+      obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H);      \
+    } else if (W == 8) {                                                \
+      obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H);  \
+    } else {                                                            \
+      obmc_variance_w16n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
+    }                                                                   \
+                                                                        \
+    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));       \
+  }
+
+OBMCVARWXH(128, 128)
+OBMCVARWXH(128, 64)
+OBMCVARWXH(64, 128)
+OBMCVARWXH(64, 64)
+OBMCVARWXH(64, 32)
+OBMCVARWXH(32, 64)
+OBMCVARWXH(32, 32)
+OBMCVARWXH(32, 16)
+OBMCVARWXH(16, 32)
+OBMCVARWXH(16, 16)
+OBMCVARWXH(16, 8)
+OBMCVARWXH(8, 16)
+OBMCVARWXH(8, 8)
+OBMCVARWXH(8, 4)
+OBMCVARWXH(4, 8)
+OBMCVARWXH(4, 4)
+OBMCVARWXH(4, 16)
+OBMCVARWXH(16, 4)
+OBMCVARWXH(8, 32)
+OBMCVARWXH(32, 8)
+OBMCVARWXH(16, 64)
+OBMCVARWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
index 2e2f6e09f..72eda0e57 100644
--- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
@@ -19,7 +19,7 @@
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
 #include "aom_dsp/x86/synonyms.h"
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -36,45 +36,6 @@ void aom_var_filter_block2d_bil_second_pass_ssse3(
     unsigned int pixel_step, unsigned int output_height,
     unsigned int output_width, const uint8_t *filter);
 
-static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
-                                    const int32_t *wsrc, const int32_t *mask,
-                                    unsigned int *const sse, int *const sum,
-                                    const int h) {
-  const int pre_step = pre_stride - 4;
-  int n = 0;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_d = _mm_setzero_si128();
-
-  assert(IS_POWER_OF_TWO(h));
-
-  do {
-    const __m128i v_p_b = xx_loadl_32(pre + n);
-    const __m128i v_m_d = xx_load_128(mask + n);
-    const __m128i v_w_d = xx_load_128(wsrc + n);
-
-    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
-
-    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
-    const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
-    const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
-
-    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
-    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
-
-    n += 4;
-
-    if (n % 4 == 0) pre += pre_step;
-  } while (n < 4 * h);
-
-  *sum = xx_hsum_epi32_si32(v_sum_d);
-  *sse = xx_hsum_epi32_si32(v_sse_d);
-}
-
 static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
                                      const int32_t *wsrc, const int32_t *mask,
                                      unsigned int *const sse, int *const sum,
diff --git a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
index e6b40262d..216a0bd8f 100644
--- a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
+++ b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
@@ -16,16 +16,12 @@
 SECTION .text
 
 %macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
                                 shift, qcoeff, dqcoeff, dequant, \
                                 eob, scan, iscan
 
   vzeroupper
 
-  ; If we can skip this block, then just zero the output
-  cmp                         skipmp, 0
-  jne .blank
-
 %ifnidn %1, b_32x32
 
   ; Special case for ncoeff == 16, as it is frequent and we can save on
@@ -83,14 +79,14 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
 .single_nonzero:
 
   ; Actual quantization of size 16 block - setup pointers, rounders, etc.
-  movifnidn                       r4, roundmp
-  movifnidn                       r5, quantmp
-  mov                             r3, dequantmp
-  mov                             r6, shiftmp
-  mova                            m1, [r4]              ; m1 = round
-  mova                            m2, [r5]              ; m2 = quant
-  mova                            m3, [r3]              ; m3 = dequant
-  mova                            m4, [r6]              ; m4 = shift
+  movifnidn                       r3, roundmp
+  movifnidn                       r4, quantmp
+  mov                             r6, dequantmp
+  mov                             r5, shiftmp
+  mova                            m1, [r3]              ; m1 = round
+  mova                            m2, [r4]              ; m2 = quant
+  mova                            m3, [r6]              ; m3 = dequant
+  mova                            m4, [r5]              ; m4 = shift
 
   mov                             r3, iscanmp
 
@@ -174,20 +170,20 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
 
 %endif ; %ifnidn %1, b_32x32
 
-DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
+DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \
             qcoeff, dqcoeff, dequant, eob, scan, iscan
 
   ; Actual quantization loop - setup pointers, rounders, etc.
   movifnidn                   coeffq, coeffmp
   movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, dequantmp
   movifnidn                    zbinq, zbinmp
   movifnidn                   roundq, roundmp
   movifnidn                   quantq, quantmp
+  movifnidn                 dequantq, dequantmp
   mova                            m0, [zbinq]              ; m0 = zbin
   mova                            m1, [roundq]             ; m1 = round
   mova                            m2, [quantq]             ; m2 = quant
-  mova                            m3, [r2]                 ; m3 = dequant
+  mova                            m3, [dequantq]           ; m3 = dequant
   pcmpeqw                         m4, m4                   ; All lanes -1
 %ifidn %1, b_32x32
   psubw                           m0, m4
@@ -199,7 +195,7 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
 
   mov                             r2, shiftmp
   mov                             r3, qcoeffmp
-  mova                            m4, [r2]                 ; m4 = shift
+  mova                            m4, [r2]            ; m4 = shift
   mov                             r4, dqcoeffmp
   mov                             r5, iscanmp
 %ifidn %1, b_32x32
@@ -207,7 +203,7 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
 %endif
   pxor                            m5, m5                   ; m5 = dedicated zero
 
-  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
+  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
 
 
   lea                         coeffq, [  coeffq+ncoeffq*4]
@@ -432,39 +428,8 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   mov                           [r2], ax
   vzeroupper
   RET
-
-  ; Skip-block, i.e. just write all zeroes
-.blank:
-
-DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
-            qcoeff, dqcoeff, dequant, eob, scan, iscan
-
-  mov                             r0, dqcoeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, qcoeffmp
-  mov                             r3, eobmp
-
-DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
-
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
-  neg                        ncoeffq
-  pxor                            m7, m7
-
-.blank_loop:
-  mova       [dqcoeffq+ncoeffq*4+ 0], ymm7
-  mova       [dqcoeffq+ncoeffq*4+32], ymm7
-  mova        [qcoeffq+ncoeffq*4+ 0], ymm7
-  mova        [qcoeffq+ncoeffq*4+32], ymm7
-  add                        ncoeffq, mmsize
-  jl .blank_loop
-
-  mov                         [eobq], word 0
-
-  vzeroupper
-  RET
 %endmacro
 
 INIT_XMM avx
-QUANTIZE_FN b, 7
-QUANTIZE_FN b_32x32, 7
+QUANTIZE_FN b, 9
+QUANTIZE_FN b_32x32, 9
diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c
index 46b9c7d29..d3de6e24d 100644
--- a/third_party/aom/aom_dsp/x86/quantize_sse2.c
+++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c
@@ -9,242 +9,139 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <assert.h>
 #include <emmintrin.h>
 #include <xmmintrin.h>
 
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
 
 static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
-  if (sizeof(tran_low_t) == 4) {
-    return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
-                          (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
-                          (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
-                          (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
-  } else {
-    return _mm_load_si128((const __m128i *)coeff_ptr);
-  }
+  assert(sizeof(tran_low_t) == 4);
+
+  return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
+                        (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
+                        (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
+                        (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
 }
 
 static INLINE void store_coefficients(__m128i coeff_vals,
                                       tran_low_t *coeff_ptr) {
-  if (sizeof(tran_low_t) == 4) {
-    __m128i one = _mm_set1_epi16(1);
-    __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
-    __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
-    __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
-    __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
-    _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
-    _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
-  } else {
-    _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals);
-  }
+  assert(sizeof(tran_low_t) == 4);
+
+  __m128i one = _mm_set1_epi16(1);
+  __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
+  __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
+  __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
+  __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
+  _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
+  _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
 }
 
 void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                         int skip_block, const int16_t *zbin_ptr,
-                         const int16_t *round_ptr, const int16_t *quant_ptr,
+                         const int16_t *zbin_ptr, const int16_t *round_ptr,
+                         const int16_t *quant_ptr,
                          const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                          uint16_t *eob_ptr, const int16_t *scan_ptr,
                          const int16_t *iscan_ptr) {
-  __m128i zero;
+  const __m128i zero = _mm_setzero_si128();
+  int index = 16;
+
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1;
+  __m128i eob, eob0;
+
   (void)scan_ptr;
 
-  coeff_ptr += n_coeffs;
-  iscan_ptr += n_coeffs;
-  qcoeff_ptr += n_coeffs;
-  dqcoeff_ptr += n_coeffs;
-  n_coeffs = -n_coeffs;
-  zero = _mm_setzero_si128();
-  if (!skip_block) {
-    __m128i eob;
-    __m128i zbin;
-    __m128i round, quant, dequant, shift;
-    {
-      __m128i coeff0, coeff1;
-
-      // Setup global values
-      {
-        __m128i pw_1;
-        zbin = _mm_load_si128((const __m128i *)zbin_ptr);
-        round = _mm_load_si128((const __m128i *)round_ptr);
-        quant = _mm_load_si128((const __m128i *)quant_ptr);
-        pw_1 = _mm_set1_epi16(1);
-        zbin = _mm_sub_epi16(zbin, pw_1);
-        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-        shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
-      }
-
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-        __m128i cmp_mask0, cmp_mask1;
-        // Do DC and first 15 AC
-        coeff0 = load_coefficients(coeff_ptr + n_coeffs);
-        coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
-
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
-        zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
-        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        round = _mm_unpackhi_epi64(round, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        quant = _mm_unpackhi_epi64(quant, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
-        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
-        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
-        shift = _mm_unpackhi_epi64(shift, shift);
-        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
-
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        // Mask out zbin threshold coeffs
-        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
-        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
-
-        store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
-        store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
-
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        dequant = _mm_unpackhi_epi64(dequant, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-        store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
-        store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
-      }
-
-      {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob = _mm_max_epi16(eob, eob1);
-      }
-      n_coeffs += 8 * 2;
-    }
-
-    // AC only loop
-    while (n_coeffs < 0) {
-      __m128i coeff0, coeff1;
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-        __m128i cmp_mask0, cmp_mask1;
-
-        coeff0 = load_coefficients(coeff_ptr + n_coeffs);
-        coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
-
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
-        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
-        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
-        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
-        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
-
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        // Mask out zbin threshold coeffs
-        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
-        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
-
-        store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
-        store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
-
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-        store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
-        store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
-      }
-
-      {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob0, eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob0 = _mm_max_epi16(eob0, eob1);
-        eob = _mm_max_epi16(eob, eob0);
-      }
-      n_coeffs += 8 * 2;
-    }
-
-    // Accumulate EOB
-    {
-      __m128i eob_shuffled;
-      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      *eob_ptr = _mm_extract_epi16(eob, 1);
-    }
-  } else {
-    do {
-      store_coefficients(zero, dqcoeff_ptr + n_coeffs);
-      store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8);
-      store_coefficients(zero, qcoeff_ptr + n_coeffs);
-      store_coefficients(zero, qcoeff_ptr + n_coeffs + 8);
-      n_coeffs += 8 * 2;
-    } while (n_coeffs < 0);
-    *eob_ptr = 0;
+  // Setup global values.
+  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+                dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_coefficients(coeff_ptr);
+  coeff1 = load_coefficients(coeff_ptr + 8);
+
+  // Poor man's abs().
+  coeff0_sign = _mm_srai_epi16(coeff0, 15);
+  coeff1_sign = _mm_srai_epi16(coeff1, 15);
+  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+  calculate_qcoeff(&qcoeff0, round, quant, shift);
+
+  round = _mm_unpackhi_epi64(round, round);
+  quant = _mm_unpackhi_epi64(quant, quant);
+  shift = _mm_unpackhi_epi64(shift, shift);
+
+  calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+  // Reinsert signs
+  qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+  qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+  // Mask out zbin threshold coeffs
+  qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+  qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+  store_coefficients(qcoeff0, qcoeff_ptr);
+  store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+  coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+  dequant = _mm_unpackhi_epi64(dequant, dequant);
+  coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+  store_coefficients(coeff0, dqcoeff_ptr);
+  store_coefficients(coeff1, dqcoeff_ptr + 8);
+
+  eob =
+      scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
+
+  // AC only loop.
+  while (index < n_coeffs) {
+    coeff0 = load_coefficients(coeff_ptr + index);
+    coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+    coeff0_sign = _mm_srai_epi16(coeff0, 15);
+    coeff1_sign = _mm_srai_epi16(coeff1, 15);
+    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+    calculate_qcoeff(&qcoeff0, round, quant, shift);
+    calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_coefficients(qcoeff0, qcoeff_ptr + index);
+    store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+    coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+    coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+    store_coefficients(coeff0, dqcoeff_ptr + index);
+    store_coefficients(coeff1, dqcoeff_ptr + index + 8);
+
+    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
+                        index, zero);
+    eob = _mm_max_epi16(eob, eob0);
+
+    index += 16;
   }
+
+  *eob_ptr = accumulate_eob(eob);
 }
diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
index e2c1ebb71..39d4ca674 100644
--- a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
+++ b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
@@ -18,21 +18,18 @@ pw_1: times 8 dw 1
 
 SECTION .text
 
-; TODO(yunqingwang)fix quantize_b code for skip=1 case.
 %macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
                                 shift, qcoeff, dqcoeff, dequant, \
                                 eob, scan, iscan
-  cmp                    dword skipm, 0
-  jne .blank
 
   ; actual quantize loop - setup pointers, rounders, etc.
   movifnidn                   coeffq, coeffmp
   movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, dequantmp
   movifnidn                    zbinq, zbinmp
   movifnidn                   roundq, roundmp
   movifnidn                   quantq, quantmp
+  movifnidn                 dequantq, dequantmp
   mova                            m0, [zbinq]              ; m0 = zbin
   mova                            m1, [roundq]             ; m1 = round
   mova                            m2, [quantq]             ; m2 = quant
@@ -44,18 +41,18 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
   psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
 %endif
-  mova                            m3, [r2q]                ; m3 = dequant
-  psubw                           m0, [GLOBAL(pw_1)]
+  mova                            m3, [dequantq]           ; m3 = dequant
   mov                             r2, shiftmp
-  mov                             r3, qcoeffmp
+  psubw                           m0, [GLOBAL(pw_1)]
   mova                            m4, [r2]                 ; m4 = shift
+  mov                             r3, qcoeffmp
   mov                             r4, dqcoeffmp
   mov                             r5, iscanmp
 %ifidn %1, b_32x32
   psllw                           m4, 1
 %endif
   pxor                            m5, m5                   ; m5 = dedicated zero
-  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
+  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
   lea                         coeffq, [  coeffq+ncoeffq*4]
   lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
   lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
@@ -268,33 +265,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   pextrw                          r6, m8, 0
   mov                             [r2], r6
   RET
-
-  ; skip-block, i.e. just write all zeroes
-.blank:
-  mov                             r0, dqcoeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, qcoeffmp
-  mov                             r3, eobmp
-  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
-  neg                        ncoeffq
-  pxor                            m7, m7
-.blank_loop:
-  mova       [dqcoeffq+ncoeffq*4+ 0], m7
-  mova       [dqcoeffq+ncoeffq*4+16], m7
-  mova       [dqcoeffq+ncoeffq*4+32], m7
-  mova       [dqcoeffq+ncoeffq*4+48], m7
-  mova        [qcoeffq+ncoeffq*4+ 0], m7
-  mova        [qcoeffq+ncoeffq*4+16], m7
-  mova        [qcoeffq+ncoeffq*4+32], m7
-  mova        [qcoeffq+ncoeffq*4+48], m7
-  add                        ncoeffq, mmsize
-  jl .blank_loop
-  mov                    word [eobq], 0
-  RET
 %endmacro
 
 INIT_XMM ssse3
-QUANTIZE_FN b, 7
-QUANTIZE_FN b_32x32, 7
+QUANTIZE_FN b, 9
+QUANTIZE_FN b_32x32, 9
diff --git a/third_party/aom/aom_dsp/x86/quantize_x86.h b/third_party/aom/aom_dsp/x86/quantize_x86.h
new file mode 100644
index 000000000..4eed7dd29
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_x86.h
@@ -0,0 +1,77 @@
+/*
+ *  Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "aom/aom_integer.h"
+
+static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
+                                 const int16_t *round_ptr, __m128i *round,
+                                 const int16_t *quant_ptr, __m128i *quant,
+                                 const int16_t *dequant_ptr, __m128i *dequant,
+                                 const int16_t *shift_ptr, __m128i *shift) {
+  *zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+  *round = _mm_load_si128((const __m128i *)round_ptr);
+  *quant = _mm_load_si128((const __m128i *)quant_ptr);
+  *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1));
+  *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+  *shift = _mm_load_si128((const __m128i *)shift_ptr);
+}
+
+// With ssse3 and later abs() and sign() are preferred.
+static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
+  a = _mm_xor_si128(a, sign);
+  return _mm_sub_epi16(a, sign);
+}
+
+static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round,
+                                    const __m128i quant, const __m128i shift) {
+  __m128i tmp, qcoeff;
+  qcoeff = _mm_adds_epi16(*coeff, round);
+  tmp = _mm_mulhi_epi16(qcoeff, quant);
+  qcoeff = _mm_add_epi16(tmp, qcoeff);
+  *coeff = _mm_mulhi_epi16(qcoeff, shift);
+}
+
+static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) {
+  return _mm_mullo_epi16(qcoeff, dequant);
+}
+
+// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing
+// to zbin to add 1 to the index in 'scan'.
+static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
+                                   const __m128i zbin_mask0,
+                                   const __m128i zbin_mask1,
+                                   const int16_t *scan_ptr, const int index,
+                                   const __m128i zero) {
+  const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero);
+  const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero);
+  __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index));
+  __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8));
+  __m128i eob0, eob1;
+  // Add one to convert from indices to counts
+  scan0 = _mm_sub_epi16(scan0, zbin_mask0);
+  scan1 = _mm_sub_epi16(scan1, zbin_mask1);
+  eob0 = _mm_andnot_si128(zero_coeff0, scan0);
+  eob1 = _mm_andnot_si128(zero_coeff1, scan1);
+  return _mm_max_epi16(eob0, eob1);
+}
+
+static INLINE int16_t accumulate_eob(__m128i eob) {
+  __m128i eob_shuffled;
+  eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+  eob = _mm_max_epi16(eob, eob_shuffled);
+  eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+  eob = _mm_max_epi16(eob, eob_shuffled);
+  eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+  eob = _mm_max_epi16(eob, eob_shuffled);
+  return _mm_extract_epi16(eob, 1);
+}
diff --git a/third_party/aom/aom_dsp/x86/sse_avx2.c b/third_party/aom/aom_dsp/x86/sse_avx2.c
new file mode 100644
index 000000000..305dde5c0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sse_avx2.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <smmintrin.h>
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
+                                const uint8_t *b) {
+  const __m256i v_a0 = yy_loadu_256(a);
+  const __m256i v_b0 = yy_loadu_256(b);
+  const __m256i v_a00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_a0));
+  const __m256i v_a01_w =
+      _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_a0, 1));
+  const __m256i v_b00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_b0));
+  const __m256i v_b01_w =
+      _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_b0, 1));
+  const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
+  const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));
+}
+
+static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {
+  int64_t sum;
+  const __m256i sum0_4x64 =
+      _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum_all));
+  const __m256i sum1_4x64 =
+      _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum_all, 1));
+  const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+  const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+                                         _mm256_extracti128_si256(sum_4x64, 1));
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+
+int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
+                     int b_stride, int width, int height) {
+  int32_t y = 0;
+  int64_t sse = 0;
+  __m256i sum = _mm256_setzero_si256();
+  switch (width) {
+    case 4:
+      do {
+        const __m128i v_a0 = xx_loadl_32(a);
+        const __m128i v_a1 = xx_loadl_32(a + a_stride);
+        const __m128i v_a2 = xx_loadl_32(a + a_stride * 2);
+        const __m128i v_a3 = xx_loadl_32(a + a_stride * 3);
+        const __m128i v_b0 = xx_loadl_32(b);
+        const __m128i v_b1 = xx_loadl_32(b + b_stride);
+        const __m128i v_b2 = xx_loadl_32(b + b_stride * 2);
+        const __m128i v_b3 = xx_loadl_32(b + b_stride * 3);
+        const __m128i v_a0123 = _mm_unpacklo_epi64(
+            _mm_unpacklo_epi32(v_a0, v_a1), _mm_unpacklo_epi32(v_a2, v_a3));
+        const __m128i v_b0123 = _mm_unpacklo_epi64(
+            _mm_unpacklo_epi32(v_b0, v_b1), _mm_unpacklo_epi32(v_b2, v_b3));
+        const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123);
+        const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123);
+        const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+        sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
+        a += a_stride << 2;
+        b += b_stride << 2;
+        y += 4;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 8:
+      do {
+        const __m128i v_a0 = xx_loadl_64(a);
+        const __m128i v_a1 = xx_loadl_64(a + a_stride);
+        const __m128i v_b0 = xx_loadl_64(b);
+        const __m128i v_b1 = xx_loadl_64(b + b_stride);
+        const __m256i v_a_w =
+            _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1));
+        const __m256i v_b_w =
+            _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1));
+        const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+        sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
+        a += a_stride << 1;
+        b += b_stride << 1;
+        y += 2;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 16:
+      do {
+        const __m128i v_a0 = xx_loadu_128(a);
+        const __m128i v_b0 = xx_loadu_128(b);
+        const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0);
+        const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0);
+        const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+        sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 32:
+      do {
+        sse_w32_avx2(&sum, a, b);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 64:
+      do {
+        sse_w32_avx2(&sum, a, b);
+        sse_w32_avx2(&sum, a + 32, b + 32);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 128:
+      do {
+        sse_w32_avx2(&sum, a, b);
+        sse_w32_avx2(&sum, a + 32, b + 32);
+        sse_w32_avx2(&sum, a + 64, b + 64);
+        sse_w32_avx2(&sum, a + 96, b + 96);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    default: break;
+  }
+
+  return sse;
+}
+
+static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a,
+                                       const uint16_t *b) {
+  const __m256i v_a_w = yy_loadu_256(a);
+  const __m256i v_b_w = yy_loadu_256(b);
+  const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8,
+                            int b_stride, int width, int height) {
+  int32_t y = 0;
+  int64_t sse = 0;
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  __m256i sum = _mm256_setzero_si256();
+  switch (width) {
+    case 4:
+      do {
+        const __m128i v_a0 = xx_loadl_64(a);
+        const __m128i v_a1 = xx_loadl_64(a + a_stride);
+        const __m128i v_a2 = xx_loadl_64(a + a_stride * 2);
+        const __m128i v_a3 = xx_loadl_64(a + a_stride * 3);
+        const __m128i v_b0 = xx_loadl_64(b);
+        const __m128i v_b1 = xx_loadl_64(b + b_stride);
+        const __m128i v_b2 = xx_loadl_64(b + b_stride * 2);
+        const __m128i v_b3 = xx_loadl_64(b + b_stride * 3);
+        const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1),
+                                           _mm_unpacklo_epi64(v_a2, v_a3));
+        const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1),
+                                           _mm_unpacklo_epi64(v_b2, v_b3));
+        const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+        sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
+        a += a_stride << 2;
+        b += b_stride << 2;
+        y += 4;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 8:
+      do {
+        const __m256i v_a_w = yy_loadu2_128(a + a_stride, a);
+        const __m256i v_b_w = yy_loadu2_128(b + b_stride, b);
+        const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+        sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
+        a += a_stride << 1;
+        b += b_stride << 1;
+        y += 2;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 16:
+      do {
+        highbd_sse_w16_avx2(&sum, a, b);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 32:
+      do {
+        highbd_sse_w16_avx2(&sum, a, b);
+        highbd_sse_w16_avx2(&sum, a + 16, b + 16);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 64:
+      do {
+        highbd_sse_w16_avx2(&sum, a, b);
+        highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1);
+        highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2);
+        highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 128:
+      do {
+        highbd_sse_w16_avx2(&sum, a, b);
+        highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1);
+        highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2);
+        highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3);
+        highbd_sse_w16_avx2(&sum, a + 16 * 4, b + 16 * 4);
+        highbd_sse_w16_avx2(&sum, a + 16 * 5, b + 16 * 5);
+        highbd_sse_w16_avx2(&sum, a + 16 * 6, b + 16 * 6);
+        highbd_sse_w16_avx2(&sum, a + 16 * 7, b + 16 * 7);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    default: break;
+  }
+  return sse;
+}
diff --git a/third_party/aom/aom_dsp/x86/sse_sse4.c b/third_party/aom/aom_dsp/x86/sse_sse4.c
new file mode 100644
index 000000000..8b5af8469
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sse_sse4.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+static INLINE int64_t summary_all_sse4(const __m128i *sum_all) {
+  int64_t sum;
+  const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all);
+  const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8));
+  const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1);
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+
+static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a,
+                                  const uint8_t *b) {
+  const __m128i v_a0 = xx_loadu_128(a);
+  const __m128i v_b0 = xx_loadu_128(b);
+  const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0);
+  const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8));
+  const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0);
+  const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8));
+  const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w);
+  const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w);
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w));
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w));
+}
+
+int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b,
+                       int b_stride, int width, int height) {
+  int y = 0;
+  int64_t sse = 0;
+  __m128i sum = _mm_setzero_si128();
+  switch (width) {
+    case 4:
+      do {
+        const __m128i v_a0 = xx_loadl_32(a);
+        const __m128i v_a1 = xx_loadl_32(a + a_stride);
+        const __m128i v_b0 = xx_loadl_32(b);
+        const __m128i v_b1 = xx_loadl_32(b + b_stride);
+        const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1));
+        const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1));
+        const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+        sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w));
+        a += a_stride << 1;
+        b += b_stride << 1;
+        y += 2;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 8:
+      do {
+        const __m128i v_a0 = xx_loadl_64(a);
+        const __m128i v_b0 = xx_loadl_64(b);
+        const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0);
+        const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0);
+        const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+        sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w));
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 16:
+      do {
+        sse_w16_sse4_1(&sum, a, b);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 32:
+      do {
+        sse_w16_sse4_1(&sum, a, b);
+        sse_w16_sse4_1(&sum, a + 16, b + 16);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 64:
+      do {
+        sse_w16_sse4_1(&sum, a, b);
+        sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
+        sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
+        sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 128:
+      do {
+        sse_w16_sse4_1(&sum, a, b);
+        sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
+        sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
+        sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
+        sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4);
+        sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5);
+        sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6);
+        sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    default: break;
+  }
+
+  return sse;
+}
+
+static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a,
+                                        const uint16_t *b) {
+  const __m128i v_a_w = xx_loadu_128(a);
+  const __m128i v_b_w = xx_loadu_128(b);
+  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride,
+                              const uint8_t *b8, int b_stride, int width,
+                              int height) {
+  int32_t y = 0;
+  int64_t sse = 0;
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  __m128i sum = _mm_setzero_si128();
+  switch (width) {
+    case 4:
+      do {
+        const __m128i v_a0 = xx_loadl_64(a);
+        const __m128i v_a1 = xx_loadl_64(a + a_stride);
+        const __m128i v_b0 = xx_loadl_64(b);
+        const __m128i v_b1 = xx_loadl_64(b + b_stride);
+        const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1);
+        const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1);
+        const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+        sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w));
+        a += a_stride << 1;
+        b += b_stride << 1;
+        y += 2;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 8:
+      do {
+        highbd_sse_w8_sse4_1(&sum, a, b);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 16:
+      do {
+        highbd_sse_w8_sse4_1(&sum, a, b);
+        highbd_sse_w8_sse4_1(&sum, a + 8, b + 8);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 32:
+      do {
+        highbd_sse_w8_sse4_1(&sum, a, b);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 64:
+      do {
+        highbd_sse_w8_sse4_1(&sum, a, b);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 128:
+      do {
+        highbd_sse_w8_sse4_1(&sum, a, b);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 8, b + 8 * 8);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 9, b + 8 * 9);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 10, b + 8 * 10);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 11, b + 8 * 11);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 12, b + 8 * 12);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 13, b + 8 * 13);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 14, b + 8 * 14);
+        highbd_sse_w8_sse4_1(&sum, a + 8 * 15, b + 8 * 15);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    default: break;
+  }
+  return sse;
+}
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_avx2.c b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
new file mode 100644
index 000000000..0af44e3a4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/sum_squares_sse2.h"
+#include "config/aom_dsp_rtcd.h"
+
+static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride,
+                                                int width, int height) {
+  uint64_t result;
+  __m256i v_acc_q = _mm256_setzero_si256();
+  const __m256i v_zext_mask_q = yy_set1_64_from_32i(0xffffffff);
+  for (int col = 0; col < height; col += 4) {
+    __m256i v_acc_d = _mm256_setzero_si256();
+    for (int row = 0; row < width; row += 16) {
+      const int16_t *tempsrc = src + row;
+      const __m256i v_val_0_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride));
+      const __m256i v_val_1_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride));
+      const __m256i v_val_2_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride));
+      const __m256i v_val_3_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride));
+
+      const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w);
+      const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w);
+      const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w);
+      const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w);
+
+      const __m256i v_sum_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d);
+      const __m256i v_sum_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d);
+      const __m256i v_sum_0123_d = _mm256_add_epi32(v_sum_01_d, v_sum_23_d);
+
+      v_acc_d = _mm256_add_epi32(v_acc_d, v_sum_0123_d);
+    }
+    v_acc_q =
+        _mm256_add_epi64(v_acc_q, _mm256_and_si256(v_acc_d, v_zext_mask_q));
+    v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_srli_epi64(v_acc_d, 32));
+    src += 4 * stride;
+  }
+  __m128i lower_64_2_Value = _mm256_castsi256_si128(v_acc_q);
+  __m128i higher_64_2_Value = _mm256_extracti128_si256(v_acc_q, 1);
+  __m128i result_64_2_int = _mm_add_epi64(lower_64_2_Value, higher_64_2_Value);
+
+  result_64_2_int = _mm_add_epi64(
+      result_64_2_int, _mm_unpackhi_epi64(result_64_2_int, result_64_2_int));
+
+  xx_storel_64(&result, result_64_2_int);
+
+  return result;
+}
+
+uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width,
+                                     int height) {
+  if (LIKELY(width == 4 && height == 4)) {
+    return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
+  } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+    return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
+  } else if (LIKELY(width == 8 && (height & 3) == 0)) {
+    return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
+  } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) {
+    return aom_sum_squares_2d_i16_nxn_avx2(src, stride, width, height);
+  } else {
+    return aom_sum_squares_2d_i16_c(src, stride, width, height);
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
index a79f22d79..22d7739ec 100644
--- a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
+++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
@@ -14,6 +14,7 @@
 #include <stdio.h>
 
 #include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/sum_squares_sse2.h"
 #include "config/aom_dsp_rtcd.h"
 
 static INLINE __m128i xx_loadh_64(__m128i a, const void *b) {
@@ -44,8 +45,7 @@ static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) {
   return _mm_add_epi32(v_sq_01_d, v_sq_23_d);
 }
 
-static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
-                                                int stride) {
+uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) {
   const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride);
   __m128i v_sum_d =
       _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));
@@ -53,8 +53,8 @@ static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
   return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
 }
 
-static uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
-                                                int height) {
+uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+                                         int height) {
   int r = 0;
   __m128i v_acc_q = _mm_setzero_si128();
   do {
@@ -76,7 +76,7 @@ static uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
 // maintenance instructions in the common case of 4x4.
 __attribute__((noinline))
 #endif
-static uint64_t
+uint64_t
 aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
                                 int height) {
   int r = 0;
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.h b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h
new file mode 100644
index 000000000..491e31cc5
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_SUM_SQUARES_SSE2_H_
+#define AOM_DSP_X86_SUM_SQUARES_SSE2_H_
+
+uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride,
+                                         int width, int height);
+
+uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+                                         int height);
+uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride);
+
+#endif  // AOM_DSP_X86_SUM_SQUARES_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h
index d9a53fcc5..1e9f1e27b 100644
--- a/third_party/aom/aom_dsp/x86/synonyms.h
+++ b/third_party/aom/aom_dsp/x86/synonyms.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AOM_DSP_X86_SYNONYMS_H_
-#define AOM_DSP_X86_SYNONYMS_H_
+#ifndef AOM_AOM_DSP_X86_SYNONYMS_H_
+#define AOM_AOM_DSP_X86_SYNONYMS_H_
 
 #include <immintrin.h>
 
@@ -103,15 +103,6 @@ static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) {
   return _mm_srai_epi32(v_tmp_d, bits);
 }
 
-// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
-static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
-  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
-  const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
-  const __m128i v_tmp_d =
-      _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
-  return _mm_srai_epi32(v_tmp_d, bits);
-}
-
 static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) {
   const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1);
   const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15);
@@ -120,4 +111,4 @@ static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) {
   return _mm_srai_epi16(v_tmp_d, bits);
 }
 
-#endif  // AOM_DSP_X86_SYNONYMS_H_
+#endif  // AOM_AOM_DSP_X86_SYNONYMS_H_
diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
index 39f371fc9..3f69b120e 100644
--- a/third_party/aom/aom_dsp/x86/synonyms_avx2.h
+++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AOM_DSP_X86_SYNONYMS_AVX2_H_
-#define AOM_DSP_X86_SYNONYMS_AVX2_H_
+#ifndef AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
+#define AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
 
 #include <immintrin.h>
 
@@ -61,4 +61,14 @@ static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
   return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
 }
 
-#endif  // AOM_DSP_X86_SYNONYMS_AVX2_H_
+static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
+  __m128i mhi = _mm_loadu_si128((__m128i *)(hi));
+  __m128i mlo = _mm_loadu_si128((__m128i *)(lo));
+  return yy_set_m128i(mhi, mlo);
+}
+
+static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
+  const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);
+  return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256());
+}
+#endif  // AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/transpose_sse2.h b/third_party/aom/aom_dsp/x86/transpose_sse2.h
index f88a1527d..d0d1ee684 100644
--- a/third_party/aom/aom_dsp/x86/transpose_sse2.h
+++ b/third_party/aom/aom_dsp/x86/transpose_sse2.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AOM_DSP_X86_TRANSPOSE_SSE2_H_
-#define AOM_DSP_X86_TRANSPOSE_SSE2_H_
+#ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
+#define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
 
 #include <emmintrin.h>  // SSE2
 
@@ -417,4 +417,4 @@ static INLINE void transpose_32bit_8x4(const __m128i *const in,
   out[7] = _mm_unpackhi_epi64(a6, a7);
 }
 
-#endif  // AOM_DSP_X86_TRANSPOSE_SSE2_H_
+#endif  // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
index bdff64b8f..b1611ba87 100644
--- a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
+++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H_
-#define AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+#define AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
 
 #include <emmintrin.h>
 #include "aom/aom_integer.h"
@@ -196,4 +196,4 @@ static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) {
 }
 #endif
 
-#endif  // AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+#endif  // AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
index 58a792424..ed82eee96 100644
--- a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
+++ b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AOM_DSP_X86_TXFM_COMMON_SSE2_H_
-#define AOM_DSP_X86_TXFM_COMMON_SSE2_H_
+#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
+#define AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
 
 #include <emmintrin.h>
 #include "aom/aom_integer.h"
@@ -26,4 +26,4 @@ static INLINE __m128i mm_reverse_epi16(const __m128i x) {
   return _mm_shuffle_epi32(b, 0x4e);
 }
 
-#endif  // AOM_DSP_X86_TXFM_COMMON_SSE2_H_
+#endif  // AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c
index a7ac2c93d..800aef126 100644
--- a/third_party/aom/aom_dsp/x86/variance_avx2.c
+++ b/third_party/aom/aom_dsp/x86/variance_avx2.c
@@ -433,13 +433,14 @@ static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0,
   return comp;
 }
 
-void aom_highbd_comp_mask_pred_avx2(uint16_t *comp_pred, const uint8_t *pred8,
+void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8,
                                     int width, int height, const uint8_t *ref8,
                                     int ref_stride, const uint8_t *mask,
                                     int mask_stride, int invert_mask) {
   int i = 0;
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
   const uint16_t *src0 = invert_mask ? pred : ref;
   const uint16_t *src1 = invert_mask ? ref : pred;
   const int stride0 = invert_mask ? width : ref_stride;
diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c
index 7e3c5d5db..3c37e77c0 100644
--- a/third_party/aom/aom_dsp/x86/variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/variance_sse2.c
@@ -16,6 +16,7 @@
 #include "config/aom_dsp_rtcd.h"
 #include "config/av1_rtcd.h"
 
+#include "aom_dsp/blend.h"
 #include "aom_dsp/x86/synonyms.h"
 
 #include "aom_ports/mem.h"
@@ -485,7 +486,8 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
                              int mi_row, int mi_col, const MV *const mv,
                              uint8_t *comp_pred, int width, int height,
                              int subpel_x_q3, int subpel_y_q3,
-                             const uint8_t *ref, int ref_stride) {
+                             const uint8_t *ref, int ref_stride,
+                             int subpel_search) {
   // expect xd == NULL only in tests
   if (xd != NULL) {
     const MB_MODE_INFO *mi = xd->mi[0];
@@ -553,7 +555,7 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
       warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
 
       // Get convolve parameters.
-      ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd);
+      ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
       const InterpFilters filters =
           av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
 
@@ -570,7 +572,10 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
   }
 
   const InterpFilterParams *filter =
-      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+      (subpel_search == 1)
+          ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
+          : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+  int filter_taps = (subpel_search == 1) ? 4 : SUBPEL_TAPS;
 
   if (!subpel_x_q3 && !subpel_y_q3) {
     if (width >= 16) {
@@ -632,15 +637,25 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
         av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
     const int16_t *const kernel_y =
         av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    const int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+    const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
+    uint8_t *temp_start_horiz =
+        (subpel_search == 1) ? temp + (filter_taps >> 1) * MAX_SB_SIZE : temp;
+    uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
+    int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
     assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1),
-                        ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
-                        width, intermediate_height);
-    aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
-                       MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
-                       width, height);
+    // TODO(Deepa): Remove the memset below when we have
+    // 4 tap simd for sse2 and ssse3.
+    if (subpel_search == 1) {
+      memset(temp_start_vert - 3 * MAX_SB_SIZE, 0, width);
+      memset(temp_start_vert - 2 * MAX_SB_SIZE, 0, width);
+      memset(temp_start_vert + (height + 2) * MAX_SB_SIZE, 0, width);
+      memset(temp_start_vert + (height + 3) * MAX_SB_SIZE, 0, width);
+    }
+    aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
+                        kernel_x, 16, NULL, -1, width, intermediate_height);
+    aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
+                       kernel_y, 16, width, height);
   }
 }
 
@@ -648,11 +663,11 @@ void aom_comp_avg_upsampled_pred_sse2(
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
     const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride) {
+    int ref_stride, int subpel_search) {
   int n;
   int i;
   aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                     subpel_x_q3, subpel_y_q3, ref, ref_stride);
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
   /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
   assert(!(width * height & 15));
   n = width * height >> 4;
@@ -664,3 +679,128 @@ void aom_comp_avg_upsampled_pred_sse2(
     pred += 16;
   }
 }
+
+void aom_comp_mask_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+    int subpel_search) {
+  if (subpel_x_q3 | subpel_y_q3) {
+    aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                       subpel_search);
+    ref = comp_pred;
+    ref_stride = width;
+  }
+  aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
+                     mask_stride, invert_mask);
+}
+
+static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0,
+                                                      const __m128i s1,
+                                                      const __m128i a) {
+  const __m128i alpha_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+  const __m128i round_const =
+      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+  const __m128i a_inv = _mm_sub_epi16(alpha_max, a);
+
+  const __m128i s_lo = _mm_unpacklo_epi16(s0, s1);
+  const __m128i a_lo = _mm_unpacklo_epi16(a, a_inv);
+  const __m128i pred_lo = _mm_madd_epi16(s_lo, a_lo);
+  const __m128i pred_l = _mm_srai_epi32(_mm_add_epi32(pred_lo, round_const),
+                                        AOM_BLEND_A64_ROUND_BITS);
+
+  const __m128i s_hi = _mm_unpackhi_epi16(s0, s1);
+  const __m128i a_hi = _mm_unpackhi_epi16(a, a_inv);
+  const __m128i pred_hi = _mm_madd_epi16(s_hi, a_hi);
+  const __m128i pred_h = _mm_srai_epi32(_mm_add_epi32(pred_hi, round_const),
+                                        AOM_BLEND_A64_ROUND_BITS);
+
+  const __m128i comp = _mm_packs_epi32(pred_l, pred_h);
+
+  return comp;
+}
+
+void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8,
+                                    int width, int height, const uint8_t *ref8,
+                                    int ref_stride, const uint8_t *mask,
+                                    int mask_stride, int invert_mask) {
+  int i = 0;
+  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  const uint16_t *src0 = invert_mask ? pred : ref;
+  const uint16_t *src1 = invert_mask ? ref : pred;
+  const int stride0 = invert_mask ? width : ref_stride;
+  const int stride1 = invert_mask ? ref_stride : width;
+  const __m128i zero = _mm_setzero_si128();
+
+  if (width == 8) {
+    do {
+      const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
+      const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
+      const __m128i m_8 = _mm_loadl_epi64((const __m128i *)mask);
+      const __m128i m_16 = _mm_unpacklo_epi8(m_8, zero);
+
+      const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m_16);
+
+      _mm_storeu_si128((__m128i *)comp_pred, comp);
+
+      src0 += stride0;
+      src1 += stride1;
+      mask += mask_stride;
+      comp_pred += width;
+      i += 1;
+    } while (i < height);
+  } else if (width == 16) {
+    do {
+      const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
+      const __m128i s2 = _mm_loadu_si128((const __m128i *)(src0 + 8));
+      const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
+      const __m128i s3 = _mm_loadu_si128((const __m128i *)(src1 + 8));
+
+      const __m128i m_8 = _mm_loadu_si128((const __m128i *)mask);
+      const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
+      const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
+
+      const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+      const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+
+      _mm_storeu_si128((__m128i *)comp_pred, comp);
+      _mm_storeu_si128((__m128i *)(comp_pred + 8), comp1);
+
+      src0 += stride0;
+      src1 += stride1;
+      mask += mask_stride;
+      comp_pred += width;
+      i += 1;
+    } while (i < height);
+  } else if (width == 32) {
+    do {
+      for (int j = 0; j < 2; j++) {
+        const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0 + j * 16));
+        const __m128i s2 =
+            _mm_loadu_si128((const __m128i *)(src0 + 8 + j * 16));
+        const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1 + j * 16));
+        const __m128i s3 =
+            _mm_loadu_si128((const __m128i *)(src1 + 8 + j * 16));
+
+        const __m128i m_8 = _mm_loadu_si128((const __m128i *)(mask + j * 16));
+        const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
+        const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
+
+        const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+        const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+
+        _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp);
+        _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1);
+      }
+      src0 += stride0;
+      src1 += stride1;
+      mask += mask_stride;
+      comp_pred += width;
+      i += 1;
+    } while (i < height);
+  }
+}