Diffstat (limited to 'third_party/aom/av1/common/x86')
12 files changed, 1130 insertions, 29 deletions
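Most of the 1130 insertions below are the new 2D convolution kernels (convolve_2d_sse2.c and highbd_convolve_2d_ssse3.c) plus the AVX2 rounding helpers in convolve_avx2.c. As orientation before the SIMD code, here is a minimal scalar sketch of the two-pass pipeline those kernels vectorize in their non-CONFIG_COMPOUND_ROUND variants: a horizontal filter pass with a bit-depth-dependent offset and a round_0 shift into an intermediate buffer, then a vertical pass that cancels the offset, shifts by round_1, and accumulates into the 32-bit destination. The helper below is not part of the patch; the rounding constants mirror those visible in the diff, FILTER_BITS is assumed to be 7 as in aom_dsp/aom_filter.h, and the buffer sizing assumes blocks of at most 128x128 with at most 8 filter taps.

#include <stdint.h>

#define FILTER_BITS 7 /* assumed, as in aom_dsp/aom_filter.h */

/* Hypothetical scalar model of av1_convolve_2d (non-CONFIG_COMPOUND_ROUND path). */
static void convolve_2d_scalar_sketch(const uint8_t *src, int src_stride,
                                      int32_t *dst, int dst_stride, int w,
                                      int h, const int16_t *x_filter,
                                      const int16_t *y_filter, int taps,
                                      int round_0, int round_1) {
  const int bd = 8;
  const int fo_vert = taps / 2 - 1;
  const int fo_horiz = taps / 2 - 1;
  const int im_h = h + taps - 1;
  /* Horizontally filtered rows; assumes w, h <= 128 and taps <= 8. */
  static int32_t im_block[(128 + 7) * 128];

  /* Horizontal pass: filter, add (1 << (bd + FILTER_BITS - 1)), shift by
   * round_0 -- the same constants as the SSE2 round_const/round_shift. */
  for (int i = 0; i < im_h; ++i) {
    for (int j = 0; j < w; ++j) {
      int32_t sum = 1 << (bd + FILTER_BITS - 1);
      for (int k = 0; k < taps; ++k)
        sum += x_filter[k] * src[(i - fo_vert) * src_stride + j - fo_horiz + k];
      im_block[i * w + j] = (sum + ((1 << round_0) >> 1)) >> round_0;
    }
  }

  /* Vertical pass: filter the intermediate rows, subtract the offset
   * introduced above (now scaled by the filter gain and the round_0 shift),
   * shift by round_1, and accumulate into dst, which plays the role of the
   * 32-bit CONV_BUF_TYPE buffer. */
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      int32_t sum = -(1 << (bd + 2 * FILTER_BITS - round_0 - 1));
      for (int k = 0; k < taps; ++k)
        sum += y_filter[k] * im_block[(i + k) * w + j];
      dst[i * dst_stride + j] += (sum + ((1 << round_1) >> 1)) >> round_1;
    }
  }
}

In the real kernels the intermediate rows are kept as packed 16-bit values and eight outputs are produced per iteration with _mm_madd_epi16 on even/odd interleaved pixels, but the per-pixel arithmetic is the same as above.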
diff --git a/third_party/aom/av1/common/x86/av1_convolve_ssse3.c b/third_party/aom/av1/common/x86/av1_convolve_ssse3.c index 5e627ebcf..e85c15eaf 100644 --- a/third_party/aom/av1/common/x86/av1_convolve_ssse3.c +++ b/third_party/aom/av1/common/x86/av1_convolve_ssse3.c @@ -676,11 +676,12 @@ void av1_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, __m128i verf[6]; __m128i horf[2]; SubpelFilterCoeffs hCoeffs, vCoeffs; + assert(conv_params->do_average == 0 || conv_params->do_average == 1); const uint8_t *src_ptr; - store_pixel_t store2p = store2pixelTab[conv_params->ref]; - store_pixel_t store4p = store4pixelTab[conv_params->ref]; - transpose_to_dst_t transpose_4x4 = trans4x4Tab[conv_params->ref]; - transpose_to_dst_t transpose_8x8 = trans8x8Tab[conv_params->ref]; + store_pixel_t store2p = store2pixelTab[conv_params->do_average]; + store_pixel_t store4p = store4pixelTab[conv_params->do_average]; + transpose_to_dst_t transpose_4x4 = trans4x4Tab[conv_params->do_average]; + transpose_to_dst_t transpose_8x8 = trans8x8Tab[conv_params->do_average]; const int tapsNum = filter_params.taps; int block_height, block_residu; @@ -890,10 +891,11 @@ void av1_convolve_vert_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, __m128i verf[6]; SubpelFilterCoeffs vCoeffs; const uint8_t *src_ptr; + assert(conv_params->do_average == 0 || conv_params->do_average == 1); uint8_t *dst_ptr = dst; - store_pixel_t store2p = store2pixelTab[conv_params->ref]; - store_pixel_t store4p = store4pixelTab[conv_params->ref]; - store_pixel_t store8p = store8pixelTab[conv_params->ref]; + store_pixel_t store2p = store2pixelTab[conv_params->do_average]; + store_pixel_t store4p = store4pixelTab[conv_params->do_average]; + store_pixel_t store8p = store8pixelTab[conv_params->do_average]; const int tapsNum = filter_params.taps; if (0 == subpel_y_q4 || 16 != y_step_q4) { diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c index 1d7c55349..f7824b627 100644 --- a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c +++ b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c @@ -40,7 +40,12 @@ static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, const int stride, const TXFM_2D_FLIP_CFG *cfg, int32_t *txfm_buf) { - // TODO(sarahparker) must correct for rectangular transforms in follow up + // TODO(sarahparker) This does not currently support rectangular transforms + // and will break without splitting txfm_size out into row and col size. + // Rectangular transforms use c code only, so it should be ok for now. + // It will be corrected when there are sse implementations for rectangular + // transforms. 
+ assert(cfg->row_cfg->txfm_size == cfg->col_cfg->txfm_size); const int txfm_size = cfg->row_cfg->txfm_size; const int8_t *shift = cfg->row_cfg->shift; const int8_t *stage_range_col = cfg->col_cfg->stage_range; diff --git a/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h index af7afb7ee..fd0a6ed2c 100644 --- a/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h +++ b/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h @@ -64,7 +64,7 @@ static INLINE void transpose_32_4x4(int stride, const __m128i *input, // the entire input block can be represent by a grid of 4x4 blocks // each 4x4 blocks can be represent by 4 vertical __m128i // we first transpose each 4x4 block internally -// than transpose the grid +// then transpose the grid static INLINE void transpose_32(int txfm_size, const __m128i *input, __m128i *output) { const int num_per_128 = 4; diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c new file mode 100644 index 000000000..46c2674ca --- /dev/null +++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "av1/common/convolve.h" + +#if CONFIG_COMPOUND_ROUND +void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, + CONV_BUF_TYPE *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + DECLARE_ALIGNED(16, uint8_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const __m128i zero = _mm_setzero_si128(); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = + _mm_set1_epi32((1 << conv_params->round_0) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < 
w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + // Filter even-index pixels + const __m128i src_0 = _mm_unpacklo_epi8(data, zero); + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + res = _mm_packus_epi16(res, res); + _mm_storel_epi64((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = + _mm_set1_epi32((1 << conv_params->round_1) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const uint8_t *data = &im_block[i * im_stride + j]; + const __m128i src_01 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 0 * im_stride)), + _mm_loadl_epi64((__m128i *)(data + 1 * im_stride))); + const __m128i src_23 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 2 * im_stride)), + _mm_loadl_epi64((__m128i *)(data + 3 * im_stride))); + const __m128i src_45 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 4 * im_stride)), + _mm_loadl_epi64((__m128i *)(data + 5 * im_stride))); + const __m128i src_67 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 6 * im_stride)), + _mm_loadl_epi64((__m128i *)(data + 7 * im_stride))); + + const __m128i src_0 = _mm_unpacklo_epi8(src_01, zero); + const __m128i src_2 = _mm_unpacklo_epi8(src_23, zero); + const 
__m128i src_4 = _mm_unpacklo_epi8(src_45, zero); + const __m128i src_6 = _mm_unpacklo_epi8(src_67, zero); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpackhi_epi8(src_01, zero); + const __m128i src_3 = _mm_unpackhi_epi8(src_23, zero); + const __m128i src_5 = _mm_unpackhi_epi8(src_45, zero); + const __m128i src_7 = _mm_unpackhi_epi8(src_67, zero); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + // Accumulate values into the destination buffer + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + _mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round)); + _mm_storeu_si128(p + 1, + _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); + } + } + } +} +#else +void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, + CONV_BUF_TYPE *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + const int bd = 8; + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const __m128i zero = _mm_setzero_si128(); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + // Filter even-index pixels + const __m128i src_0 = 
_mm_unpacklo_epi8(data, zero); + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i 
src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + // Accumulate values into the destination buffer + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + _mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round)); + _mm_storeu_si128(p + 1, + _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); + } + } + } +} +#endif diff --git a/third_party/aom/av1/common/x86/convolve_avx2.c b/third_party/aom/av1/common/x86/convolve_avx2.c new file mode 100644 index 000000000..a0e58716d --- /dev/null +++ b/third_party/aom/av1/common/x86/convolve_avx2.c @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "aom_dsp/aom_dsp_common.h" +#include "./av1_rtcd.h" + +#if CONFIG_CONVOLVE_ROUND +static const uint32_t sindex[8] = { 0, 4, 1, 5, 2, 6, 3, 7 }; + +// 16 epi16 pixels +static INLINE void pixel_clamp_avx2(__m256i *u, int bd) { + const __m256i one = _mm256_set1_epi16(1); + const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one); + __m256i clamped, mask; + + mask = _mm256_cmpgt_epi16(*u, max); + clamped = _mm256_andnot_si256(mask, *u); + mask = _mm256_and_si256(mask, max); + clamped = _mm256_or_si256(mask, clamped); + + const __m256i zero = _mm256_setzero_si256(); + mask = _mm256_cmpgt_epi16(clamped, zero); + *u = _mm256_and_si256(clamped, mask); +} + +// 8 epi16 pixels +static INLINE void pixel_clamp_sse2(__m128i *u, int bd) { + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + __m128i clamped, mask; + + mask = _mm_cmpgt_epi16(*u, max); + clamped = _mm_andnot_si128(mask, *u); + mask = _mm_and_si128(mask, max); + clamped = _mm_or_si128(mask, clamped); + + const __m128i zero = _mm_setzero_si128(); + mask = _mm_cmpgt_epi16(clamped, zero); + *u = _mm_and_si128(clamped, mask); +} + +// Work on multiple of 32 pixels +static INLINE void cal_rounding_32xn_avx2(const int32_t *src, uint8_t *dst, + const __m256i *rnd, int shift, + int num) { + do { + __m256i x0 = _mm256_loadu_si256((const __m256i *)src); + __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1); + __m256i x2 = _mm256_loadu_si256((const __m256i *)src + 2); + __m256i x3 = _mm256_loadu_si256((const __m256i *)src + 3); + + x0 = _mm256_add_epi32(x0, *rnd); + x1 = _mm256_add_epi32(x1, *rnd); + x2 = _mm256_add_epi32(x2, *rnd); + x3 = _mm256_add_epi32(x3, *rnd); + + x0 = _mm256_srai_epi32(x0, shift); + x1 = _mm256_srai_epi32(x1, shift); + x2 = _mm256_srai_epi32(x2, shift); + x3 = _mm256_srai_epi32(x3, shift); + + x0 = _mm256_packs_epi32(x0, x1); + x2 = _mm256_packs_epi32(x2, x3); + + pixel_clamp_avx2(&x0, 8); + pixel_clamp_avx2(&x2, 8); + + x0 = _mm256_packus_epi16(x0, x2); + x1 = _mm256_loadu_si256((const __m256i *)sindex); + x2 = _mm256_permutevar8x32_epi32(x0, x1); + + _mm256_storeu_si256((__m256i *)dst, x2); + src += 32; + dst += 32; + num--; + } while (num > 0); +} + +static INLINE void cal_rounding_16_avx2(const int32_t *src, uint8_t *dst, + const __m256i *rnd, int shift) { + __m256i x0 = _mm256_loadu_si256((const __m256i *)src); + __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1); + + x0 = _mm256_add_epi32(x0, *rnd); + x1 = _mm256_add_epi32(x1, *rnd); + + x0 = _mm256_srai_epi32(x0, shift); + x1 = _mm256_srai_epi32(x1, shift); + + x0 = _mm256_packs_epi32(x0, x1); + pixel_clamp_avx2(&x0, 8); + + const __m256i x2 = _mm256_packus_epi16(x0, x0); + x1 = _mm256_loadu_si256((const __m256i *)sindex); + x0 = _mm256_permutevar8x32_epi32(x2, x1); + + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(x0)); +} + +static INLINE void cal_rounding_8_avx2(const int32_t *src, uint8_t *dst, + const __m256i *rnd, int shift) { + __m256i x0 = _mm256_loadu_si256((const __m256i *)src); + x0 = _mm256_add_epi32(x0, *rnd); + x0 = _mm256_srai_epi32(x0, shift); + + x0 = _mm256_packs_epi32(x0, x0); + pixel_clamp_avx2(&x0, 8); + + x0 = _mm256_packus_epi16(x0, x0); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)sindex); + x0 = _mm256_permutevar8x32_epi32(x0, x1); + + _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(x0)); +} + +static INLINE void cal_rounding_4_sse2(const int32_t *src, uint8_t *dst, + const __m128i *rnd, int 
shift) { + __m128i x = _mm_loadu_si128((const __m128i *)src); + x = _mm_add_epi32(x, *rnd); + x = _mm_srai_epi32(x, shift); + + x = _mm_packs_epi32(x, x); + pixel_clamp_sse2(&x, 8); + + x = _mm_packus_epi16(x, x); + *(uint32_t *)dst = _mm_cvtsi128_si32(x); +} + +void av1_convolve_rounding_avx2(const int32_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + int bits) { + const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1))); + const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num); + + if (w > 64) { // width = 128 + do { + cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 4); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 32) { // width = 64 + do { + cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 2); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 16) { // width = 32 + do { + cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 1); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 8) { // width = 16 + do { + cal_rounding_16_avx2(src, dst, &rnd_num, bits); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 4) { // width = 8 + do { + cal_rounding_8_avx2(src, dst, &rnd_num, bits); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 2) { // width = 4 + do { + cal_rounding_4_sse2(src, dst, &rnd_num_sse2, bits); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else { // width = 2 + do { + dst[0] = clip_pixel(ROUND_POWER_OF_TWO(src[0], bits)); + dst[1] = clip_pixel(ROUND_POWER_OF_TWO(src[1], bits)); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } +} + +#if CONFIG_HIGHBITDEPTH +static INLINE void cal_highbd_rounding_32xn_avx2(const int32_t *src, + uint16_t *dst, + const __m256i *rnd, int shift, + int num, int bd) { + do { + __m256i x0 = _mm256_loadu_si256((const __m256i *)src); + __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1); + __m256i x2 = _mm256_loadu_si256((const __m256i *)src + 2); + __m256i x3 = _mm256_loadu_si256((const __m256i *)src + 3); + + x0 = _mm256_add_epi32(x0, *rnd); + x1 = _mm256_add_epi32(x1, *rnd); + x2 = _mm256_add_epi32(x2, *rnd); + x3 = _mm256_add_epi32(x3, *rnd); + + x0 = _mm256_srai_epi32(x0, shift); + x1 = _mm256_srai_epi32(x1, shift); + x2 = _mm256_srai_epi32(x2, shift); + x3 = _mm256_srai_epi32(x3, shift); + + x0 = _mm256_packs_epi32(x0, x1); + x2 = _mm256_packs_epi32(x2, x3); + + pixel_clamp_avx2(&x0, bd); + pixel_clamp_avx2(&x2, bd); + + x0 = _mm256_permute4x64_epi64(x0, 0xD8); + x2 = _mm256_permute4x64_epi64(x2, 0xD8); + + _mm256_storeu_si256((__m256i *)dst, x0); + _mm256_storeu_si256((__m256i *)(dst + 16), x2); + src += 32; + dst += 32; + num--; + } while (num > 0); +} + +static INLINE void cal_highbd_rounding_16_avx2(const int32_t *src, + uint16_t *dst, + const __m256i *rnd, int shift, + int bd) { + __m256i x0 = _mm256_loadu_si256((const __m256i *)src); + __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1); + + x0 = _mm256_add_epi32(x0, *rnd); + x1 = _mm256_add_epi32(x1, *rnd); + + x0 = _mm256_srai_epi32(x0, shift); + x1 = _mm256_srai_epi32(x1, shift); + + x0 = _mm256_packs_epi32(x0, x1); + pixel_clamp_avx2(&x0, bd); + + x0 = _mm256_permute4x64_epi64(x0, 0xD8); + _mm256_storeu_si256((__m256i *)dst, x0); +} + +static INLINE void cal_highbd_rounding_8_avx2(const int32_t *src, uint16_t *dst, + const __m256i *rnd, int shift, + int bd) { + __m256i x = _mm256_loadu_si256((const __m256i *)src); + x 
= _mm256_add_epi32(x, *rnd); + x = _mm256_srai_epi32(x, shift); + + x = _mm256_packs_epi32(x, x); + pixel_clamp_avx2(&x, bd); + + x = _mm256_permute4x64_epi64(x, 0xD8); + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(x)); +} + +static INLINE void cal_highbd_rounding_4_sse2(const int32_t *src, uint16_t *dst, + const __m128i *rnd, int shift, + int bd) { + __m128i x = _mm_loadu_si128((const __m128i *)src); + x = _mm_add_epi32(x, *rnd); + x = _mm_srai_epi32(x, shift); + + x = _mm_packs_epi32(x, x); + pixel_clamp_sse2(&x, bd); + _mm_storel_epi64((__m128i *)dst, x); +} + +void av1_highbd_convolve_rounding_avx2(const int32_t *src, int src_stride, + uint8_t *dst8, int dst_stride, int w, + int h, int bits, int bd) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1))); + const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num); + + if (w > 64) { // width = 128 + do { + cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 4, bd); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 32) { // width = 64 + do { + cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 2, bd); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 16) { // width = 32 + do { + cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 1, bd); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 8) { // width = 16 + do { + cal_highbd_rounding_16_avx2(src, dst, &rnd_num, bits, bd); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 4) { // width = 8 + do { + cal_highbd_rounding_8_avx2(src, dst, &rnd_num, bits, bd); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 2) { // width = 4 + do { + cal_highbd_rounding_4_sse2(src, dst, &rnd_num_sse2, bits, bd); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else { // width = 2 + do { + dst[0] = clip_pixel_highbd(ROUND_POWER_OF_TWO(src[0], bits), bd); + dst[1] = clip_pixel_highbd(ROUND_POWER_OF_TWO(src[1], bits), bd); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } +} +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_CONVOLVE_ROUND diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c new file mode 100644 index 000000000..ff4a0a0fe --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c @@ -0,0 +1,372 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tmmintrin.h> +#include <assert.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "av1/common/convolve.h" + +#if CONFIG_COMPOUND_ROUND +void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride, + CONV_BUF_TYPE *dst, int dst_stride, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = + _mm_set1_epi32((1 << conv_params->round_0) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i data2 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(data, coeff_01); + const __m128i res_2 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); + const __m128i res_4 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); + const __m128i res_6 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i res_1 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); + const __m128i res_3 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); + const __m128i res_5 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); + const __m128i res_7 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + const __m128i maxval = _mm_set1_epi16((1 << bd) - 1); + __m128i res = _mm_packs_epi32(res_even, res_odd); + res = _mm_max_epi16(_mm_min_epi16(res, maxval), _mm_setzero_si128()); + _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, 
subpel_y_q4 & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = + _mm_set1_epi32((1 << conv_params->round_1) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + // Accumulate values into the destination buffer + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + _mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round)); + _mm_storeu_si128(p + 1, + _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); + } + } + } +} +#else +void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride, + CONV_BUF_TYPE *dst, int dst_stride, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 15-bit intermediate array. + assert(conv_params->round_0 >= 5); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i data2 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(data, coeff_01); + const __m128i res_2 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); + const __m128i res_4 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); + const __m128i res_6 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i res_1 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); + const __m128i res_3 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); + const __m128i res_5 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); + const __m128i res_7 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); + + __m128i res_odd = 
_mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + // Accumulate values into the destination buffer + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + _mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round)); + _mm_storeu_si128(p + 1, + _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); + } + } + } +} +#endif diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c index 37e2f61e7..35d637f72 100644 --- a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c +++ b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c @@ -19,8 +19,9 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, - int comp_avg, int16_t alpha, int16_t beta, - int16_t gamma, int16_t delta) { + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + int comp_avg = conv_params->do_average; #if HORSHEAR_REDUCE_PREC_BITS >= 5 __m128i tmp[15]; #else diff --git a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c index c69614e42..0648b95b3 100644 --- a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c +++ b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c @@ -364,8 +364,9 @@ static void iidtx16(__m256i *in) { #endif void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, - int stride, int tx_type) { + int stride, const TxfmParam *txfm_param) { __m256i in[16]; + int tx_type = txfm_param->tx_type; load_buffer_16x16(input, in); switch (tx_type) { diff --git a/third_party/aom/av1/common/x86/idct_intrin_sse2.c b/third_party/aom/av1/common/x86/idct_intrin_sse2.c index d6a598746..bf12a26d3 100644 --- a/third_party/aom/av1/common/x86/idct_intrin_sse2.c +++ b/third_party/aom/av1/common/x86/idct_intrin_sse2.c @@ -59,10 +59,11 @@ static INLINE void fliplr_16x8(__m128i *in /*in[16]*/) { #endif void av1_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, - int tx_type) { + const TxfmParam *txfm_param) { __m128i in[2]; const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); + int tx_type = txfm_param->tx_type; in[0] = load_input_data(input); in[1] = load_input_data(input + 8); @@ -150,10 +151,11 @@ void av1_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, } void av1_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, - int tx_type) { + const TxfmParam *txfm_param) { __m128i in[8]; const __m128i zero = _mm_setzero_si128(); const __m128i final_rounding = _mm_set1_epi16(1 << 4); + int tx_type = txfm_param->tx_type; // load input data in[0] = load_input_data(input); @@ -251,10 +253,11 @@ static void iidtx16_sse2(__m128i *in0, __m128i *in1) { #endif // CONFIG_EXT_TX void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride, int tx_type) { + int stride, const TxfmParam *txfm_param) { __m128i in[32]; __m128i *in0 = &in[0]; __m128i *in1 = &in[16]; + int tx_type = txfm_param->tx_type; load_buffer_8x16(input, in0); input += 8; @@ -388,8 +391,9 @@ static INLINE void flip_buffer_lr_8x8(__m128i *in) { 
#endif // CONFIG_EXT_TX void av1_iht8x16_128_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride, int tx_type) { + int stride, const TxfmParam *txfm_param) { __m128i in[16]; + int tx_type = txfm_param->tx_type; in[0] = load_input_data(input + 0 * 8); in[1] = load_input_data(input + 1 * 8); @@ -553,8 +557,9 @@ static INLINE void write_buffer_8x8_round6(uint8_t *dest, __m128i *in, } void av1_iht16x8_128_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride, int tx_type) { + int stride, const TxfmParam *txfm_param) { __m128i in[16]; + int tx_type = txfm_param->tx_type; // Transpose 16x8 input into in[] in[0] = load_input_data(input + 0 * 16); @@ -713,8 +718,9 @@ static INLINE void write_buffer_8x4_round5(uint8_t *dest, __m128i *in, } void av1_iht8x4_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, - int tx_type) { + const TxfmParam *txfm_param) { __m128i in[8]; + int tx_type = txfm_param->tx_type; in[0] = load_input_data(input + 0 * 8); in[1] = load_input_data(input + 1 * 8); @@ -897,8 +903,9 @@ static INLINE void write_buffer_4x8_round5(uint8_t *dest, __m128i *in, } void av1_iht4x8_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, - int tx_type) { + const TxfmParam *txfm_param) { __m128i in[8]; + int tx_type = txfm_param->tx_type; // Load rows, packed two per element of 'in'. // We pack into the bottom half of 'in' so that the @@ -1119,8 +1126,9 @@ static INLINE void write_buffer_16x32_round6(uint8_t *dest, __m128i *intl, } void av1_iht16x32_512_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride, int tx_type) { + int stride, const TxfmParam *txfm_param) { __m128i intl[16], intr[16], inbl[16], inbr[16]; + int tx_type = txfm_param->tx_type; int i; for (i = 0; i < 16; ++i) { @@ -1272,8 +1280,9 @@ static INLINE void write_buffer_32x16_round6(uint8_t *dest, __m128i *in0, } void av1_iht32x16_512_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride, int tx_type) { + int stride, const TxfmParam *txfm_param) { __m128i in0[16], in1[16], in2[16], in3[16]; + int tx_type = txfm_param->tx_type; int i; for (i = 0; i < 16; ++i) { diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c index 260faa8c9..e2e4f51c3 100644 --- a/third_party/aom/av1/common/x86/selfguided_sse4.c +++ b/third_party/aom/av1/common/x86/selfguided_sse4.c @@ -16,8 +16,8 @@ static void calc_block(__m128i sum, __m128i sum_sq, __m128i n, if (bit_depth > 8) { __m128i rounding_a = _mm_set1_epi32((1 << (2 * (bit_depth - 8))) >> 1); __m128i rounding_b = _mm_set1_epi32((1 << (bit_depth - 8)) >> 1); - __m128i shift_a = _mm_set_epi64x(0, 2 * (bit_depth - 8)); - __m128i shift_b = _mm_set_epi64x(0, bit_depth - 8); + __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8)); + __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8); a = _mm_srl_epi32(_mm_add_epi32(sum_sq, rounding_a), shift_a); b = _mm_srl_epi32(_mm_add_epi32(sum, rounding_b), shift_b); a = _mm_mullo_epi32(a, n); diff --git a/third_party/aom/av1/common/x86/warp_plane_sse2.c b/third_party/aom/av1/common/x86/warp_plane_sse2.c index cdc4e8d0f..5a22d9abf 100644 --- a/third_party/aom/av1/common/x86/warp_plane_sse2.c +++ b/third_party/aom/av1/common/x86/warp_plane_sse2.c @@ -17,9 +17,10 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, - int subsampling_x, int subsampling_y, int comp_avg, - int16_t alpha, int16_t beta, int16_t gamma, - int16_t 
delta) { + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + int comp_avg = conv_params->do_average; __m128i tmp[15]; int i, j, k; const int bd = 8; diff --git a/third_party/aom/av1/common/x86/warp_plane_ssse3.c b/third_party/aom/av1/common/x86/warp_plane_ssse3.c index 494410e99..f8e6f62ba 100644 --- a/third_party/aom/av1/common/x86/warp_plane_ssse3.c +++ b/third_party/aom/av1/common/x86/warp_plane_ssse3.c @@ -204,9 +204,10 @@ static const uint8_t odd_mask[16] = { 1, 3, 3, 5, 5, 7, 7, 9, void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, - int subsampling_x, int subsampling_y, int comp_avg, - int16_t alpha, int16_t beta, int16_t gamma, - int16_t delta) { + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + int comp_avg = conv_params->do_average; __m128i tmp[15]; int i, j, k; const int bd = 8; |
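The new convolve kernels above all share one idiom worth calling out: the eight 16-bit filter taps are loaded once and expanded into four registers that each repeat a pair of adjacent taps (the "coeffs 0 1 0 1 0 1 0 1" comments), so that _mm_madd_epi16 can multiply interleaved pixel pairs by (f_k, f_k+1) and add them in a single step. The standalone program below reproduces only that shuffle sequence, with made-up tap values, so the lane layout can be checked in isolation; it is an illustration, not code from the patch.

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Eight 16-bit taps f0..f7 (arbitrary values, not a real AV1 filter). */
  const int16_t taps[8] = { -1, 3, -11, 73, 73, -11, 3, -1 };
  const __m128i coeffs = _mm_loadu_si128((const __m128i *)taps);

  /* Same sequence as the convolve kernels:
   * tmp_0 = f0 f1 f0 f1 f2 f3 f2 f3,  tmp_1 = f4 f5 f4 f5 f6 f7 f6 f7 */
  const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs, coeffs);
  const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs, coeffs);

  /* coeff_01 = f0 f1 repeated four times; likewise for the other pairs. */
  const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
  const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
  const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
  const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
  (void)coeff_23;
  (void)coeff_45;
  (void)coeff_67;

  /* Print coeff_01: expected output is "-1 3 -1 3 -1 3 -1 3". */
  int16_t lanes[8];
  _mm_storeu_si128((__m128i *)lanes, coeff_01);
  for (int k = 0; k < 8; ++k) printf("%d ", lanes[k]);
  printf("\n");
  return 0;
}

In the kernels, each such register is fed to _mm_madd_epi16 together with pixels interleaved as (p_k, p_k+1); that is why the horizontal pass shifts the 8-bit source by 1..7 bytes with _mm_srli_si128 (or uses _mm_alignr_epi8 for the 16-bit high-bit-depth input) before unpacking.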