Diffstat (limited to 'third_party/aom/av1/common/x86')
-rw-r--r-- | third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c | 645
-rw-r--r-- | third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c | 10
-rw-r--r-- | third_party/aom/av1/common/x86/convolve_2d_sse2.c | 26
-rw-r--r-- | third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c | 26
-rw-r--r-- | third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c | 2
-rw-r--r-- | third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c | 6
-rw-r--r-- | third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c | 152
-rw-r--r-- | third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c | 2
-rw-r--r-- | third_party/aom/av1/common/x86/idct_intrin_sse2.c | 18
-rw-r--r-- | third_party/aom/av1/common/x86/intra_edge_sse4.c | 318
-rw-r--r-- | third_party/aom/av1/common/x86/selfguided_sse4.c | 176
-rw-r--r-- | third_party/aom/av1/common/x86/warp_plane_sse2.c | 146
-rw-r--r-- | third_party/aom/av1/common/x86/warp_plane_ssse3.c | 140
13 files changed, 1371 insertions, 296 deletions
diff --git a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c new file mode 100644 index 000000000..1f0fedb2a --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c @@ -0,0 +1,645 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <smmintrin.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "av1/common/convolve.h" + +// Make a mask for coefficients of 10/12 tap filters. The coefficients are +// packed "89ab89ab". If it's a 12-tap filter, we want all 1's; if it's a +// 10-tap filter, we want "11001100" to just match the 8,9 terms. +static __m128i make_1012_mask(int ntaps) { + uint32_t low = 0xffffffff; + uint32_t high = (ntaps == 12) ? low : 0; + return _mm_set_epi32(high, low, high, low); +} + +// Zero-extend the given input operand to an entire __m128i register. +// +// Note that there's almost an intrinsic to do this but 32-bit Visual Studio +// doesn't have _mm_set_epi64x so we have to do it by hand. +static __m128i extend_32_to_128(uint32_t x) { + return _mm_set_epi32(0, 0, 0, x); +} + +// Load an SSE register from p and bitwise AND with a. +static __m128i load_and_128i(const void *p, __m128i a) { + const __m128d ad = _mm_castsi128_pd(a); + const __m128d bd = _mm_load1_pd((const double *)p); + return _mm_castpd_si128(_mm_and_pd(ad, bd)); +} + +// The horizontal filter for av1_convolve_2d_scale_sse4_1. This is the more +// general version, supporting 10 and 12 tap filters. For 8-tap filters, use +// hfilter8. +static void hfilter(const uint8_t *src, int src_stride, int32_t *dst, int w, + int h, int subpel_x_qn, int x_step_qn, + const InterpFilterParams *filter_params, unsigned round) { + const int bd = 8; + const int ntaps = filter_params->taps; + assert(ntaps == 10 || ntaps == 12); + + src -= ntaps / 2 - 1; + + // Construct a mask with which we'll AND filter coefficients 89ab89ab to zero + // out the unneeded entries. + const __m128i hicoeff_mask = make_1012_mask(ntaps); + + int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1)); + const __m128i round_add = _mm_set1_epi32(round_add32); + const __m128i round_shift = extend_32_to_128(round); + + int x_qn = subpel_x_qn; + for (int x = 0; x < w; ++x, x_qn += x_step_qn) { + const uint8_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS); + const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(filter_idx < SUBPEL_SHIFTS); + const int16_t *filter = + av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx); + + // The "lo" coefficients are coefficients 0..7. For a 12-tap filter, the + // "hi" coefficients are arranged as 89ab89ab. For a 10-tap filter, they + // are masked out with hicoeff_mask. 
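/*
 * Editor's sketch (not part of the patch): a standalone illustration of what
 * make_1012_mask() produces. The "hi" coefficients are packed 89ab89ab as
 * 16-bit words, so each 32-bit lane of the mask covers one (8,9) or (a,b)
 * pair. For a 12-tap filter every lane is all-ones; for a 10-tap filter the
 * (a,b) lanes are zeroed, so _mm_madd_epi16 only accumulates taps 8 and 9.
 */
#include <stdint.h>
#include <stdio.h>
#include <smmintrin.h>

static __m128i sketch_make_1012_mask(int ntaps) {
  const uint32_t low = 0xffffffff;
  const uint32_t high = (ntaps == 12) ? low : 0;
  return _mm_set_epi32(high, low, high, low);
}

int main(void) {
  uint32_t m[4];
  _mm_storeu_si128((__m128i *)m, sketch_make_1012_mask(10));
  /* Prints "ffffffff 00000000 ffffffff 00000000": the (a,b) lanes are gone. */
  printf("%08x %08x %08x %08x\n", m[0], m[1], m[2], m[3]);
  return 0;
}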
+ const __m128i coefflo = _mm_loadu_si128((__m128i *)filter); + const __m128i coeffhi = load_and_128i(filter + 8, hicoeff_mask); + const __m128i zero = _mm_castps_si128(_mm_setzero_ps()); + + int y; + for (y = 0; y <= h - 4; y += 4) { + const uint8_t *const src0 = src_col + y * src_stride; + const uint8_t *const src1 = src0 + 1 * src_stride; + const uint8_t *const src2 = src0 + 2 * src_stride; + const uint8_t *const src3 = src0 + 3 * src_stride; + + // Load up source data. This is 8-bit input data, so each load gets 16 + // pixels (we need at most 12) + const __m128i data08 = _mm_loadu_si128((__m128i *)src0); + const __m128i data18 = _mm_loadu_si128((__m128i *)src1); + const __m128i data28 = _mm_loadu_si128((__m128i *)src2); + const __m128i data38 = _mm_loadu_si128((__m128i *)src3); + + // Now zero-extend up to 16-bit precision by interleaving with zeros. For + // the "high" pixels (8 to 11), interleave first (so that the expansion + // to 16-bits operates on an entire register). + const __m128i data0lo = _mm_unpacklo_epi8(data08, zero); + const __m128i data1lo = _mm_unpacklo_epi8(data18, zero); + const __m128i data2lo = _mm_unpacklo_epi8(data28, zero); + const __m128i data3lo = _mm_unpacklo_epi8(data38, zero); + const __m128i data01hi8 = _mm_unpackhi_epi32(data08, data18); + const __m128i data23hi8 = _mm_unpackhi_epi32(data28, data38); + const __m128i data01hi = _mm_unpacklo_epi8(data01hi8, zero); + const __m128i data23hi = _mm_unpacklo_epi8(data23hi8, zero); + + // Multiply by coefficients + const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo); + const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo); + const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo); + const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo); + const __m128i conv01hi = _mm_madd_epi16(data01hi, coeffhi); + const __m128i conv23hi = _mm_madd_epi16(data23hi, coeffhi); + + // Reduce horizontally and add + const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo); + const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo); + const __m128i convlo = _mm_hadd_epi32(conv01lo, conv23lo); + const __m128i convhi = _mm_hadd_epi32(conv01hi, conv23hi); + const __m128i conv = _mm_add_epi32(convlo, convhi); + + // Divide down by (1 << round), rounding to nearest. + const __m128i shifted = + _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); + + // Write transposed to the output + _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted); + } + for (; y < h; ++y) { + const uint8_t *const src_row = src_col + y * src_stride; + + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < ntaps; ++k) { + sum += filter[k] * src_row[k]; + } + + dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round); + } + } +} + +// A specialised version of hfilter, the horizontal filter for +// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters. 
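/*
 * Editor's sketch (not part of the patch): the core idiom of the horizontal
 * filters above. 8-bit pixels are widened to 16 bits by interleaving with
 * zero, and _mm_madd_epi16 then multiplies each pixel by its 16-bit tap and
 * adds adjacent products, yielding four 32-bit partial sums of two taps each.
 */
#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>

int main(void) {
  const uint8_t pixels[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  const int16_t taps[8] = { -1, 2, -3, 4, -5, 6, -7, 8 };
  const __m128i zero = _mm_setzero_si128();
  const __m128i px8 = _mm_loadl_epi64((const __m128i *)pixels);
  const __m128i px16 = _mm_unpacklo_epi8(px8, zero);  // 1..8 as int16
  const __m128i coef = _mm_loadu_si128((const __m128i *)taps);
  const __m128i sums = _mm_madd_epi16(px16, coef);    // pairwise dot products
  int32_t out[4];
  _mm_storeu_si128((__m128i *)out, sums);
  // out[0] = 1*-1 + 2*2 = 3, out[1] = 3*-3 + 4*4 = 7, and so on.
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
  return 0;
}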
+static void hfilter8(const uint8_t *src, int src_stride, int32_t *dst, int w, + int h, int subpel_x_qn, int x_step_qn, + const InterpFilterParams *filter_params, unsigned round) { + const int bd = 8; + const int ntaps = 8; + + src -= ntaps / 2 - 1; + + int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1)); + const __m128i round_add = _mm_set1_epi32(round_add32); + const __m128i round_shift = extend_32_to_128(round); + + int x_qn = subpel_x_qn; + for (int x = 0; x < w; ++x, x_qn += x_step_qn) { + const uint8_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS); + const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(filter_idx < SUBPEL_SHIFTS); + const int16_t *filter = + av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx); + + // Load the filter coefficients + const __m128i coefflo = _mm_loadu_si128((__m128i *)filter); + const __m128i zero = _mm_castps_si128(_mm_setzero_ps()); + + int y; + for (y = 0; y <= h - 4; y += 4) { + const uint8_t *const src0 = src_col + y * src_stride; + const uint8_t *const src1 = src0 + 1 * src_stride; + const uint8_t *const src2 = src0 + 2 * src_stride; + const uint8_t *const src3 = src0 + 3 * src_stride; + + // Load up source data. This is 8-bit input data; each load is just + // loading the lower half of the register and gets 8 pixels + const __m128i data08 = _mm_loadl_epi64((__m128i *)src0); + const __m128i data18 = _mm_loadl_epi64((__m128i *)src1); + const __m128i data28 = _mm_loadl_epi64((__m128i *)src2); + const __m128i data38 = _mm_loadl_epi64((__m128i *)src3); + + // Now zero-extend up to 16-bit precision by interleaving with + // zeros. Drop the upper half of each register (which just had zeros) + const __m128i data0lo = _mm_unpacklo_epi8(data08, zero); + const __m128i data1lo = _mm_unpacklo_epi8(data18, zero); + const __m128i data2lo = _mm_unpacklo_epi8(data28, zero); + const __m128i data3lo = _mm_unpacklo_epi8(data38, zero); + + // Multiply by coefficients + const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo); + const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo); + const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo); + const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo); + + // Reduce horizontally and add + const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo); + const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo); + const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo); + + // Divide down by (1 << round), rounding to nearest. + const __m128i shifted = + _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); + + // Write transposed to the output + _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted); + } + for (; y < h; ++y) { + const uint8_t *const src_row = src_col + y * src_stride; + + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < ntaps; ++k) { + sum += filter[k] * src_row[k]; + } + + dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round); + } + } +} + +// Do a 12-tap convolution with the given coefficients, loading data from src. 
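/*
 * Editor's sketch (not part of the patch): why the SIMD rounding in the
 * horizontal filters above matches their scalar fallback loops. The vector
 * path computes (conv + round_add32) >> round with
 * round_add32 = (1 << round) / 2 + offset, while the scalar path folds the
 * offset into the sum first and then applies ROUND_POWER_OF_TWO(sum, round);
 * for round >= 1 the two are identical. FILTER_BITS == 7 is assumed here.
 */
#include <assert.h>
#include <stdint.h>

#define SKETCH_ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

static void check_rounding(int32_t conv, unsigned round, int32_t offset) {
  const int32_t round_add32 = (1 << round) / 2 + offset;
  const int32_t simd_style = (conv + round_add32) >> round;
  const int32_t scalar_style = SKETCH_ROUND_POWER_OF_TWO(offset + conv, round);
  assert(simd_style == scalar_style);
}

int main(void) {
  const int bd = 8, filter_bits = 7;  // assumed value of FILTER_BITS
  const int32_t offset = 1 << (bd + filter_bits - 1);
  for (int32_t conv = -5000; conv <= 5000; conv += 37)
    check_rounding(conv, 3, offset);
  return 0;
}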
+static __m128i convolve_32(const int32_t *src, __m128i coeff03, __m128i coeff47, + __m128i coeff8d) { + const __m128i data03 = _mm_loadu_si128((__m128i *)src); + const __m128i data47 = _mm_loadu_si128((__m128i *)(src + 4)); + const __m128i data8d = _mm_loadu_si128((__m128i *)(src + 8)); + const __m128i conv03 = _mm_mullo_epi32(data03, coeff03); + const __m128i conv47 = _mm_mullo_epi32(data47, coeff47); + const __m128i conv8d = _mm_mullo_epi32(data8d, coeff8d); + return _mm_add_epi32(_mm_add_epi32(conv03, conv47), conv8d); +} + +// Do an 8-tap convolution with the given coefficients, loading data from src. +static __m128i convolve_32_8(const int32_t *src, __m128i coeff03, + __m128i coeff47) { + const __m128i data03 = _mm_loadu_si128((__m128i *)src); + const __m128i data47 = _mm_loadu_si128((__m128i *)(src + 4)); + const __m128i conv03 = _mm_mullo_epi32(data03, coeff03); + const __m128i conv47 = _mm_mullo_epi32(data47, coeff47); + return _mm_add_epi32(conv03, conv47); +} + +// The vertical filter for av1_convolve_2d_scale_sse4_1. This is the more +// general version, supporting 10 and 12 tap filters. For 8-tap filters, use +// vfilter8. +static void vfilter(const int32_t *src, int src_stride, int32_t *dst, + int dst_stride, int w, int h, int subpel_y_qn, + int y_step_qn, const InterpFilterParams *filter_params, + const ConvolveParams *conv_params, int bd) { + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int ntaps = filter_params->taps; + + // Construct a mask with which we'll AND filter coefficients 89ab to zero out + // the unneeded entries. The upper bits of this mask are unused. + const __m128i hicoeff_mask = make_1012_mask(ntaps); + + int32_t round_add32 = (1 << conv_params->round_1) / 2 + (1 << offset_bits); + const __m128i round_add = _mm_set1_epi32(round_add32); + const __m128i round_shift = extend_32_to_128(conv_params->round_1); + + const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + const __m128i sub = _mm_set1_epi32(sub32); + + int y_qn = subpel_y_qn; + for (int y = 0; y < h; ++y, y_qn += y_step_qn) { + const int32_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS); + const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(filter_idx < SUBPEL_SHIFTS); + const int16_t *filter = + av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx); + + // Load up coefficients for the filter and sign-extend to 32-bit precision + // (to do so, calculate sign bits and then interleave) + const __m128i zero = _mm_castps_si128(_mm_setzero_ps()); + const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter); + const __m128i coeffhi16 = load_and_128i(filter + 8, hicoeff_mask); + const __m128i csign0716 = _mm_cmplt_epi16(coeff0716, zero); + const __m128i csignhi16 = _mm_cmplt_epi16(coeffhi16, zero); + const __m128i coeff03 = _mm_unpacklo_epi16(coeff0716, csign0716); + const __m128i coeff47 = _mm_unpackhi_epi16(coeff0716, csign0716); + const __m128i coeff8d = _mm_unpacklo_epi16(coeffhi16, csignhi16); + + int x; + for (x = 0; x <= w - 4; x += 4) { + const int32_t *const src0 = src_y + x * src_stride; + const int32_t *const src1 = src0 + 1 * src_stride; + const int32_t *const src2 = src0 + 2 * src_stride; + const int32_t *const src3 = src0 + 3 * src_stride; + + // Load the source data for the three rows, adding the three registers of + // convolved products to one as we go (conv0..conv3) to avoid the + // register pressure getting too high. 
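/*
 * Editor's sketch (not part of the patch): the sign-extension idiom used by
 * the vertical filters. The intermediate data is 32-bit, so the 16-bit taps
 * are widened first: _mm_cmplt_epi16 against zero yields each tap's sign
 * word, and interleaving value and sign words gives the taps as int32 (the
 * low half is what _mm_cvtepi16_epi32 would produce).
 */
#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>

int main(void) {
  const int16_t taps[8] = { -1, 2, -3, 4, -5, 6, -7, 8 };
  const __m128i zero = _mm_setzero_si128();
  const __m128i c16 = _mm_loadu_si128((const __m128i *)taps);
  const __m128i sign = _mm_cmplt_epi16(c16, zero);    // 0xffff for negative taps
  const __m128i c03 = _mm_unpacklo_epi16(c16, sign);  // taps 0..3 as int32
  const __m128i c47 = _mm_unpackhi_epi16(c16, sign);  // taps 4..7 as int32
  int32_t lo[4], hi[4];
  _mm_storeu_si128((__m128i *)lo, c03);
  _mm_storeu_si128((__m128i *)hi, c47);
  printf("%d %d %d %d  %d %d %d %d\n", lo[0], lo[1], lo[2], lo[3],
         hi[0], hi[1], hi[2], hi[3]);  // -1 2 -3 4  -5 6 -7 8
  return 0;
}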
+ const __m128i conv0 = convolve_32(src0, coeff03, coeff47, coeff8d); + const __m128i conv1 = convolve_32(src1, coeff03, coeff47, coeff8d); + const __m128i conv2 = convolve_32(src2, coeff03, coeff47, coeff8d); + const __m128i conv3 = convolve_32(src3, coeff03, coeff47, coeff8d); + + // Now reduce horizontally to get one lane for each result + const __m128i conv01 = _mm_hadd_epi32(conv0, conv1); + const __m128i conv23 = _mm_hadd_epi32(conv2, conv3); + const __m128i conv = _mm_hadd_epi32(conv01, conv23); + + // Divide down by (1 << round_1), rounding to nearest and subtract sub32. + const __m128i shifted = + _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); + const __m128i subbed = _mm_sub_epi32(shifted, sub); + + int32_t *dst_x = dst + y * dst_stride + x; + const __m128i result = + (conv_params->do_average) + ? _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x)) + : subbed; + + _mm_storeu_si128((__m128i *)dst_x, result); + } + for (; x < w; ++x) { + const int32_t *src_x = src_y + x * src_stride; + CONV_BUF_TYPE sum = 1 << offset_bits; + for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k]; + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - sub32; + if (conv_params->do_average) + dst[y * dst_stride + x] += res; + else + dst[y * dst_stride + x] = res; + } + } +} + +// A specialised version of vfilter, the vertical filter for +// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters. +static void vfilter8(const int32_t *src, int src_stride, int32_t *dst, + int dst_stride, int w, int h, int subpel_y_qn, + int y_step_qn, const InterpFilterParams *filter_params, + const ConvolveParams *conv_params, int bd) { + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int ntaps = 8; + + int32_t round_add32 = (1 << conv_params->round_1) / 2 + (1 << offset_bits); + const __m128i round_add = _mm_set1_epi32(round_add32); + const __m128i round_shift = extend_32_to_128(conv_params->round_1); + + const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + const __m128i sub = _mm_set1_epi32(sub32); + + int y_qn = subpel_y_qn; + for (int y = 0; y < h; ++y, y_qn += y_step_qn) { + const int32_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS); + const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(filter_idx < SUBPEL_SHIFTS); + const int16_t *filter = + av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx); + + // Load up coefficients for the filter and sign-extend to 32-bit precision + // (to do so, calculate sign bits and then interleave) + const __m128i zero = _mm_castps_si128(_mm_setzero_ps()); + const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter); + const __m128i csign0716 = _mm_cmplt_epi16(coeff0716, zero); + const __m128i coeff03 = _mm_unpacklo_epi16(coeff0716, csign0716); + const __m128i coeff47 = _mm_unpackhi_epi16(coeff0716, csign0716); + + int x; + for (x = 0; x <= w - 4; x += 4) { + const int32_t *const src0 = src_y + x * src_stride; + const int32_t *const src1 = src0 + 1 * src_stride; + const int32_t *const src2 = src0 + 2 * src_stride; + const int32_t *const src3 = src0 + 3 * src_stride; + + // Load the source data for the three rows, adding the three registers of + // convolved products to one as we go (conv0..conv3) to avoid the + // register pressure getting too high. 
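/*
 * Editor's sketch (not part of the patch): the horizontal-reduction step used
 * throughout this file. Each of four accumulators holds four 32-bit partial
 * sums for one output; two rounds of _mm_hadd_epi32 collapse them so the
 * result register holds the total of accumulator 0..3 in lanes 0..3.
 */
#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h>

int main(void) {
  const __m128i acc0 = _mm_setr_epi32(1, 2, 3, 4);      // sums to 10
  const __m128i acc1 = _mm_setr_epi32(10, 20, 30, 40);  // sums to 100
  const __m128i acc2 = _mm_setr_epi32(5, 5, 5, 5);      // sums to 20
  const __m128i acc3 = _mm_setr_epi32(-1, 1, -1, 2);    // sums to 1
  const __m128i s01 = _mm_hadd_epi32(acc0, acc1);
  const __m128i s23 = _mm_hadd_epi32(acc2, acc3);
  const __m128i total = _mm_hadd_epi32(s01, s23);
  int32_t out[4];
  _mm_storeu_si128((__m128i *)out, total);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // 10 100 20 1
  return 0;
}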
+ const __m128i conv0 = convolve_32_8(src0, coeff03, coeff47); + const __m128i conv1 = convolve_32_8(src1, coeff03, coeff47); + const __m128i conv2 = convolve_32_8(src2, coeff03, coeff47); + const __m128i conv3 = convolve_32_8(src3, coeff03, coeff47); + + // Now reduce horizontally to get one lane for each result + const __m128i conv01 = _mm_hadd_epi32(conv0, conv1); + const __m128i conv23 = _mm_hadd_epi32(conv2, conv3); + const __m128i conv = _mm_hadd_epi32(conv01, conv23); + + // Divide down by (1 << round_1), rounding to nearest and subtract sub32. + const __m128i shifted = + _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); + const __m128i subbed = _mm_sub_epi32(shifted, sub); + + int32_t *dst_x = dst + y * dst_stride + x; + const __m128i result = + (conv_params->do_average) + ? _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x)) + : subbed; + + _mm_storeu_si128((__m128i *)dst_x, result); + } + for (; x < w; ++x) { + const int32_t *src_x = src_y + x * src_stride; + CONV_BUF_TYPE sum = 1 << offset_bits; + for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k]; + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - sub32; + if (conv_params->do_average) + dst[y * dst_stride + x] += res; + else + dst[y * dst_stride + x] = res; + } + } +} + +void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, + CONV_BUF_TYPE *dst, int dst_stride, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int x_step_qn, + const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params) { + int32_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + + const int xtaps = filter_params_x->taps; + const int ytaps = filter_params_y->taps; + + const int fo_vert = ytaps / 2 - 1; + + // horizontal filter + if (xtaps == 8) + hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn, + x_step_qn, filter_params_x, conv_params->round_0); + else + hfilter(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn, + x_step_qn, filter_params_x, conv_params->round_0); + + // vertical filter (input is transposed) + if (ytaps == 8) + vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn, + filter_params_y, conv_params, 8); + else + vfilter(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn, + filter_params_y, conv_params, 8); +} + +#if CONFIG_HIGHBITDEPTH +// An wrapper to generate the SHUFPD instruction with __m128i types (just +// writing _mm_shuffle_pd at the callsites gets a bit ugly because of the +// casts) +static __m128i mm_shuffle0_si128(__m128i a, __m128i b) { + __m128d ad = _mm_castsi128_pd(a); + __m128d bd = _mm_castsi128_pd(b); + return _mm_castpd_si128(_mm_shuffle_pd(ad, bd, 0)); +} + +// The horizontal filter for av1_highbd_convolve_2d_scale_sse4_1. This +// is the more general version, supporting 10 and 12 tap filters. For +// 8-tap filters, use hfilter8. +static void highbd_hfilter(const uint16_t *src, int src_stride, int32_t *dst, + int w, int h, int subpel_x_qn, int x_step_qn, + const InterpFilterParams *filter_params, + unsigned round, int bd) { + const int ntaps = filter_params->taps; + assert(ntaps == 10 || ntaps == 12); + + src -= ntaps / 2 - 1; + + // Construct a mask with which we'll AND filter coefficients 89ab89ab to zero + // out the unneeded entries. 
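/*
 * Editor's sketch (not part of the patch): the scalar sub-pixel stepping used
 * by the *_scale functions in this file. x_qn advances by x_step_qn per
 * output pixel; the top bits give the integer source column and the remaining
 * bits select one of the filter phases. The constant values below are assumed
 * to mirror libaom's definitions at the time of this patch.
 */
#include <stdio.h>

#define SKETCH_SCALE_SUBPEL_BITS 14
#define SKETCH_SCALE_SUBPEL_MASK ((1 << SKETCH_SCALE_SUBPEL_BITS) - 1)
#define SKETCH_SUBPEL_BITS 4
#define SKETCH_SCALE_EXTRA_BITS (SKETCH_SCALE_SUBPEL_BITS - SKETCH_SUBPEL_BITS)

int main(void) {
  // 1.5x downscale: each output pixel advances 1.5 source pixels.
  const int subpel_x_qn = 0;
  const int x_step_qn = 3 << (SKETCH_SCALE_SUBPEL_BITS - 1);
  int x_qn = subpel_x_qn;
  for (int x = 0; x < 4; ++x, x_qn += x_step_qn) {
    const int src_col = x_qn >> SKETCH_SCALE_SUBPEL_BITS;
    const int filter_idx =
        (x_qn & SKETCH_SCALE_SUBPEL_MASK) >> SKETCH_SCALE_EXTRA_BITS;
    printf("out %d: src col %d, filter phase %d\n", x, src_col, filter_idx);
  }
  return 0;
}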
+ const __m128i hicoeff_mask = make_1012_mask(ntaps); + + int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1)); + const __m128i round_add = _mm_set1_epi32(round_add32); + const __m128i round_shift = extend_32_to_128(round); + + int x_qn = subpel_x_qn; + for (int x = 0; x < w; ++x, x_qn += x_step_qn) { + const uint16_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS); + const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(filter_idx < SUBPEL_SHIFTS); + const int16_t *filter = + av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx); + + // The "lo" coefficients are coefficients 0..7. For a 12-tap filter, the + // "hi" coefficients are arranged as 89ab89ab. For a 10-tap filter, they + // are masked out with hicoeff_mask. + const __m128i coefflo = _mm_loadu_si128((__m128i *)filter); + const __m128i coeffhi = load_and_128i(filter + 8, hicoeff_mask); + + int y; + for (y = 0; y <= h - 4; y += 4) { + const uint16_t *const src0 = src_col + y * src_stride; + const uint16_t *const src1 = src0 + 1 * src_stride; + const uint16_t *const src2 = src0 + 2 * src_stride; + const uint16_t *const src3 = src0 + 3 * src_stride; + + // Load up source data. This is 16-bit input data, so each load gets 8 + // pixels (we need at most 12) + const __m128i data0lo = _mm_loadu_si128((__m128i *)src0); + const __m128i data1lo = _mm_loadu_si128((__m128i *)src1); + const __m128i data2lo = _mm_loadu_si128((__m128i *)src2); + const __m128i data3lo = _mm_loadu_si128((__m128i *)src3); + const __m128i data0hi = _mm_loadu_si128((__m128i *)(src0 + 8)); + const __m128i data1hi = _mm_loadu_si128((__m128i *)(src1 + 8)); + const __m128i data2hi = _mm_loadu_si128((__m128i *)(src2 + 8)); + const __m128i data3hi = _mm_loadu_si128((__m128i *)(src3 + 8)); + + // The "hi" data has rubbish in the top half so interleave pairs together + // to minimise the calculation we need to do. + const __m128i data01hi = mm_shuffle0_si128(data0hi, data1hi); + const __m128i data23hi = mm_shuffle0_si128(data2hi, data3hi); + + // Multiply by coefficients + const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo); + const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo); + const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo); + const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo); + const __m128i conv01hi = _mm_madd_epi16(data01hi, coeffhi); + const __m128i conv23hi = _mm_madd_epi16(data23hi, coeffhi); + + // Reduce horizontally and add + const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo); + const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo); + const __m128i convlo = _mm_hadd_epi32(conv01lo, conv23lo); + const __m128i convhi = _mm_hadd_epi32(conv01hi, conv23hi); + const __m128i conv = _mm_add_epi32(convlo, convhi); + + // Divide down by (1 << round), rounding to nearest. + const __m128i shifted = + _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); + + // Write transposed to the output + _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted); + } + for (; y < h; ++y) { + const uint16_t *const src_row = src_col + y * src_stride; + + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < ntaps; ++k) { + sum += filter[k] * src_row[k]; + } + + dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round); + } + } +} + +// A specialised version of hfilter, the horizontal filter for +// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap +// filters. 
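/*
 * Editor's sketch (not part of the patch): what mm_shuffle0_si128() above
 * does. SHUFPD with immediate 0 keeps the low 64 bits of each operand, so two
 * registers whose upper halves hold "rubbish" are packed into one register of
 * useful data before the single _mm_madd_epi16 on the hi coefficients.
 */
#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>

static __m128i sketch_shuffle0_si128(__m128i a, __m128i b) {
  return _mm_castpd_si128(
      _mm_shuffle_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b), 0));
}

int main(void) {
  const __m128i a = _mm_setr_epi16(1, 2, 3, 4, -9, -9, -9, -9);  // hi half: rubbish
  const __m128i b = _mm_setr_epi16(5, 6, 7, 8, -9, -9, -9, -9);
  int16_t out[8];
  _mm_storeu_si128((__m128i *)out, sketch_shuffle0_si128(a, b));
  printf("%d %d %d %d %d %d %d %d\n", out[0], out[1], out[2], out[3],
         out[4], out[5], out[6], out[7]);  // 1 2 3 4 5 6 7 8
  return 0;
}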
+static void highbd_hfilter8(const uint16_t *src, int src_stride, int32_t *dst, + int w, int h, int subpel_x_qn, int x_step_qn, + const InterpFilterParams *filter_params, + unsigned round, int bd) { + const int ntaps = 8; + + src -= ntaps / 2 - 1; + + int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1)); + const __m128i round_add = _mm_set1_epi32(round_add32); + const __m128i round_shift = extend_32_to_128(round); + + int x_qn = subpel_x_qn; + for (int x = 0; x < w; ++x, x_qn += x_step_qn) { + const uint16_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS); + const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(filter_idx < SUBPEL_SHIFTS); + const int16_t *filter = + av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx); + + // Load the filter coefficients + const __m128i coefflo = _mm_loadu_si128((__m128i *)filter); + + int y; + for (y = 0; y <= h - 4; y += 4) { + const uint16_t *const src0 = src_col + y * src_stride; + const uint16_t *const src1 = src0 + 1 * src_stride; + const uint16_t *const src2 = src0 + 2 * src_stride; + const uint16_t *const src3 = src0 + 3 * src_stride; + + // Load up source data. This is 16-bit input data, so each load gets the 8 + // pixels we need. + const __m128i data0lo = _mm_loadu_si128((__m128i *)src0); + const __m128i data1lo = _mm_loadu_si128((__m128i *)src1); + const __m128i data2lo = _mm_loadu_si128((__m128i *)src2); + const __m128i data3lo = _mm_loadu_si128((__m128i *)src3); + + // Multiply by coefficients + const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo); + const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo); + const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo); + const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo); + + // Reduce horizontally and add + const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo); + const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo); + const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo); + + // Divide down by (1 << round), rounding to nearest. 
+ const __m128i shifted = + _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); + + // Write transposed to the output + _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted); + } + for (; y < h; ++y) { + const uint16_t *const src_row = src_col + y * src_stride; + + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < ntaps; ++k) { + sum += filter[k] * src_row[k]; + } + + dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round); + } + } +} + +void av1_highbd_convolve_2d_scale_sse4_1( + const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, + int w, int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int x_step_qn, const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params, int bd) { + int32_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + + const int xtaps = filter_params_x->taps; + const int ytaps = filter_params_y->taps; + const int fo_vert = ytaps / 2 - 1; + + // horizontal filter + if (xtaps == 8) + highbd_hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, + subpel_x_qn, x_step_qn, filter_params_x, + conv_params->round_0, bd); + else + highbd_hfilter(src - fo_vert * src_stride, src_stride, tmp, w, im_h, + subpel_x_qn, x_step_qn, filter_params_x, + conv_params->round_0, bd); + + // vertical filter (input is transposed) + if (ytaps == 8) + vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn, + filter_params_y, conv_params, bd); + else + vfilter(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn, + filter_params_y, conv_params, bd); +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c index f7824b627..58ede028a 100644 --- a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c +++ b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c @@ -74,17 +74,9 @@ static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, } void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, - int stride, int tx_type, int bd) { + int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]); TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_32X32); (void)bd; fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf); } - -void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, - int stride, int tx_type, int bd) { - DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]); - TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_64x64_cfg(tx_type); - (void)bd; - fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf); -} diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c index 46c2674ca..e4d352c0e 100644 --- a/third_party/aom/av1/common/x86/convolve_2d_sse2.c +++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c @@ -31,6 +31,7 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, int i, j; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; + const int do_average = conv_params->do_average; const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; const __m128i zero = _mm_setzero_si128(); @@ -181,9 +182,15 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, // Accumulate values into the destination buffer __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; - 
_mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round)); - _mm_storeu_si128(p + 1, - _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); + if (do_average) { + _mm_storeu_si128(p + 0, + _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round)); + _mm_storeu_si128(p + 1, + _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); + } else { + _mm_storeu_si128(p + 0, res_lo_round); + _mm_storeu_si128(p + 1, res_hi_round); + } } } } @@ -204,6 +211,7 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, int i, j; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; + const int do_average = conv_params->do_average; const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; const __m128i zero = _mm_setzero_si128(); @@ -357,9 +365,15 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, // Accumulate values into the destination buffer __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; - _mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round)); - _mm_storeu_si128(p + 1, - _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); + if (do_average) { + _mm_storeu_si128(p + 0, + _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round)); + _mm_storeu_si128(p + 1, + _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); + } else { + _mm_storeu_si128(p + 0, res_lo_round); + _mm_storeu_si128(p + 1, res_hi_round); + } } } } diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c index ff4a0a0fe..195f0f570 100644 --- a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c +++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c @@ -32,6 +32,7 @@ void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride, int i, j; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; + const int do_average = conv_params->do_average; const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; /* Horizontal filter */ @@ -185,9 +186,15 @@ void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride, // Accumulate values into the destination buffer __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; - _mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round)); - _mm_storeu_si128(p + 1, - _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); + if (do_average) { + _mm_storeu_si128(p + 0, + _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round)); + _mm_storeu_si128(p + 1, + _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); + } else { + _mm_storeu_si128(p + 0, res_lo_round); + _mm_storeu_si128(p + 1, res_hi_round); + } } } } @@ -204,6 +211,7 @@ void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride, int im_h = h + filter_params_y->taps - 1; int im_stride = MAX_SB_SIZE; int i, j; + const int do_average = conv_params->do_average; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; @@ -362,9 +370,15 @@ void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride, // Accumulate values into the destination buffer __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; - _mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round)); - _mm_storeu_si128(p + 1, - _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); + if (do_average) { + _mm_storeu_si128(p + 0, + _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round)); + 
_mm_storeu_si128(p + 1, + _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); + } else { + _mm_storeu_si128(p + 0, res_lo_round); + _mm_storeu_si128(p + 1, res_hi_round); + } } } } diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c index dd2a681bc..0e833e6d9 100644 --- a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c +++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c @@ -599,7 +599,7 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { } void av1_inv_txfm2d_add_32x32_avx2(const int32_t *coeff, uint16_t *output, - int stride, int tx_type, int bd) { + int stride, TX_TYPE tx_type, int bd) { __m256i in[128], out[128]; const TXFM_1D_CFG *row_cfg = NULL; const TXFM_1D_CFG *col_cfg = NULL; diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c index a93699f0b..8613bed86 100644 --- a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c +++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c @@ -230,7 +230,7 @@ static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride, } void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output, - int stride, int tx_type, int bd) { + int stride, TX_TYPE tx_type, int bd) { __m128i in[4]; const TXFM_1D_CFG *row_cfg = NULL; const TXFM_1D_CFG *col_cfg = NULL; @@ -706,7 +706,7 @@ static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride, } void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output, - int stride, int tx_type, int bd) { + int stride, TX_TYPE tx_type, int bd) { __m128i in[16], out[16]; const TXFM_1D_CFG *row_cfg = NULL; const TXFM_1D_CFG *col_cfg = NULL; @@ -1316,7 +1316,7 @@ static void round_shift_16x16(__m128i *in, int shift) { } void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output, - int stride, int tx_type, int bd) { + int stride, TX_TYPE tx_type, int bd) { __m128i in[64], out[64]; const TXFM_1D_CFG *row_cfg = NULL; const TXFM_1D_CFG *col_cfg = NULL; diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c index 35d637f72..71b0ec7a3 100644 --- a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c +++ b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c @@ -28,6 +28,20 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref, #error "HORSHEAR_REDUCE_PREC_BITS < 5 not currently supported by SSSE3 filter" #endif int i, j, k; +#if CONFIG_CONVOLVE_ROUND + const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; + const int reduce_bits_horiz = + use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; + const int offset_bits_horiz = + use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1; + if (use_conv_params) { + conv_params->do_post_rounding = 1; + } + assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS); +#else + const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS; + const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1; +#endif /* Note: For this code to work, the left/right frame borders need to be extended by at least 13 pixels each. 
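/*
 * Editor's sketch (not part of the patch): the do_average change made in the
 * convolve_2d hunks above, in scalar form. When do_average is set, the
 * rounded result is accumulated into the 32-bit destination buffer on top of
 * an earlier prediction; otherwise it simply overwrites the buffer, so a
 * later pass can combine the two.
 */
#include <stdint.h>
#include <stdio.h>

static void sketch_store_or_accumulate(int32_t *dst, int32_t res,
                                       int do_average) {
  if (do_average)
    *dst += res;  // add to the prediction already in the buffer
  else
    *dst = res;   // first pass: overwrite whatever was there
}

int main(void) {
  int32_t dst = 0;
  sketch_store_or_accumulate(&dst, 40, 0);  // first prediction: dst = 40
  sketch_store_or_accumulate(&dst, 60, 1);  // second prediction: dst = 100
  printf("%d\n", dst);
  return 0;
}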
By the time we get here, other @@ -43,30 +57,17 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref, for (i = 0; i < p_height; i += 8) { for (j = 0; j < p_width; j += 8) { - // (x, y) coordinates of the center of this block in the destination - // image - const int32_t dst_x = p_col + j + 4; - const int32_t dst_y = p_row + i + 4; - - int32_t x4, y4, ix4, sx4, iy4, sy4; - if (subsampling_x) - x4 = (mat[2] * 4 * dst_x + mat[3] * 4 * dst_y + mat[0] * 2 + - (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) / - 4; - else - x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0]; - - if (subsampling_y) - y4 = (mat[4] * 4 * dst_x + mat[5] * 4 * dst_y + mat[1] * 2 + - (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) / - 4; - else - y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1]; - - ix4 = x4 >> WARPEDMODEL_PREC_BITS; - sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); - iy4 = y4 >> WARPEDMODEL_PREC_BITS; - sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; + const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; + const int32_t x4 = dst_x >> subsampling_x; + const int32_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); // Add in all the constant terms, including rounding and offset sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + @@ -154,9 +155,8 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref, // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6 const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); - const __m128i round_const = - _mm_set1_epi32((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) + - ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1)); + const __m128i round_const = _mm_set1_epi32( + (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1)); // Calculate filtered results const __m128i res_0 = _mm_madd_epi16(src, coeff_0); @@ -169,8 +169,8 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref, __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); - res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), - HORSHEAR_REDUCE_PREC_BITS); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const), + _mm_cvtsi32_si128(reduce_bits_horiz)); // Filter odd-index pixels const __m128i tmp_1 = _mm_loadu_si128( @@ -207,8 +207,8 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref, __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); - res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), - HORSHEAR_REDUCE_PREC_BITS); + res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), + _mm_cvtsi32_si128(reduce_bits_horiz)); // Combine results into one register. // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7 @@ -299,39 +299,65 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref, _mm_add_epi32(res_5, res_7)); // Rearrange pixels back into the order 0 ... 
7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - // Round and pack into 8 bits - const __m128i round_const = - _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) + - ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1)); - - const __m128i res_lo_round = _mm_srai_epi32( - _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS); - const __m128i res_hi_round = _mm_srai_epi32( - _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS); - - __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); - // Clamp res_16bit to the range [0, 2^bd - 1] - const __m128i max_val = _mm_set1_epi16((1 << bd) - 1); - const __m128i zero = _mm_setzero_si128(); - res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero); - - // Store, blending with 'pred' if needed - __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; - - // Note: If we're outputting a 4x4 block, we need to be very careful - // to only output 4 pixels at this point, to avoid encode/decode - // mismatches when encoding with multiple threads. - if (p_width == 4) { - if (comp_avg) - res_16bit = _mm_avg_epu16(res_16bit, _mm_loadl_epi64(p)); - _mm_storel_epi64(p, res_16bit); + __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + +#if CONFIG_CONVOLVE_ROUND + if (use_conv_params) { + __m128i *const p = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j]; + const __m128i round_const = _mm_set1_epi32( + -(1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)) + + ((1 << (conv_params->round_1)) >> 1)); + res_lo = _mm_add_epi32(res_lo, round_const); + res_lo = + _mm_srl_epi16(res_lo, _mm_cvtsi32_si128(conv_params->round_1)); + if (comp_avg) res_lo = _mm_add_epi32(_mm_loadu_si128(p), res_lo); + _mm_storeu_si128(p, res_lo); + if (p_width > 4) { + res_hi = _mm_add_epi32(res_hi, round_const); + res_hi = + _mm_srl_epi16(res_hi, _mm_cvtsi32_si128(conv_params->round_1)); + if (comp_avg) + res_hi = _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi); + _mm_storeu_si128(p + 1, res_hi); + } } else { - if (comp_avg) - res_16bit = _mm_avg_epu16(res_16bit, _mm_loadu_si128(p)); - _mm_storeu_si128(p, res_16bit); +#else + { +#endif + // Round and pack into 8 bits + const __m128i round_const = + _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) + + ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1)); + + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS); + + __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + // Clamp res_16bit to the range [0, 2^bd - 1] + const __m128i max_val = _mm_set1_epi16((1 << bd) - 1); + const __m128i zero = _mm_setzero_si128(); + res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero); + + // Store, blending with 'pred' if needed + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + + // Note: If we're outputting a 4x4 block, we need to be very careful + // to only output 4 pixels at this point, to avoid encode/decode + // mismatches when encoding with multiple threads. 
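/*
 * Editor's sketch (not part of the patch): the per-block affine position
 * computation introduced in the warp hunk above, in scalar form. The centre
 * of the destination block is mapped through the warp model mat[] at
 * WARPEDMODEL_PREC_BITS of fractional precision (assumed to be 16, as in
 * libaom), then split into an integer source position and a fractional part
 * that drives the horizontal/vertical shear filters. The model values below
 * are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_WARPEDMODEL_PREC_BITS 16

int main(void) {
  // Hypothetical model: identity scale/rotation plus a translation of
  // (2.25, -1.5) pixels, in Q16 (1 << 16 == one pixel).
  const int32_t mat[6] = { (9 << 16) / 4, -(3 << 16) / 2,
                           1 << 16, 0, 0, 1 << 16 };
  const int subsampling_x = 0, subsampling_y = 0;
  const int p_col = 32, p_row = 16, i = 0, j = 8;

  const int32_t src_x = (p_col + j + 4) << subsampling_x;
  const int32_t src_y = (p_row + i + 4) << subsampling_y;
  const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
  const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
  const int32_t x4 = dst_x >> subsampling_x;
  const int32_t y4 = dst_y >> subsampling_y;

  const int32_t ix4 = x4 >> SKETCH_WARPEDMODEL_PREC_BITS;  // integer column
  const int32_t sx4 = x4 & ((1 << SKETCH_WARPEDMODEL_PREC_BITS) - 1);
  const int32_t iy4 = y4 >> SKETCH_WARPEDMODEL_PREC_BITS;  // integer row
  const int32_t sy4 = y4 & ((1 << SKETCH_WARPEDMODEL_PREC_BITS) - 1);

  // Prints ix4=46 sx4=16384 iy4=18 sy4=32768, i.e. column 46 + 0.25 and
  // row 18 + 0.5 of a source pixel.
  printf("ix4=%d sx4=%d iy4=%d sy4=%d\n", ix4, sx4, iy4, sy4);
  return 0;
}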
+ if (p_width == 4) { + if (comp_avg) + res_16bit = _mm_avg_epu16(res_16bit, _mm_loadl_epi64(p)); + _mm_storel_epi64(p, res_16bit); + } else { + if (comp_avg) + res_16bit = _mm_avg_epu16(res_16bit, _mm_loadu_si128(p)); + _mm_storeu_si128(p, res_16bit); + } } } } diff --git a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c index 0648b95b3..c440d0f88 100644 --- a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c +++ b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c @@ -366,7 +366,7 @@ static void iidtx16(__m256i *in) { void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { __m256i in[16]; - int tx_type = txfm_param->tx_type; + const TX_TYPE tx_type = txfm_param->tx_type; load_buffer_16x16(input, in); switch (tx_type) { diff --git a/third_party/aom/av1/common/x86/idct_intrin_sse2.c b/third_party/aom/av1/common/x86/idct_intrin_sse2.c index bf12a26d3..541165c8d 100644 --- a/third_party/aom/av1/common/x86/idct_intrin_sse2.c +++ b/third_party/aom/av1/common/x86/idct_intrin_sse2.c @@ -63,7 +63,7 @@ void av1_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, __m128i in[2]; const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); - int tx_type = txfm_param->tx_type; + const TX_TYPE tx_type = txfm_param->tx_type; in[0] = load_input_data(input); in[1] = load_input_data(input + 8); @@ -155,7 +155,7 @@ void av1_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, __m128i in[8]; const __m128i zero = _mm_setzero_si128(); const __m128i final_rounding = _mm_set1_epi16(1 << 4); - int tx_type = txfm_param->tx_type; + const TX_TYPE tx_type = txfm_param->tx_type; // load input data in[0] = load_input_data(input); @@ -257,7 +257,7 @@ void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, __m128i in[32]; __m128i *in0 = &in[0]; __m128i *in1 = &in[16]; - int tx_type = txfm_param->tx_type; + const TX_TYPE tx_type = txfm_param->tx_type; load_buffer_8x16(input, in0); input += 8; @@ -393,7 +393,7 @@ static INLINE void flip_buffer_lr_8x8(__m128i *in) { void av1_iht8x16_128_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { __m128i in[16]; - int tx_type = txfm_param->tx_type; + const TX_TYPE tx_type = txfm_param->tx_type; in[0] = load_input_data(input + 0 * 8); in[1] = load_input_data(input + 1 * 8); @@ -559,7 +559,7 @@ static INLINE void write_buffer_8x8_round6(uint8_t *dest, __m128i *in, void av1_iht16x8_128_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { __m128i in[16]; - int tx_type = txfm_param->tx_type; + const TX_TYPE tx_type = txfm_param->tx_type; // Transpose 16x8 input into in[] in[0] = load_input_data(input + 0 * 16); @@ -720,7 +720,7 @@ static INLINE void write_buffer_8x4_round5(uint8_t *dest, __m128i *in, void av1_iht8x4_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { __m128i in[8]; - int tx_type = txfm_param->tx_type; + const TX_TYPE tx_type = txfm_param->tx_type; in[0] = load_input_data(input + 0 * 8); in[1] = load_input_data(input + 1 * 8); @@ -905,7 +905,7 @@ static INLINE void write_buffer_4x8_round5(uint8_t *dest, __m128i *in, void av1_iht4x8_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { __m128i in[8]; - int tx_type = txfm_param->tx_type; + const TX_TYPE tx_type = txfm_param->tx_type; // Load rows, packed two per 
element of 'in'. // We pack into the bottom half of 'in' so that the @@ -1128,7 +1128,7 @@ static INLINE void write_buffer_16x32_round6(uint8_t *dest, __m128i *intl, void av1_iht16x32_512_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { __m128i intl[16], intr[16], inbl[16], inbr[16]; - int tx_type = txfm_param->tx_type; + const TX_TYPE tx_type = txfm_param->tx_type; int i; for (i = 0; i < 16; ++i) { @@ -1282,7 +1282,7 @@ static INLINE void write_buffer_32x16_round6(uint8_t *dest, __m128i *in0, void av1_iht32x16_512_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { __m128i in0[16], in1[16], in2[16], in3[16]; - int tx_type = txfm_param->tx_type; + const TX_TYPE tx_type = txfm_param->tx_type; int i; for (i = 0; i < 16; ++i) { diff --git a/third_party/aom/av1/common/x86/intra_edge_sse4.c b/third_party/aom/av1/common/x86/intra_edge_sse4.c new file mode 100644 index 000000000..ea4acff33 --- /dev/null +++ b/third_party/aom/av1/common/x86/intra_edge_sse4.c @@ -0,0 +1,318 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <smmintrin.h> + +#include "./aom_config.h" +#include "./av1_rtcd.h" + +void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) { + if (!strength) return; + + DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = { + { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 }, // strength 1: 4,8,4 + { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 }, // strength 2: 5,6,5 + { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 } // strength 3: 2,4,4,4,2 + }; + + DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = { + { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }, + { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + }; + + // Extend the first and last samples to simplify the loop for the 5-tap case + p[-1] = p[0]; + __m128i last = _mm_set1_epi8(p[sz - 1]); + _mm_storeu_si128((__m128i *)&p[sz], last); + + // Adjust input pointer for filter support area + uint8_t *in = (strength == 3) ? p - 1 : p; + + // Avoid modifying first/last samples + uint8_t *out = p + 1; + int len = sz - 2; + + const int use_3tap_filter = (strength < 3); + + if (use_3tap_filter) { + __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); + __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]); + __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]); + __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]); + __m128i in0 = _mm_lddqu_si128((__m128i *)in); + while (len > 0) { + int n_out = (len < 8) ? 
len : 8; + __m128i d0 = _mm_shuffle_epi8(in0, shuf0); + __m128i d1 = _mm_shuffle_epi8(in0, shuf1); + d0 = _mm_maddubs_epi16(d0, coef0); + d1 = _mm_maddubs_epi16(d1, coef0); + d0 = _mm_hadd_epi16(d0, d1); + __m128i eight = _mm_set1_epi16(8); + d0 = _mm_add_epi16(d0, eight); + d0 = _mm_srai_epi16(d0, 4); + d0 = _mm_packus_epi16(d0, d0); + __m128i out0 = _mm_lddqu_si128((__m128i *)out); + __m128i n0 = _mm_set1_epi8(n_out); + __m128i mask = _mm_cmpgt_epi8(n0, iden); + out0 = _mm_blendv_epi8(out0, d0, mask); + _mm_storel_epi64((__m128i *)out, out0); + __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16)); + in0 = _mm_alignr_epi8(in1, in0, 8); + in += 8; + out += 8; + len -= n_out; + } + } else { // 5-tap filter + __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); + __m128i two = _mm_set1_epi8(2); + __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]); + __m128i shuf_b = _mm_add_epi8(shuf_a, two); + __m128i shuf_c = _mm_add_epi8(shuf_b, two); + __m128i shuf_d = _mm_add_epi8(shuf_c, two); + __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]); + __m128i in0 = _mm_lddqu_si128((__m128i *)in); + while (len > 0) { + int n_out = (len < 8) ? len : 8; + __m128i d0 = _mm_shuffle_epi8(in0, shuf_a); + __m128i d1 = _mm_shuffle_epi8(in0, shuf_b); + __m128i d2 = _mm_shuffle_epi8(in0, shuf_c); + __m128i d3 = _mm_shuffle_epi8(in0, shuf_d); + d0 = _mm_maddubs_epi16(d0, coef0); + d1 = _mm_maddubs_epi16(d1, coef0); + d2 = _mm_maddubs_epi16(d2, coef0); + d3 = _mm_maddubs_epi16(d3, coef0); + d0 = _mm_hadd_epi16(d0, d1); + d2 = _mm_hadd_epi16(d2, d3); + d0 = _mm_hadd_epi16(d0, d2); + __m128i eight = _mm_set1_epi16(8); + d0 = _mm_add_epi16(d0, eight); + d0 = _mm_srai_epi16(d0, 4); + d0 = _mm_packus_epi16(d0, d0); + __m128i out0 = _mm_lddqu_si128((__m128i *)out); + __m128i n0 = _mm_set1_epi8(n_out); + __m128i mask = _mm_cmpgt_epi8(n0, iden); + out0 = _mm_blendv_epi8(out0, d0, mask); + _mm_storel_epi64((__m128i *)out, out0); + __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16)); + in0 = _mm_alignr_epi8(in1, in0, 8); + in += 8; + out += 8; + len -= n_out; + } + } +} + +void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength) { + if (!strength) return; + + DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = { + { 4, 8, 4, 8, 4, 8, 4, 8 }, // strength 1: 4,8,4 + { 5, 6, 5, 6, 5, 6, 5, 6 }, // strength 2: 5,6,5 + { 2, 4, 2, 4, 2, 4, 2, 4 } // strength 3: 2,4,4,4,2 + }; + + DECLARE_ALIGNED(16, static const int16_t, + v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } }; + + // Extend the first and last samples to simplify the loop for the 5-tap case + p[-1] = p[0]; + __m128i last = _mm_set1_epi16(p[sz - 1]); + _mm_storeu_si128((__m128i *)&p[sz], last); + + // Adjust input pointer for filter support area + uint16_t *in = (strength == 3) ? p - 1 : p; + + // Avoid modifying first/last samples + uint16_t *out = p + 1; + int len = sz - 2; + + const int use_3tap_filter = (strength < 3); + + if (use_3tap_filter) { + __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); + __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]); + __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); + __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); + while (len > 0) { + int n_out = (len < 8) ? 
len : 8; + __m128i in1 = _mm_alignr_epi8(in8, in0, 2); + __m128i in2 = _mm_alignr_epi8(in8, in0, 4); + __m128i in02 = _mm_add_epi16(in0, in2); + __m128i d0 = _mm_unpacklo_epi16(in02, in1); + __m128i d1 = _mm_unpackhi_epi16(in02, in1); + d0 = _mm_mullo_epi16(d0, coef0); + d1 = _mm_mullo_epi16(d1, coef0); + d0 = _mm_hadd_epi16(d0, d1); + __m128i eight = _mm_set1_epi16(8); + d0 = _mm_add_epi16(d0, eight); + d0 = _mm_srli_epi16(d0, 4); + __m128i out0 = _mm_lddqu_si128((__m128i *)out); + __m128i n0 = _mm_set1_epi16(n_out); + __m128i mask = _mm_cmpgt_epi16(n0, iden); + out0 = _mm_blendv_epi8(out0, d0, mask); + _mm_storeu_si128((__m128i *)out, out0); + in += 8; + in0 = in8; + in8 = _mm_lddqu_si128((__m128i *)&in[8]); + out += 8; + len -= n_out; + } + } else { // 5-tap filter + __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); + __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]); + __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); + __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); + while (len > 0) { + int n_out = (len < 8) ? len : 8; + __m128i in1 = _mm_alignr_epi8(in8, in0, 2); + __m128i in2 = _mm_alignr_epi8(in8, in0, 4); + __m128i in3 = _mm_alignr_epi8(in8, in0, 6); + __m128i in4 = _mm_alignr_epi8(in8, in0, 8); + __m128i in04 = _mm_add_epi16(in0, in4); + __m128i in123 = _mm_add_epi16(in1, in2); + in123 = _mm_add_epi16(in123, in3); + __m128i d0 = _mm_unpacklo_epi16(in04, in123); + __m128i d1 = _mm_unpackhi_epi16(in04, in123); + d0 = _mm_mullo_epi16(d0, coef0); + d1 = _mm_mullo_epi16(d1, coef0); + d0 = _mm_hadd_epi16(d0, d1); + __m128i eight = _mm_set1_epi16(8); + d0 = _mm_add_epi16(d0, eight); + d0 = _mm_srli_epi16(d0, 4); + __m128i out0 = _mm_lddqu_si128((__m128i *)out); + __m128i n0 = _mm_set1_epi16(n_out); + __m128i mask = _mm_cmpgt_epi16(n0, iden); + out0 = _mm_blendv_epi8(out0, d0, mask); + _mm_storeu_si128((__m128i *)out, out0); + in += 8; + in0 = in8; + in8 = _mm_lddqu_si128((__m128i *)&in[8]); + out += 8; + len -= n_out; + } + } +} + +void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) { + // interpolate half-sample positions + assert(sz <= 24); + + DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = { + { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 } + }; + + DECLARE_ALIGNED(16, static const int8_t, v_const[2][16]) = { + { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }, + { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + }; + + // Extend first/last samples (upper-left p[-1], last p[sz-1]) + // to support 4-tap filter + p[-2] = p[-1]; + p[sz] = p[sz - 1]; + + uint8_t *in = &p[-2]; + uint8_t *out = &p[-2]; + + int n = sz + 1; // Input length including upper-left sample + + __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); + __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]); + + __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]); + __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]); + __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]); + + while (n > 0) { + __m128i in8 = _mm_alignr_epi8(in16, in0, 8); + __m128i d0 = _mm_shuffle_epi8(in0, shuf0); + __m128i d1 = _mm_shuffle_epi8(in0, shuf1); + __m128i d2 = _mm_shuffle_epi8(in8, shuf0); + __m128i d3 = _mm_shuffle_epi8(in8, shuf1); + d0 = _mm_maddubs_epi16(d0, coef0); + d1 = _mm_maddubs_epi16(d1, coef0); + d2 = _mm_maddubs_epi16(d2, coef0); + d3 = _mm_maddubs_epi16(d3, coef0); + d0 = _mm_hadd_epi16(d0, d1); + d2 = _mm_hadd_epi16(d2, d3); + __m128i eight = _mm_set1_epi16(8); + d0 = _mm_add_epi16(d0, eight); + d2 = _mm_add_epi16(d2, eight); + d0 = _mm_srai_epi16(d0, 4); + d2 = 
_mm_srai_epi16(d2, 4); + d0 = _mm_packus_epi16(d0, d2); + __m128i in1 = _mm_alignr_epi8(in16, in0, 1); + __m128i out0 = _mm_unpacklo_epi8(in1, d0); + __m128i out1 = _mm_unpackhi_epi8(in1, d0); + _mm_storeu_si128((__m128i *)&out[0], out0); + _mm_storeu_si128((__m128i *)&out[16], out1); + in0 = in16; + in16 = _mm_setzero_si128(); + out += 32; + n -= 16; + } +} + +void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd) { + // interpolate half-sample positions + assert(sz <= 24); + + DECLARE_ALIGNED(16, static const int16_t, + kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } }; + + // Extend first/last samples (upper-left p[-1], last p[sz-1]) + // to support 4-tap filter + p[-2] = p[-1]; + p[sz] = p[sz - 1]; + + uint16_t *in = &p[-2]; + uint16_t *out = in; + int n = sz + 1; + + __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); + __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); + __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]); + __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]); + + while (n > 0) { + __m128i in1 = _mm_alignr_epi8(in8, in0, 2); + __m128i in2 = _mm_alignr_epi8(in8, in0, 4); + __m128i in3 = _mm_alignr_epi8(in8, in0, 6); + __m128i sum0 = _mm_add_epi16(in0, in3); + __m128i sum1 = _mm_add_epi16(in1, in2); + __m128i d0 = _mm_unpacklo_epi16(sum0, sum1); + __m128i d1 = _mm_unpackhi_epi16(sum0, sum1); + __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]); + d0 = _mm_madd_epi16(d0, coef0); + d1 = _mm_madd_epi16(d1, coef0); + __m128i eight = _mm_set1_epi32(8); + d0 = _mm_add_epi32(d0, eight); + d1 = _mm_add_epi32(d1, eight); + d0 = _mm_srai_epi32(d0, 4); + d1 = _mm_srai_epi32(d1, 4); + d0 = _mm_packus_epi32(d0, d1); + __m128i max0 = _mm_set1_epi16((1 << bd) - 1); + d0 = _mm_min_epi16(d0, max0); + __m128i out0 = _mm_unpacklo_epi16(in1, d0); + __m128i out1 = _mm_unpackhi_epi16(in1, d0); + _mm_storeu_si128((__m128i *)&out[0], out0); + _mm_storeu_si128((__m128i *)&out[8], out1); + in0 = in8; + in8 = in16; + in16 = in24; + in24 = _mm_setzero_si128(); + out += 16; + n -= 8; + } +} diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c index e2e4f51c3..4006b8518 100644 --- a/third_party/aom/av1/common/x86/selfguided_sse4.c +++ b/third_party/aom/av1/common/x86/selfguided_sse4.c @@ -3,6 +3,7 @@ #include "./aom_config.h" #include "./av1_rtcd.h" #include "av1/common/restoration.h" +#include "aom_dsp/x86/synonyms.h" /* Calculate four consecutive entries of the intermediate A and B arrays (corresponding to the first loop in the C version of @@ -71,8 +72,8 @@ static void selfguided_restoration_1_v(uint8_t *src, int width, int height, __m128i a, b, x, y, x2, y2; __m128i sum, sum_sq, tmp; - a = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[j])); - b = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[src_stride + j])); + a = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[j])); + b = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[src_stride + j])); sum = _mm_cvtepi16_epi32(_mm_add_epi16(a, b)); tmp = _mm_unpacklo_epi16(a, b); @@ -81,7 +82,7 @@ static void selfguided_restoration_1_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&B[j], sum); _mm_store_si128((__m128i *)&A[j], sum_sq); - x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[2 * src_stride + j])); + x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[2 * src_stride + j])); sum = _mm_add_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_add_epi32(sum_sq, x2); @@ -91,9 +92,9 @@ static void selfguided_restoration_1_v(uint8_t *src, int width, int 
height, _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); x = _mm_cvtepu8_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j])); + xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j])); y = _mm_cvtepu8_epi32( - _mm_loadl_epi64((__m128i *)&src[(i + 2) * src_stride + j])); + xx_loadl_32((__m128i *)&src[(i + 2) * src_stride + j])); sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x)); @@ -106,7 +107,7 @@ static void selfguided_restoration_1_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); x = _mm_cvtepu8_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j])); + xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j])); sum = _mm_sub_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_sub_epi32(sum_sq, x2); @@ -242,9 +243,9 @@ static void selfguided_restoration_2_v(uint8_t *src, int width, int height, __m128i a, b, c, c2, x, y, x2, y2; __m128i sum, sum_sq, tmp; - a = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[j])); - b = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[src_stride + j])); - c = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[2 * src_stride + j])); + a = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[j])); + b = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[src_stride + j])); + c = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[2 * src_stride + j])); sum = _mm_cvtepi16_epi32(_mm_add_epi16(_mm_add_epi16(a, b), c)); // Important: Since c may be up to 2^8, the result on squaring may @@ -256,7 +257,7 @@ static void selfguided_restoration_2_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&B[j], sum); _mm_store_si128((__m128i *)&A[j], sum_sq); - x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[3 * src_stride + j])); + x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[3 * src_stride + j])); sum = _mm_add_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_add_epi32(sum_sq, x2); @@ -264,7 +265,7 @@ static void selfguided_restoration_2_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&B[buf_stride + j], sum); _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq); - x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[4 * src_stride + j])); + x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[4 * src_stride + j])); sum = _mm_add_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_add_epi32(sum_sq, x2); @@ -289,7 +290,7 @@ static void selfguided_restoration_2_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); x = _mm_cvtepu8_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j])); + xx_loadl_32((__m128i *)&src[(i - 2) * src_stride + j])); sum = _mm_sub_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_sub_epi32(sum_sq, x2); @@ -298,7 +299,7 @@ static void selfguided_restoration_2_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq); x = _mm_cvtepu8_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j])); + xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j])); sum = _mm_sub_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_sub_epi32(sum_sq, x2); @@ -443,10 +444,10 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height, __m128i a, b, c, d, x, y, x2, y2; __m128i sum, sum_sq, tmp, tmp2; - a = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[j])); - b = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[src_stride + j])); - c = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[2 * src_stride + 
j])); - d = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[3 * src_stride + j])); + a = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[j])); + b = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[src_stride + j])); + c = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[2 * src_stride + j])); + d = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[3 * src_stride + j])); sum = _mm_cvtepi16_epi32( _mm_add_epi16(_mm_add_epi16(a, b), _mm_add_epi16(c, d))); @@ -458,7 +459,7 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&B[j], sum); _mm_store_si128((__m128i *)&A[j], sum_sq); - x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[4 * src_stride + j])); + x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[4 * src_stride + j])); sum = _mm_add_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_add_epi32(sum_sq, x2); @@ -466,7 +467,7 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&B[buf_stride + j], sum); _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq); - x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[5 * src_stride + j])); + x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[5 * src_stride + j])); sum = _mm_add_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_add_epi32(sum_sq, x2); @@ -474,7 +475,7 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&B[2 * buf_stride + j], sum); _mm_store_si128((__m128i *)&A[2 * buf_stride + j], sum_sq); - x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[6 * src_stride + j])); + x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[6 * src_stride + j])); sum = _mm_add_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_add_epi32(sum_sq, x2); @@ -483,10 +484,8 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); - x = _mm_cvtepu8_epi32( - _mm_cvtsi32_si128(*((int *)&src[(i - 3) * src_stride + j]))); - y = _mm_cvtepu8_epi32( - _mm_cvtsi32_si128(*((int *)&src[(i + 4) * src_stride + j]))); + x = _mm_cvtepu8_epi32(xx_loadl_32(&src[(i - 3) * src_stride + j])); + y = _mm_cvtepu8_epi32(xx_loadl_32(&src[(i + 4) * src_stride + j])); sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x)); @@ -499,7 +498,7 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); x = _mm_cvtepu8_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 3) * src_stride + j])); + xx_loadl_32((__m128i *)&src[(i - 3) * src_stride + j])); sum = _mm_sub_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_sub_epi32(sum_sq, x2); @@ -508,7 +507,7 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq); x = _mm_cvtepu8_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j])); + xx_loadl_32((__m128i *)&src[(i - 2) * src_stride + j])); sum = _mm_sub_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_sub_epi32(sum_sq, x2); @@ -517,7 +516,7 @@ static void selfguided_restoration_3_v(uint8_t *src, int width, int height, _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq); x = _mm_cvtepu8_epi32( - _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j])); + xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j])); sum = _mm_sub_epi32(sum, x); x2 = _mm_mullo_epi32(x, x); sum_sq = _mm_sub_epi32(sum_sq, 
x2); @@ -664,38 +663,48 @@ static void selfguided_restoration_3_h(int32_t *A, int32_t *B, int width, } void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, - int stride, int32_t *dst, int dst_stride, - int r, int eps, int32_t *tmpbuf) { - int32_t *A = tmpbuf; - int32_t *B = A + SGRPROJ_OUTBUF_SIZE; + int dgd_stride, int32_t *dst, + int dst_stride, int r, int eps) { + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *A = A_; + int32_t *B = B_; int i, j; // Adjusting the stride of A and B here appears to avoid bad cache effects, // leading to a significant speed improvement. // We also align the stride to a multiple of 16 bytes for efficiency. - int buf_stride = ((width + 3) & ~3) + 16; + int buf_stride = ((width_ext + 3) & ~3) + 16; // Don't filter tiles with dimensions < 5 on any axis if ((width < 5) || (height < 5)) return; + uint8_t *dgd0 = dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ; if (r == 1) { - selfguided_restoration_1_v(dgd, width, height, stride, A, B, buf_stride); - selfguided_restoration_1_h(A, B, width, height, buf_stride, eps, 8); + selfguided_restoration_1_v(dgd0, width_ext, height_ext, dgd_stride, A, B, + buf_stride); + selfguided_restoration_1_h(A, B, width_ext, height_ext, buf_stride, eps, 8); } else if (r == 2) { - selfguided_restoration_2_v(dgd, width, height, stride, A, B, buf_stride); - selfguided_restoration_2_h(A, B, width, height, buf_stride, eps, 8); + selfguided_restoration_2_v(dgd0, width_ext, height_ext, dgd_stride, A, B, + buf_stride); + selfguided_restoration_2_h(A, B, width_ext, height_ext, buf_stride, eps, 8); } else if (r == 3) { - selfguided_restoration_3_v(dgd, width, height, stride, A, B, buf_stride); - selfguided_restoration_3_h(A, B, width, height, buf_stride, eps, 8); + selfguided_restoration_3_v(dgd0, width_ext, height_ext, dgd_stride, A, B, + buf_stride); + selfguided_restoration_3_h(A, B, width_ext, height_ext, buf_stride, eps, 8); } else { assert(0); } + A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; { i = 0; j = 0; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] + @@ -707,7 +716,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, } for (j = 1; j < width - 1; ++j) { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] + @@ -720,7 +729,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, j = width - 1; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] + @@ -735,7 +744,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, j = 0; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) + @@ -751,7 +760,7 @@ void 
av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, // Vectorize the innermost loop for (j = 1; j < width - 1; j += 4) { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 5; @@ -804,7 +813,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, // (typically have 2 such pixels, but may have anywhere between 0 and 3) for (; j < width - 1; ++j) { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 5; const int32_t a = @@ -826,7 +835,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, j = width - 1; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) + @@ -845,7 +854,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, j = 0; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] + @@ -857,7 +866,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, } for (j = 1; j < width - 1; ++j) { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] + @@ -870,7 +879,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, j = width - 1; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] + @@ -1051,7 +1060,6 @@ void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height, int xq[2]; int32_t *flt1 = tmpbuf; int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX; - int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX; int i, j; assert(width * height <= RESTORATION_TILEPELS_MAX); #if USE_HIGHPASS_IN_SGRPROJ @@ -1059,12 +1067,10 @@ void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height, sgr_params[eps].corner, sgr_params[eps].edge); #else av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt1, width, - sgr_params[eps].r1, sgr_params[eps].e1, - tmpbuf2); + sgr_params[eps].r1, sgr_params[eps].e1); #endif // USE_HIGHPASS_IN_SGRPROJ av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt2, width, - sgr_params[eps].r2, sgr_params[eps].e2, - tmpbuf2); + sgr_params[eps].r2, sgr_params[eps].e2); decode_xq(xqd, xq); __m128i xq0 = _mm_set1_epi32(xq[0]); @@ -1364,43 +1370,52 @@ static void highbd_selfguided_restoration_3_v(uint16_t *src, int width, } void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, - int height, int stride, + int height, int dgd_stride, int32_t *dst, int dst_stride, - int bit_depth, int r, int eps, - int32_t *tmpbuf) { - int32_t *A = tmpbuf; - int32_t *B = A + SGRPROJ_OUTBUF_SIZE; + int bit_depth, int r, int eps) { + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + 
int32_t *A = A_; + int32_t *B = B_; int i, j; // Adjusting the stride of A and B here appears to avoid bad cache effects, // leading to a significant speed improvement. // We also align the stride to a multiple of 16 bytes for efficiency. - int buf_stride = ((width + 3) & ~3) + 16; + int buf_stride = ((width_ext + 3) & ~3) + 16; // Don't filter tiles with dimensions < 5 on any axis if ((width < 5) || (height < 5)) return; + uint16_t *dgd0 = dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ; if (r == 1) { - highbd_selfguided_restoration_1_v(dgd, width, height, stride, A, B, - buf_stride); - selfguided_restoration_1_h(A, B, width, height, buf_stride, eps, bit_depth); + highbd_selfguided_restoration_1_v(dgd0, width_ext, height_ext, dgd_stride, + A, B, buf_stride); + selfguided_restoration_1_h(A, B, width_ext, height_ext, buf_stride, eps, + bit_depth); } else if (r == 2) { - highbd_selfguided_restoration_2_v(dgd, width, height, stride, A, B, - buf_stride); - selfguided_restoration_2_h(A, B, width, height, buf_stride, eps, bit_depth); + highbd_selfguided_restoration_2_v(dgd0, width_ext, height_ext, dgd_stride, + A, B, buf_stride); + selfguided_restoration_2_h(A, B, width_ext, height_ext, buf_stride, eps, + bit_depth); } else if (r == 3) { - highbd_selfguided_restoration_3_v(dgd, width, height, stride, A, B, - buf_stride); - selfguided_restoration_3_h(A, B, width, height, buf_stride, eps, bit_depth); + highbd_selfguided_restoration_3_v(dgd0, width_ext, height_ext, dgd_stride, + A, B, buf_stride); + selfguided_restoration_3_h(A, B, width_ext, height_ext, buf_stride, eps, + bit_depth); } else { assert(0); } + A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; { i = 0; j = 0; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] + @@ -1412,7 +1427,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, } for (j = 1; j < width - 1; ++j) { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] + @@ -1425,7 +1440,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, j = width - 1; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] + @@ -1440,7 +1455,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, j = 0; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) + @@ -1456,7 +1471,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, // Vectorize the innermost loop for (j = 1; j < width - 1; j += 4) { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 5; @@ -1509,7 +1524,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, // (typically have 2 such pixels, but may have anywhere between 0 and 3) for (; j < width - 1; ++j) { const int k = i * 
buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 5; const int32_t a = @@ -1531,7 +1546,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, j = width - 1; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) + @@ -1550,7 +1565,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, j = 0; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] + @@ -1562,7 +1577,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, } for (j = 1; j < width - 1; ++j) { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] + @@ -1575,7 +1590,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, j = width - 1; { const int k = i * buf_stride + j; - const int l = i * stride + j; + const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 3; const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] + @@ -1725,7 +1740,6 @@ void apply_selfguided_restoration_highbd_sse4_1( int xq[2]; int32_t *flt1 = tmpbuf; int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX; - int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX; int i, j; assert(width * height <= RESTORATION_TILEPELS_MAX); #if USE_HIGHPASS_IN_SGRPROJ @@ -1735,11 +1749,11 @@ void apply_selfguided_restoration_highbd_sse4_1( #else av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt1, width, bit_depth, sgr_params[eps].r1, - sgr_params[eps].e1, tmpbuf2); + sgr_params[eps].e1); #endif // USE_HIGHPASS_IN_SGRPROJ av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt2, width, bit_depth, sgr_params[eps].r2, - sgr_params[eps].e2, tmpbuf2); + sgr_params[eps].e2); decode_xq(xqd, xq); __m128i xq0 = _mm_set1_epi32(xq[0]); diff --git a/third_party/aom/av1/common/x86/warp_plane_sse2.c b/third_party/aom/av1/common/x86/warp_plane_sse2.c index 5a22d9abf..d30466ae6 100644 --- a/third_party/aom/av1/common/x86/warp_plane_sse2.c +++ b/third_party/aom/av1/common/x86/warp_plane_sse2.c @@ -24,6 +24,20 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width, __m128i tmp[15]; int i, j, k; const int bd = 8; +#if CONFIG_CONVOLVE_ROUND + const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; + const int reduce_bits_horiz = + use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; + const int offset_bits_horiz = + use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1; + if (use_conv_params) { + conv_params->do_post_rounding = 1; + } + assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS); +#else + const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS; + const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1; +#endif /* Note: For this code to work, the left/right frame borders need to be extended by at least 13 pixels each. 
By the time we get here, other @@ -39,30 +53,17 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width, for (i = 0; i < p_height; i += 8) { for (j = 0; j < p_width; j += 8) { - // (x, y) coordinates of the center of this block in the destination - // image - const int32_t dst_x = p_col + j + 4; - const int32_t dst_y = p_row + i + 4; - - int32_t x4, y4, ix4, sx4, iy4, sy4; - if (subsampling_x) - x4 = (mat[2] * 4 * dst_x + mat[3] * 4 * dst_y + mat[0] * 2 + - (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) / - 4; - else - x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0]; - - if (subsampling_y) - y4 = (mat[4] * 4 * dst_x + mat[5] * 4 * dst_y + mat[1] * 2 + - (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) / - 4; - else - y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1]; - - ix4 = x4 >> WARPEDMODEL_PREC_BITS; - sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); - iy4 = y4 >> WARPEDMODEL_PREC_BITS; - sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; + const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; + const int32_t x4 = dst_x >> subsampling_x; + const int32_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); // Add in all the constant terms, including rounding and offset sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + @@ -149,9 +150,8 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width, // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6 const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); - const __m128i round_const = - _mm_set1_epi32((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) + - ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1)); + const __m128i round_const = _mm_set1_epi32( + (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1)); // Calculate filtered results const __m128i src_0 = _mm_unpacklo_epi8(src, zero); @@ -165,8 +165,8 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width, __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); - res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), - HORSHEAR_REDUCE_PREC_BITS); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const), + _mm_cvtsi32_si128(reduce_bits_horiz)); // Filter odd-index pixels const __m128i tmp_1 = _mm_loadu_si128( @@ -203,8 +203,8 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width, __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); - res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), - HORSHEAR_REDUCE_PREC_BITS); + res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), + _mm_cvtsi32_si128(reduce_bits_horiz)); // Combine results into one register. // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7 @@ -295,37 +295,63 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width, _mm_add_epi32(res_5, res_7)); // Rearrange pixels back into the order 0 ... 
7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - // Round and pack into 8 bits - const __m128i round_const = - _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) + - ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1)); - - const __m128i res_lo_round = _mm_srai_epi32( - _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS); - const __m128i res_hi_round = _mm_srai_epi32( - _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS); - - const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); - __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); - - // Store, blending with 'pred' if needed - __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; - - // Note: If we're outputting a 4x4 block, we need to be very careful - // to only output 4 pixels at this point, to avoid encode/decode - // mismatches when encoding with multiple threads. - if (p_width == 4) { - if (comp_avg) { - const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p); - res_8bit = _mm_avg_epu8(res_8bit, orig); + __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + +#if CONFIG_CONVOLVE_ROUND + if (use_conv_params) { + __m128i *const p = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j]; + const __m128i round_const = _mm_set1_epi32( + -(1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)) + + ((1 << (conv_params->round_1)) >> 1)); + res_lo = _mm_add_epi32(res_lo, round_const); + res_lo = + _mm_srl_epi16(res_lo, _mm_cvtsi32_si128(conv_params->round_1)); + if (comp_avg) res_lo = _mm_add_epi32(_mm_loadu_si128(p), res_lo); + _mm_storeu_si128(p, res_lo); + if (p_width > 4) { + res_hi = _mm_add_epi32(res_hi, round_const); + res_hi = + _mm_srl_epi16(res_hi, _mm_cvtsi32_si128(conv_params->round_1)); + if (comp_avg) + res_hi = _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi); + _mm_storeu_si128(p + 1, res_hi); } - *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit); } else { - if (comp_avg) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p)); - _mm_storel_epi64(p, res_8bit); +#else + { +#endif + // Round and pack into 8 bits + const __m128i round_const = + _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) + + ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1)); + + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS); + + const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); + + // Store, blending with 'pred' if needed + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + + // Note: If we're outputting a 4x4 block, we need to be very careful + // to only output 4 pixels at this point, to avoid encode/decode + // mismatches when encoding with multiple threads. 
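/* Editor's aside (not part of the patch): a scalar sketch of what the two
 * store paths above compute for one output pixel.  "sum" stands for one
 * 32-bit vertically filtered value; the helper names and the explicit
 * bit-width parameters are illustrative only, not the library API. */
#include <stdint.h>

static uint8_t clip_u8(int32_t v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* CONFIG_CONVOLVE_ROUND path: keep a wide intermediate in conv_params->dst,
 * accumulating on top of the stored value when comp_avg is set. */
static void store_conv_round(int32_t sum, int32_t *conv_dst, int comp_avg,
                             int bd, int filter_bits, int round_0,
                             int round_1) {
  const int32_t round_const =
      -(1 << (bd + 2 * filter_bits - round_0 - 1)) + ((1 << round_1) >> 1);
  int32_t v = (sum + round_const) >> round_1; /* the SIMD code shifts logically */
  if (comp_avg) v += *conv_dst;
  *conv_dst = v;
}

/* Default path: round by VERSHEAR_REDUCE_PREC_BITS, clamp to 8 bits and,
 * for compound prediction, average with the existing pred pixel the same
 * way _mm_avg_epu8 does: (a + b + 1) >> 1. */
static void store_8bit(int32_t sum, uint8_t *pred, int comp_avg, int bd,
                       int vershear_bits) {
  const int32_t round_const =
      -(1 << (bd + vershear_bits - 1)) + ((1 << vershear_bits) >> 1);
  const uint8_t px = clip_u8((sum + round_const) >> vershear_bits);
  *pred = comp_avg ? (uint8_t)((px + *pred + 1) >> 1) : px;
}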
+ if (p_width == 4) { + if (comp_avg) { + const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p); + res_8bit = _mm_avg_epu8(res_8bit, orig); + } + *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit); + } else { + if (comp_avg) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p)); + _mm_storel_epi64(p, res_8bit); + } } } } diff --git a/third_party/aom/av1/common/x86/warp_plane_ssse3.c b/third_party/aom/av1/common/x86/warp_plane_ssse3.c index f8e6f62ba..3986ad389 100644 --- a/third_party/aom/av1/common/x86/warp_plane_ssse3.c +++ b/third_party/aom/av1/common/x86/warp_plane_ssse3.c @@ -211,6 +211,20 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width, __m128i tmp[15]; int i, j, k; const int bd = 8; +#if CONFIG_CONVOLVE_ROUND + const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; + const int reduce_bits_horiz = + use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; + const int offset_bits_horiz = + use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1; + if (use_conv_params) { + conv_params->do_post_rounding = 1; + } + assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS); +#else + const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS; + const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1; +#endif /* Note: For this code to work, the left/right frame borders need to be extended by at least 13 pixels each. By the time we get here, other @@ -226,30 +240,17 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width, for (i = 0; i < p_height; i += 8) { for (j = 0; j < p_width; j += 8) { - // (x, y) coordinates of the center of this block in the destination - // image - const int32_t dst_x = p_col + j + 4; - const int32_t dst_y = p_row + i + 4; - - int32_t x4, y4, ix4, sx4, iy4, sy4; - if (subsampling_x) - x4 = (mat[2] * 4 * dst_x + mat[3] * 4 * dst_y + mat[0] * 2 + - (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) / - 4; - else - x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0]; - - if (subsampling_y) - y4 = (mat[4] * 4 * dst_x + mat[5] * 4 * dst_y + mat[1] * 2 + - (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) / - 4; - else - y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1]; - - ix4 = x4 >> WARPEDMODEL_PREC_BITS; - sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); - iy4 = y4 >> WARPEDMODEL_PREC_BITS; - sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; + const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; + const int32_t x4 = dst_x >> subsampling_x; + const int32_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); // Add in all the constant terms, including rounding and offset sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + @@ -369,9 +370,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width, _mm_srli_si128(src_odd, 4), _mm_srli_si128(src_even, 6)); const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57); - const __m128i round_const = - _mm_set1_epi16((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) + - ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1)); + const __m128i round_const = _mm_set1_epi16( + (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1)); // Note: The values res_02 + 
res_46 and res_13 + res_57 both // fit into int16s at this point, but their sum may be too wide to fit @@ -385,7 +385,7 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width, const __m128i res_odd = _mm_add_epi16(res_13, res_57); const __m128i res = _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const); - tmp[k + 7] = _mm_srli_epi16(res, HORSHEAR_REDUCE_PREC_BITS); + tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz)); } } @@ -471,37 +471,63 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width, _mm_add_epi32(res_5, res_7)); // Rearrange pixels back into the order 0 ... 7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - // Round and pack into 8 bits - const __m128i round_const = - _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) + - ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1)); - - const __m128i res_lo_round = _mm_srai_epi32( - _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS); - const __m128i res_hi_round = _mm_srai_epi32( - _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS); - - const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); - __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); - - // Store, blending with 'pred' if needed - __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; - - // Note: If we're outputting a 4x4 block, we need to be very careful - // to only output 4 pixels at this point, to avoid encode/decode - // mismatches when encoding with multiple threads. - if (p_width == 4) { - if (comp_avg) { - const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p); - res_8bit = _mm_avg_epu8(res_8bit, orig); + __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + +#if CONFIG_CONVOLVE_ROUND + if (use_conv_params) { + __m128i *const p = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j]; + const __m128i round_const = _mm_set1_epi32( + -(1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)) + + ((1 << (conv_params->round_1)) >> 1)); + res_lo = _mm_add_epi32(res_lo, round_const); + res_lo = + _mm_srl_epi16(res_lo, _mm_cvtsi32_si128(conv_params->round_1)); + if (comp_avg) res_lo = _mm_add_epi32(_mm_loadu_si128(p), res_lo); + _mm_storeu_si128(p, res_lo); + if (p_width > 4) { + res_hi = _mm_add_epi32(res_hi, round_const); + res_hi = + _mm_srl_epi16(res_hi, _mm_cvtsi32_si128(conv_params->round_1)); + if (comp_avg) + res_hi = _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi); + _mm_storeu_si128(p + 1, res_hi); } - *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit); } else { - if (comp_avg) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p)); - _mm_storel_epi64(p, res_8bit); +#else + { +#endif + // Round and pack into 8 bits + const __m128i round_const = + _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) + + ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1)); + + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS); + + const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); + + // Store, blending with 'pred' if needed + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + + // Note: If we're outputting a 4x4 block, we need to be very 
careful + // to only output 4 pixels at this point, to avoid encode/decode + // mismatches when encoding with multiple threads. + if (p_width == 4) { + if (comp_avg) { + const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p); + res_8bit = _mm_avg_epu8(res_8bit, orig); + } + *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit); + } else { + if (comp_avg) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p)); + _mm_storel_epi64(p, res_8bit); + } } } } |
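For reference, both warp_plane_sse2.c and warp_plane_ssse3.c now replace the per-axis subsampling branches with the same block-centre projection. Below is a minimal scalar restatement of that mapping under stated assumptions: the struct and helper are hypothetical, and WARPEDMODEL_PREC_BITS is assumed to be the 16-bit warp-model precision used elsewhere in av1.

#include <stdint.h>

#define WARPEDMODEL_PREC_BITS 16 /* assumed value of the av1 constant */

typedef struct {
  int32_t ix4, iy4; /* integer source sample position of the block centre */
  int32_t sx4, sy4; /* fractional part, in WARPEDMODEL_PREC_BITS precision */
} WarpBlockPos;

static WarpBlockPos warp_block_pos(const int32_t *mat, int p_col, int p_row,
                                   int i, int j, int subsampling_x,
                                   int subsampling_y) {
  /* Centre of the 8x8 block, scaled up to luma units if the plane is
   * subsampled, projected through the affine model, then scaled back. */
  const int32_t src_x = (p_col + j + 4) << subsampling_x;
  const int32_t src_y = (p_row + i + 4) << subsampling_y;
  const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
  const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
  const int32_t x4 = dst_x >> subsampling_x;
  const int32_t y4 = dst_y >> subsampling_y;

  WarpBlockPos pos;
  pos.ix4 = x4 >> WARPEDMODEL_PREC_BITS;
  pos.sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
  pos.iy4 = y4 >> WARPEDMODEL_PREC_BITS;
  pos.sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
  return pos;
}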