author     trav90 <travawine@palemoon.org>    2018-10-17 05:59:08 -0500
committer  trav90 <travawine@palemoon.org>    2018-10-17 05:59:08 -0500
commit     df9477dfa60ebb5d31bc142e58ce46535c17abce
tree       c4fdd5d1b09d08c0514f208246260fc87372cb56 /third_party/aom/av1/common/x86
parent     0cc51bc106250988cc3b89cb5d743a5af52cd35a
Update aom to slightly newer commit ID
Diffstat (limited to 'third_party/aom/av1/common/x86')
-rw-r--r-- | third_party/aom/av1/common/x86/av1_convolve_ssse3.c       |  15
-rw-r--r-- | third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c      |  28
-rw-r--r-- | third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c      |  26
-rw-r--r-- | third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c |   8
-rw-r--r-- | third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c     | 222
-rw-r--r-- | third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c     | 384
-rw-r--r-- | third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h |  11
-rw-r--r-- | third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c  | 344
-rw-r--r-- | third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c     |  74
-rw-r--r-- | third_party/aom/av1/common/x86/idct_intrin_sse2.c         |   8
-rw-r--r-- | third_party/aom/av1/common/x86/warp_plane_sse2.c          | 369
-rw-r--r-- | third_party/aom/av1/common/x86/warp_plane_ssse3.c         | 508
12 files changed, 1319 insertions, 678 deletions
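Note: two mechanical changes recur throughout the hunks below. First, the per-bit cosine tables are now obtained through the `cospi_arr(bit)` accessor instead of by indexing `cospi_arr[bit - cos_bit_min]`. Second, the half-butterfly helpers (`half_btf_sse4_1`, `half_btf_avx2`) now take their vector operands by `const` pointer instead of by value. The sketch below illustrates only the second change; it is a simplified stand-in, not the actual aom code, and the `half_btf_by_value` / `half_btf_by_pointer` names are placeholders for the old and new signatures.

```c
/* Minimal sketch of the half-butterfly signature change seen in this patch.
 * Compile with SSE4.1 enabled (e.g. gcc -msse4.1). */
#include <smmintrin.h> /* SSE4.1: _mm_mullo_epi32 */
#include <stdio.h>

/* Old style: five __m128i arguments passed by value. */
static inline __m128i half_btf_by_value(__m128i w0, __m128i n0, __m128i w1,
                                        __m128i n1, __m128i rounding, int bit) {
  __m128i x = _mm_mullo_epi32(w0, n0);
  __m128i y = _mm_mullo_epi32(w1, n1);
  x = _mm_add_epi32(x, y);
  x = _mm_add_epi32(x, rounding);
  return _mm_srai_epi32(x, bit);
}

/* New style (as in the patched helpers): operands passed by const pointer. */
static inline __m128i half_btf_by_pointer(const __m128i *w0, const __m128i *n0,
                                          const __m128i *w1, const __m128i *n1,
                                          const __m128i *rounding, int bit) {
  __m128i x = _mm_mullo_epi32(*w0, *n0);
  __m128i y = _mm_mullo_epi32(*w1, *n1);
  x = _mm_add_epi32(x, y);
  x = _mm_add_epi32(x, *rounding);
  return _mm_srai_epi32(x, bit);
}

int main(void) {
  const __m128i w0 = _mm_set1_epi32(181);  /* arbitrary cospi-like weights */
  const __m128i w1 = _mm_set1_epi32(-181);
  const __m128i n0 = _mm_set_epi32(4, 3, 2, 1);
  const __m128i n1 = _mm_set_epi32(8, 7, 6, 5);
  const int bit = 12;
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));

  /* Both variants compute round_shift(w0*n0 + w1*n1, bit) per 32-bit lane. */
  __m128i a = half_btf_by_value(w0, n0, w1, n1, rounding, bit);
  __m128i b = half_btf_by_pointer(&w0, &n0, &w1, &n1, &rounding, bit);

  int ra[4], rb[4];
  _mm_storeu_si128((__m128i *)ra, a);
  _mm_storeu_si128((__m128i *)rb, b);
  for (int i = 0; i < 4; ++i)
    printf("lane %d: by-value %d, by-pointer %d\n", i, ra[i], rb[i]);
  return 0;
}
```

A likely motivation for the by-pointer form (an assumption, not stated in the commit) is that some 32-bit compilers, notably MSVC, refuse to pass more than a few 16-byte-aligned `__m128i`/`__m256i` arguments by value, so pointer parameters keep the helpers portable.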
diff --git a/third_party/aom/av1/common/x86/av1_convolve_ssse3.c b/third_party/aom/av1/common/x86/av1_convolve_ssse3.c index 91102bbaf..5e627ebcf 100644 --- a/third_party/aom/av1/common/x86/av1_convolve_ssse3.c +++ b/third_party/aom/av1/common/x86/av1_convolve_ssse3.c @@ -19,13 +19,13 @@ #define WIDTH_BOUND (16) #define HEIGHT_BOUND (16) -#if CONFIG_DUAL_FILTER +#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER DECLARE_ALIGNED(16, static int8_t, sub_pel_filters_12sharp_signal_dir[15][2][16]); DECLARE_ALIGNED(16, static int8_t, sub_pel_filters_12sharp_ver_signal_dir[15][6][16]); -#endif // CONFIG_DUAL_FILTER +#endif // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER #if USE_TEMPORALFILTER_12TAP DECLARE_ALIGNED(16, static int8_t, @@ -39,7 +39,7 @@ typedef int8_t (*SubpelFilterCoeffs)[16]; static INLINE SubpelFilterCoeffs get_subpel_filter_signal_dir(const InterpFilterParams p, int index) { -#if CONFIG_DUAL_FILTER +#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER if (p.interp_filter == MULTITAP_SHARP) { return &sub_pel_filters_12sharp_signal_dir[index][0]; } @@ -56,7 +56,7 @@ get_subpel_filter_signal_dir(const InterpFilterParams p, int index) { static INLINE SubpelFilterCoeffs get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) { -#if CONFIG_DUAL_FILTER +#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER if (p.interp_filter == MULTITAP_SHARP) { return &sub_pel_filters_12sharp_ver_signal_dir[index][0]; } @@ -143,6 +143,7 @@ static void horiz_w4_ssse3(const uint8_t *src, const __m128i *f, int tapsNum, const __m128i k_256 = _mm_set1_epi16(1 << 8); const __m128i zero = _mm_setzero_si128(); + assert(tapsNum == 10 || tapsNum == 12); if (10 == tapsNum) { src -= 1; } @@ -470,6 +471,7 @@ static void filter_horiz_v8p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, __m128i min_x2x3, max_x2x3; __m128i temp; + assert(tapsNum == 10 || tapsNum == 12); if (tapsNum == 10) { src_ptr -= 1; } @@ -612,6 +614,7 @@ static void filter_horiz_v4p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, __m128i x0, x1, x2, x3, x4, x5; __m128i min_x2x3, max_x2x3, temp; + assert(tapsNum == 10 || tapsNum == 12); if (tapsNum == 10) { src_ptr -= 1; } @@ -982,7 +985,7 @@ typedef struct SimdFilter { int8_t (*simd_vert_filter)[6][16]; } SimdFilter; -#if CONFIG_DUAL_FILTER +#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER #define MULTITAP_FILTER_NUM 1 SimdFilter simd_filters[MULTITAP_FILTER_NUM] = { { MULTITAP_SHARP, &sub_pel_filters_12sharp_signal_dir[0], @@ -1010,7 +1013,7 @@ void av1_lowbd_convolve_init_ssse3(void) { temporal_simd_filter.simd_vert_filter); } #endif -#if CONFIG_DUAL_FILTER +#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER { int i; for (i = 0; i < MULTITAP_FILTER_NUM; ++i) { diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c index d04b667f1..97d2e74b1 100644 --- a/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c +++ b/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c @@ -57,7 +57,7 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, // stage 2 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); buf0[0] = _mm_add_epi32(buf1[0], buf1[15]); buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]); buf0[1] = _mm_add_epi32(buf1[1], buf1[14]); @@ -94,7 +94,7 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, // stage 3 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); buf1[0] = _mm_add_epi32(buf0[0], buf0[7]); buf1[7] = 
_mm_sub_epi32(buf0[0], buf0[7]); buf1[1] = _mm_add_epi32(buf0[1], buf0[6]); @@ -131,7 +131,7 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, // stage 4 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); buf0[0] = _mm_add_epi32(buf1[0], buf1[3]); buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]); buf0[1] = _mm_add_epi32(buf1[1], buf1[2]); @@ -168,7 +168,7 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, // stage 5 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1], bit); btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], @@ -205,7 +205,7 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, // stage 6 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); buf0[0] = buf1[0]; buf0[1] = buf1[1]; buf0[2] = buf1[2]; @@ -242,7 +242,7 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, // stage 7 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); buf1[0] = buf0[0]; buf1[1] = buf0[1]; buf1[2] = buf0[2]; @@ -279,7 +279,7 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, // stage 8 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); buf0[0] = buf1[0]; buf0[1] = buf1[1]; buf0[2] = buf1[2]; @@ -383,7 +383,7 @@ void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output, // stage 2 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1], bit); btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2], @@ -399,7 +399,7 @@ void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output, // stage 4 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); buf0[0] = buf1[0]; buf0[1] = buf1[1]; btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], @@ -475,7 +475,7 @@ void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output, // stage 2 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); btf_32_sse4_1_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1], bit); btf_32_sse4_1_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3], @@ -547,7 +547,7 @@ void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output, // stage 4 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); buf0[0] = buf1[0]; buf0[1] = buf1[1]; buf0[2] = buf1[2]; @@ -619,7 +619,7 @@ void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output, // stage 6 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); buf0[0] = buf1[0]; buf0[1] = buf1[1]; buf0[2] = buf1[2]; @@ -691,7 +691,7 @@ void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output, // stage 8 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); buf0[0] = buf1[0]; buf0[1] = buf1[1]; buf0[2] = buf1[2]; @@ -763,7 +763,7 @@ void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output, // stage 10 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); 
buf0[0] = buf1[0]; buf0[1] = buf1[1]; btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c index 78c261374..1d7c55349 100644 --- a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c +++ b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c @@ -37,16 +37,20 @@ static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { } static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, - const int stride, const TXFM_2D_CFG *cfg, + const int stride, + const TXFM_2D_FLIP_CFG *cfg, int32_t *txfm_buf) { - const int txfm_size = cfg->txfm_size; - const int8_t *shift = cfg->shift; - const int8_t *stage_range_col = cfg->stage_range_col; - const int8_t *stage_range_row = cfg->stage_range_row; - const int8_t *cos_bit_col = cfg->cos_bit_col; - const int8_t *cos_bit_row = cfg->cos_bit_row; - const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); - const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row); + // TODO(sarahparker) must correct for rectangular transforms in follow up + const int txfm_size = cfg->row_cfg->txfm_size; + const int8_t *shift = cfg->row_cfg->shift; + const int8_t *stage_range_col = cfg->col_cfg->stage_range; + const int8_t *stage_range_row = cfg->row_cfg->stage_range; + const int8_t *cos_bit_col = cfg->col_cfg->cos_bit; + const int8_t *cos_bit_row = cfg->row_cfg->cos_bit; + const TxfmFuncSSE2 txfm_func_col = + fwd_txfm_type_to_func(cfg->col_cfg->txfm_type); + const TxfmFuncSSE2 txfm_func_row = + fwd_txfm_type_to_func(cfg->row_cfg->txfm_type); __m128i *buf_128 = (__m128i *)txfm_buf; __m128i *out_128 = (__m128i *)output; @@ -69,7 +73,7 @@ void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]); TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_32X32); (void)bd; - fwd_txfm2d_sse4_1(input, output, stride, cfg.cfg, txfm_buf); + fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf); } void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, @@ -77,5 +81,5 @@ void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]); TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_64x64_cfg(tx_type); (void)bd; - fwd_txfm2d_sse4_1(input, output, stride, cfg.cfg, txfm_buf); + fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf); } diff --git a/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c index cf6249bdc..68461bc36 100644 --- a/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c +++ b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c @@ -15,7 +15,7 @@ #include "./av1_rtcd.h" #include "av1/common/filter.h" -#if CONFIG_DUAL_FILTER +#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER DECLARE_ALIGNED(16, static int16_t, subpel_filters_sharp[15][6][8]); #endif @@ -31,7 +31,7 @@ typedef void (*TransposeSave)(int width, int pixelsNum, uint32_t *src, static INLINE HbdSubpelFilterCoeffs hbd_get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) { -#if CONFIG_DUAL_FILTER +#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER if (p.interp_filter == MULTITAP_SHARP) { return &subpel_filters_sharp[index][0]; } @@ -76,7 +76,7 @@ void av1_highbd_convolve_init_sse4_1(void) { init_simd_filter(filter_ptr, taps, subpel_temporalfilter); } #endif -#if CONFIG_DUAL_FILTER +#if CONFIG_DUAL_FILTER && 
USE_EXTRA_FILTER { InterpFilterParams filter_params = av1_get_interp_filter_params(MULTITAP_SHARP); @@ -246,6 +246,7 @@ static void highbd_filter_horiz(const uint16_t *src, int src_stride, __m128i *f, int tapsNum, uint32_t *buf) { __m128i u[8], v[6]; + assert(tapsNum == 10 || tapsNum == 12); if (tapsNum == 10) { src -= 1; } @@ -412,6 +413,7 @@ static void filter_vert_horiz_parallel(const uint16_t *src, int src_stride, int r = 0; // TODO(luoyi) treat s[12] as a circular buffer in width = 2 case + assert(taps == 10 || taps == 12); if (10 == taps) { i += 1; s[0] = zero; diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c index d10f1ccc2..dd2a681bc 100644 --- a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c +++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c @@ -13,7 +13,7 @@ #include "./av1_rtcd.h" #include "./aom_config.h" -#include "av1/common/av1_inv_txfm2d_cfg.h" +#include "av1/common/av1_inv_txfm1d_cfg.h" // Note: // Total 32x4 registers to represent 32x32 block coefficients. @@ -154,20 +154,21 @@ static void write_buffer_32x32(__m256i *in, uint16_t *output, int stride, } } -static INLINE __m256i half_btf_avx2(__m256i w0, __m256i n0, __m256i w1, - __m256i n1, __m256i rounding, int bit) { +static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0, + const __m256i *w1, const __m256i *n1, + const __m256i *rounding, int bit) { __m256i x, y; - x = _mm256_mullo_epi32(w0, n0); - y = _mm256_mullo_epi32(w1, n1); + x = _mm256_mullo_epi32(*w0, *n0); + y = _mm256_mullo_epi32(*w1, *n1); x = _mm256_add_epi32(x, y); - x = _mm256_add_epi32(x, rounding); + x = _mm256_add_epi32(x, *rounding); x = _mm256_srai_epi32(x, bit); return x; } static void idct32_avx2(__m256i *in, __m256i *out, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); @@ -275,22 +276,38 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { bf0[13] = bf1[13]; bf0[14] = bf1[14]; bf0[15] = bf1[15]; - bf0[16] = half_btf_avx2(cospi62, bf1[16], cospim2, bf1[31], rounding, bit); - bf0[17] = half_btf_avx2(cospi30, bf1[17], cospim34, bf1[30], rounding, bit); - bf0[18] = half_btf_avx2(cospi46, bf1[18], cospim18, bf1[29], rounding, bit); - bf0[19] = half_btf_avx2(cospi14, bf1[19], cospim50, bf1[28], rounding, bit); - bf0[20] = half_btf_avx2(cospi54, bf1[20], cospim10, bf1[27], rounding, bit); - bf0[21] = half_btf_avx2(cospi22, bf1[21], cospim42, bf1[26], rounding, bit); - bf0[22] = half_btf_avx2(cospi38, bf1[22], cospim26, bf1[25], rounding, bit); - bf0[23] = half_btf_avx2(cospi6, bf1[23], cospim58, bf1[24], rounding, bit); - bf0[24] = half_btf_avx2(cospi58, bf1[23], cospi6, bf1[24], rounding, bit); - bf0[25] = half_btf_avx2(cospi26, bf1[22], cospi38, bf1[25], rounding, bit); - bf0[26] = half_btf_avx2(cospi42, bf1[21], cospi22, bf1[26], rounding, bit); - bf0[27] = half_btf_avx2(cospi10, bf1[20], cospi54, bf1[27], rounding, bit); - bf0[28] = half_btf_avx2(cospi50, bf1[19], cospi14, bf1[28], rounding, bit); - bf0[29] = half_btf_avx2(cospi18, bf1[18], cospi46, bf1[29], rounding, bit); - bf0[30] = half_btf_avx2(cospi34, bf1[17], cospi30, bf1[30], rounding, bit); - bf0[31] = half_btf_avx2(cospi2, bf1[16], cospi62, bf1[31], rounding, bit); + bf0[16] = + half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit); + 
bf0[17] = + half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit); + bf0[20] = + half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit); + bf0[23] = + half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit); + bf0[24] = + half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit); + bf0[25] = + half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit); + bf0[30] = + half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit); + bf0[31] = + half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit); // stage 3 bf1[0] = bf0[0]; @@ -301,14 +318,22 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = bf0[7]; - bf1[8] = half_btf_avx2(cospi60, bf0[8], cospim4, bf0[15], rounding, bit); - bf1[9] = half_btf_avx2(cospi28, bf0[9], cospim36, bf0[14], rounding, bit); - bf1[10] = half_btf_avx2(cospi44, bf0[10], cospim20, bf0[13], rounding, bit); - bf1[11] = half_btf_avx2(cospi12, bf0[11], cospim52, bf0[12], rounding, bit); - bf1[12] = half_btf_avx2(cospi52, bf0[11], cospi12, bf0[12], rounding, bit); - bf1[13] = half_btf_avx2(cospi20, bf0[10], cospi44, bf0[13], rounding, bit); - bf1[14] = half_btf_avx2(cospi36, bf0[9], cospi28, bf0[14], rounding, bit); - bf1[15] = half_btf_avx2(cospi4, bf0[8], cospi60, bf0[15], rounding, bit); + bf1[8] = + half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit); + bf1[9] = + half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit); + bf1[10] = + half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit); + bf1[15] = + half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit); bf1[16] = _mm256_add_epi32(bf0[16], bf0[17]); bf1[17] = _mm256_sub_epi32(bf0[16], bf0[17]); bf1[18] = _mm256_sub_epi32(bf0[19], bf0[18]); @@ -331,10 +356,13 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { bf0[1] = bf1[1]; bf0[2] = bf1[2]; bf0[3] = bf1[3]; - bf0[4] = half_btf_avx2(cospi56, bf1[4], cospim8, bf1[7], rounding, bit); - bf0[5] = half_btf_avx2(cospi24, bf1[5], cospim40, bf1[6], rounding, bit); - bf0[6] = half_btf_avx2(cospi40, bf1[5], cospi24, bf1[6], rounding, bit); - bf0[7] = half_btf_avx2(cospi8, bf1[4], cospi56, bf1[7], rounding, bit); + bf0[4] = + half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit); + bf0[5] = + half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, 
bit); + bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit); bf0[8] = _mm256_add_epi32(bf1[8], bf1[9]); bf0[9] = _mm256_sub_epi32(bf1[8], bf1[9]); bf0[10] = _mm256_sub_epi32(bf1[11], bf1[10]); @@ -344,40 +372,54 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { bf0[14] = _mm256_sub_epi32(bf1[15], bf1[14]); bf0[15] = _mm256_add_epi32(bf1[14], bf1[15]); bf0[16] = bf1[16]; - bf0[17] = half_btf_avx2(cospim8, bf1[17], cospi56, bf1[30], rounding, bit); - bf0[18] = half_btf_avx2(cospim56, bf1[18], cospim8, bf1[29], rounding, bit); + bf0[17] = + half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit); bf0[19] = bf1[19]; bf0[20] = bf1[20]; - bf0[21] = half_btf_avx2(cospim40, bf1[21], cospi24, bf1[26], rounding, bit); + bf0[21] = + half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit); bf0[22] = - half_btf_avx2(cospim24, bf1[22], cospim40, bf1[25], rounding, bit); + half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit); bf0[23] = bf1[23]; bf0[24] = bf1[24]; - bf0[25] = half_btf_avx2(cospim40, bf1[22], cospi24, bf1[25], rounding, bit); - bf0[26] = half_btf_avx2(cospi24, bf1[21], cospi40, bf1[26], rounding, bit); + bf0[25] = + half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit); bf0[27] = bf1[27]; bf0[28] = bf1[28]; - bf0[29] = half_btf_avx2(cospim8, bf1[18], cospi56, bf1[29], rounding, bit); - bf0[30] = half_btf_avx2(cospi56, bf1[17], cospi8, bf1[30], rounding, bit); + bf0[29] = + half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit); + bf0[30] = + half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit); bf0[31] = bf1[31]; // stage 5 - bf1[0] = half_btf_avx2(cospi32, bf0[0], cospi32, bf0[1], rounding, bit); - bf1[1] = half_btf_avx2(cospi32, bf0[0], cospim32, bf0[1], rounding, bit); - bf1[2] = half_btf_avx2(cospi48, bf0[2], cospim16, bf0[3], rounding, bit); - bf1[3] = half_btf_avx2(cospi16, bf0[2], cospi48, bf0[3], rounding, bit); + bf1[0] = + half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit); + bf1[1] = + half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit); + bf1[2] = + half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit); + bf1[3] = + half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit); bf1[4] = _mm256_add_epi32(bf0[4], bf0[5]); bf1[5] = _mm256_sub_epi32(bf0[4], bf0[5]); bf1[6] = _mm256_sub_epi32(bf0[7], bf0[6]); bf1[7] = _mm256_add_epi32(bf0[6], bf0[7]); bf1[8] = bf0[8]; - bf1[9] = half_btf_avx2(cospim16, bf0[9], cospi48, bf0[14], rounding, bit); + bf1[9] = + half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit); bf1[10] = - half_btf_avx2(cospim48, bf0[10], cospim16, bf0[13], rounding, bit); + half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit); bf1[11] = bf0[11]; bf1[12] = bf0[12]; - bf1[13] = half_btf_avx2(cospim16, bf0[10], cospi48, bf0[13], rounding, bit); - bf1[14] = half_btf_avx2(cospi48, bf0[9], cospi16, bf0[14], rounding, bit); + bf1[13] = + half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit); bf1[15] = bf0[15]; bf1[16] = _mm256_add_epi32(bf0[16], bf0[19]); bf1[17] = _mm256_add_epi32(bf0[17], bf0[18]); @@ -402,8 +444,10 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { 
bf0[2] = _mm256_sub_epi32(bf1[1], bf1[2]); bf0[3] = _mm256_sub_epi32(bf1[0], bf1[3]); bf0[4] = bf1[4]; - bf0[5] = half_btf_avx2(cospim32, bf1[5], cospi32, bf1[6], rounding, bit); - bf0[6] = half_btf_avx2(cospi32, bf1[5], cospi32, bf1[6], rounding, bit); + bf0[5] = + half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); bf0[7] = bf1[7]; bf0[8] = _mm256_add_epi32(bf1[8], bf1[11]); bf0[9] = _mm256_add_epi32(bf1[9], bf1[10]); @@ -415,20 +459,26 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { bf0[15] = _mm256_add_epi32(bf1[12], bf1[15]); bf0[16] = bf1[16]; bf0[17] = bf1[17]; - bf0[18] = half_btf_avx2(cospim16, bf1[18], cospi48, bf1[29], rounding, bit); - bf0[19] = half_btf_avx2(cospim16, bf1[19], cospi48, bf1[28], rounding, bit); + bf0[18] = + half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit); bf0[20] = - half_btf_avx2(cospim48, bf1[20], cospim16, bf1[27], rounding, bit); + half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit); bf0[21] = - half_btf_avx2(cospim48, bf1[21], cospim16, bf1[26], rounding, bit); + half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit); bf0[22] = bf1[22]; bf0[23] = bf1[23]; bf0[24] = bf1[24]; bf0[25] = bf1[25]; - bf0[26] = half_btf_avx2(cospim16, bf1[21], cospi48, bf1[26], rounding, bit); - bf0[27] = half_btf_avx2(cospim16, bf1[20], cospi48, bf1[27], rounding, bit); - bf0[28] = half_btf_avx2(cospi48, bf1[19], cospi16, bf1[28], rounding, bit); - bf0[29] = half_btf_avx2(cospi48, bf1[18], cospi16, bf1[29], rounding, bit); + bf0[26] = + half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit); bf0[30] = bf1[30]; bf0[31] = bf1[31]; @@ -443,10 +493,14 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { bf1[7] = _mm256_sub_epi32(bf0[0], bf0[7]); bf1[8] = bf0[8]; bf1[9] = bf0[9]; - bf1[10] = half_btf_avx2(cospim32, bf0[10], cospi32, bf0[13], rounding, bit); - bf1[11] = half_btf_avx2(cospim32, bf0[11], cospi32, bf0[12], rounding, bit); - bf1[12] = half_btf_avx2(cospi32, bf0[11], cospi32, bf0[12], rounding, bit); - bf1[13] = half_btf_avx2(cospi32, bf0[10], cospi32, bf0[13], rounding, bit); + bf1[10] = + half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); bf1[14] = bf0[14]; bf1[15] = bf0[15]; bf1[16] = _mm256_add_epi32(bf0[16], bf0[23]); @@ -487,14 +541,22 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { bf0[17] = bf1[17]; bf0[18] = bf1[18]; bf0[19] = bf1[19]; - bf0[20] = half_btf_avx2(cospim32, bf1[20], cospi32, bf1[27], rounding, bit); - bf0[21] = half_btf_avx2(cospim32, bf1[21], cospi32, bf1[26], rounding, bit); - bf0[22] = half_btf_avx2(cospim32, bf1[22], cospi32, bf1[25], rounding, bit); - bf0[23] = half_btf_avx2(cospim32, bf1[23], cospi32, bf1[24], rounding, bit); - bf0[24] = half_btf_avx2(cospi32, bf1[23], cospi32, bf1[24], rounding, bit); - bf0[25] = 
half_btf_avx2(cospi32, bf1[22], cospi32, bf1[25], rounding, bit); - bf0[26] = half_btf_avx2(cospi32, bf1[21], cospi32, bf1[26], rounding, bit); - bf0[27] = half_btf_avx2(cospi32, bf1[20], cospi32, bf1[27], rounding, bit); + bf0[20] = + half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); + bf0[23] = + half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); + bf0[24] = + half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); + bf0[25] = + half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); bf0[28] = bf1[28]; bf0[29] = bf1[29]; bf0[30] = bf1[30]; @@ -539,18 +601,20 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { void av1_inv_txfm2d_add_32x32_avx2(const int32_t *coeff, uint16_t *output, int stride, int tx_type, int bd) { __m256i in[128], out[128]; - const TXFM_2D_CFG *cfg = NULL; + const TXFM_1D_CFG *row_cfg = NULL; + const TXFM_1D_CFG *col_cfg = NULL; switch (tx_type) { case DCT_DCT: - cfg = &inv_txfm_2d_cfg_dct_dct_32; + row_cfg = &inv_txfm_1d_row_cfg_dct_32; + col_cfg = &inv_txfm_1d_col_cfg_dct_32; load_buffer_32x32(coeff, in); transpose_32x32(in, out); - idct32_avx2(out, in, cfg->cos_bit_row[2]); - round_shift_32x32(in, -cfg->shift[0]); + idct32_avx2(out, in, row_cfg->cos_bit[2]); + round_shift_32x32(in, -row_cfg->shift[0]); transpose_32x32(in, out); - idct32_avx2(out, in, cfg->cos_bit_col[2]); - write_buffer_32x32(in, output, stride, 0, 0, -cfg->shift[1], bd); + idct32_avx2(out, in, col_cfg->cos_bit[2]); + write_buffer_32x32(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; default: assert(0); } diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c index 24b2760b9..a93699f0b 100644 --- a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c +++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c @@ -13,7 +13,7 @@ #include "./av1_rtcd.h" #include "./aom_config.h" -#include "av1/common/av1_inv_txfm2d_cfg.h" +#include "av1/common/av1_inv_txfm1d_cfg.h" #include "av1/common/x86/highbd_txfm_utility_sse4.h" static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) { @@ -24,7 +24,7 @@ static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) { } static void idct4x4_sse4_1(__m128i *in, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); @@ -72,7 +72,7 @@ static void idct4x4_sse4_1(__m128i *in, int bit) { } static void iadst4x4_sse4_1(__m128i *in, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); @@ -232,72 +232,82 @@ static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride, void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output, int stride, int tx_type, int bd) { __m128i in[4]; - const TXFM_2D_CFG *cfg = NULL; + const TXFM_1D_CFG 
*row_cfg = NULL; + const TXFM_1D_CFG *col_cfg = NULL; switch (tx_type) { case DCT_DCT: - cfg = &inv_txfm_2d_cfg_dct_dct_4; + row_cfg = &inv_txfm_1d_row_cfg_dct_4; + col_cfg = &inv_txfm_1d_col_cfg_dct_4; load_buffer_4x4(coeff, in); - idct4x4_sse4_1(in, cfg->cos_bit_row[2]); - idct4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd); + idct4x4_sse4_1(in, row_cfg->cos_bit[2]); + idct4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case ADST_DCT: - cfg = &inv_txfm_2d_cfg_adst_dct_4; + row_cfg = &inv_txfm_1d_row_cfg_dct_4; + col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - idct4x4_sse4_1(in, cfg->cos_bit_row[2]); - iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd); + idct4x4_sse4_1(in, row_cfg->cos_bit[2]); + iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case DCT_ADST: - cfg = &inv_txfm_2d_cfg_dct_adst_4; + row_cfg = &inv_txfm_1d_row_cfg_adst_4; + col_cfg = &inv_txfm_1d_col_cfg_dct_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); - idct4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd); + iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + idct4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case ADST_ADST: - cfg = &inv_txfm_2d_cfg_adst_adst_4; + row_cfg = &inv_txfm_1d_row_cfg_adst_4; + col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); - iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd); + iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; #if CONFIG_EXT_TX case FLIPADST_DCT: - cfg = &inv_txfm_2d_cfg_adst_dct_4; + row_cfg = &inv_txfm_1d_row_cfg_dct_4; + col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - idct4x4_sse4_1(in, cfg->cos_bit_row[2]); - iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 0, 1, -cfg->shift[1], bd); + idct4x4_sse4_1(in, row_cfg->cos_bit[2]); + iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 0, 1, -row_cfg->shift[1], bd); break; case DCT_FLIPADST: - cfg = &inv_txfm_2d_cfg_dct_adst_4; + row_cfg = &inv_txfm_1d_row_cfg_adst_4; + col_cfg = &inv_txfm_1d_col_cfg_dct_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); - idct4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 1, 0, -cfg->shift[1], bd); + iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + idct4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 1, 0, -row_cfg->shift[1], bd); break; case FLIPADST_FLIPADST: - cfg = &inv_txfm_2d_cfg_adst_adst_4; + row_cfg = &inv_txfm_1d_row_cfg_adst_4; + col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); - iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 1, 1, -cfg->shift[1], bd); + iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 1, 1, -row_cfg->shift[1], bd); break; case ADST_FLIPADST: - cfg = &inv_txfm_2d_cfg_adst_adst_4; + row_cfg = 
&inv_txfm_1d_row_cfg_adst_4; + col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); - iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 1, 0, -cfg->shift[1], bd); + iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 1, 0, -row_cfg->shift[1], bd); break; case FLIPADST_ADST: - cfg = &inv_txfm_2d_cfg_adst_adst_4; + row_cfg = &inv_txfm_1d_row_cfg_adst_4; + col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); - iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 0, 1, -cfg->shift[1], bd); + iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 0, 1, -row_cfg->shift[1], bd); break; #endif // CONFIG_EXT_TX default: assert(0); @@ -325,7 +335,7 @@ static void load_buffer_8x8(const int32_t *coeff, __m128i *in) { } static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); const __m128i cospi24 = _mm_set1_epi32(cospi[24]); @@ -439,7 +449,7 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { } static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); @@ -698,90 +708,100 @@ static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride, void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output, int stride, int tx_type, int bd) { __m128i in[16], out[16]; - const TXFM_2D_CFG *cfg = NULL; + const TXFM_1D_CFG *row_cfg = NULL; + const TXFM_1D_CFG *col_cfg = NULL; switch (tx_type) { case DCT_DCT: - cfg = &inv_txfm_2d_cfg_dct_dct_8; + row_cfg = &inv_txfm_1d_row_cfg_dct_8; + col_cfg = &inv_txfm_1d_col_cfg_dct_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd); + idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case DCT_ADST: - cfg = &inv_txfm_2d_cfg_dct_adst_8; + row_cfg = &inv_txfm_1d_row_cfg_adst_8; + col_cfg = &inv_txfm_1d_col_cfg_dct_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd); + idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case ADST_DCT: - cfg = &inv_txfm_2d_cfg_adst_dct_8; + row_cfg = &inv_txfm_1d_row_cfg_dct_8; + col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - 
write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case ADST_ADST: - cfg = &inv_txfm_2d_cfg_adst_adst_8; + row_cfg = &inv_txfm_1d_row_cfg_adst_8; + col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; #if CONFIG_EXT_TX case FLIPADST_DCT: - cfg = &inv_txfm_2d_cfg_adst_dct_8; + row_cfg = &inv_txfm_1d_row_cfg_dct_8; + col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 0, 1, -cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 0, 1, -row_cfg->shift[1], bd); break; case DCT_FLIPADST: - cfg = &inv_txfm_2d_cfg_dct_adst_8; + row_cfg = &inv_txfm_1d_row_cfg_adst_8; + col_cfg = &inv_txfm_1d_col_cfg_dct_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 1, 0, -cfg->shift[1], bd); + idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 1, 0, -row_cfg->shift[1], bd); break; case ADST_FLIPADST: - cfg = &inv_txfm_2d_cfg_adst_adst_8; + row_cfg = &inv_txfm_1d_row_cfg_adst_8; + col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 1, 0, -cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 1, 0, -row_cfg->shift[1], bd); break; case FLIPADST_FLIPADST: - cfg = &inv_txfm_2d_cfg_adst_adst_8; + row_cfg = &inv_txfm_1d_row_cfg_adst_8; + col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 1, 1, -cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 1, 1, -row_cfg->shift[1], bd); break; case FLIPADST_ADST: - cfg = &inv_txfm_2d_cfg_adst_adst_8; + row_cfg = &inv_txfm_1d_row_cfg_adst_8; + col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 0, 1, -cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 0, 1, -row_cfg->shift[1], bd); break; #endif // CONFIG_EXT_TX 
default: assert(0); @@ -849,7 +869,7 @@ static void write_buffer_16x16(__m128i *in, uint16_t *output, int stride, } static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); const __m128i cospi28 = _mm_set1_epi32(cospi[28]); @@ -907,24 +927,24 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { v[6] = u[6]; v[7] = u[7]; - v[8] = half_btf_sse4_1(cospi60, u[8], cospim4, u[15], rnding, bit); - v[9] = half_btf_sse4_1(cospi28, u[9], cospim36, u[14], rnding, bit); - v[10] = half_btf_sse4_1(cospi44, u[10], cospim20, u[13], rnding, bit); - v[11] = half_btf_sse4_1(cospi12, u[11], cospim52, u[12], rnding, bit); - v[12] = half_btf_sse4_1(cospi52, u[11], cospi12, u[12], rnding, bit); - v[13] = half_btf_sse4_1(cospi20, u[10], cospi44, u[13], rnding, bit); - v[14] = half_btf_sse4_1(cospi36, u[9], cospi28, u[14], rnding, bit); - v[15] = half_btf_sse4_1(cospi4, u[8], cospi60, u[15], rnding, bit); + v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit); // stage 3 u[0] = v[0]; u[1] = v[1]; u[2] = v[2]; u[3] = v[3]; - u[4] = half_btf_sse4_1(cospi56, v[4], cospim8, v[7], rnding, bit); - u[5] = half_btf_sse4_1(cospi24, v[5], cospim40, v[6], rnding, bit); - u[6] = half_btf_sse4_1(cospi40, v[5], cospi24, v[6], rnding, bit); - u[7] = half_btf_sse4_1(cospi8, v[4], cospi56, v[7], rnding, bit); + u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit); + u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit); + u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit); + u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit); u[8] = _mm_add_epi32(v[8], v[9]); u[9] = _mm_sub_epi32(v[8], v[9]); u[10] = _mm_sub_epi32(v[11], v[10]); @@ -945,19 +965,19 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { v[1] = _mm_add_epi32(v[1], rnding); v[1] = _mm_srai_epi32(v[1], bit); - v[2] = half_btf_sse4_1(cospi48, u[2], cospim16, u[3], rnding, bit); - v[3] = half_btf_sse4_1(cospi16, u[2], cospi48, u[3], rnding, bit); + v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit); + v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit); v[4] = _mm_add_epi32(u[4], u[5]); v[5] = _mm_sub_epi32(u[4], u[5]); v[6] = _mm_sub_epi32(u[7], u[6]); v[7] = _mm_add_epi32(u[6], u[7]); v[8] = u[8]; - v[9] = half_btf_sse4_1(cospim16, u[9], cospi48, u[14], rnding, bit); - v[10] = half_btf_sse4_1(cospim48, u[10], cospim16, u[13], rnding, bit); + v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); v[11] = u[11]; v[12] = u[12]; - v[13] = half_btf_sse4_1(cospim16, u[10], cospi48, u[13], rnding, bit); - v[14] = half_btf_sse4_1(cospi48, u[9], cospi16, u[14], rnding, bit); + v[13] = 
half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); v[15] = u[15]; // stage 5 @@ -1043,7 +1063,7 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { } static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); @@ -1183,18 +1203,18 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; - v[4] = half_btf_sse4_1(cospi16, u[4], cospi48, u[5], rnding, bit); - v[5] = half_btf_sse4_1(cospi48, u[4], cospim16, u[5], rnding, bit); - v[6] = half_btf_sse4_1(cospim48, u[6], cospi16, u[7], rnding, bit); - v[7] = half_btf_sse4_1(cospi16, u[6], cospi48, u[7], rnding, bit); + v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit); + v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit); + v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit); + v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit); v[8] = u[8]; v[9] = u[9]; v[10] = u[10]; v[11] = u[11]; - v[12] = half_btf_sse4_1(cospi16, u[12], cospi48, u[13], rnding, bit); - v[13] = half_btf_sse4_1(cospi48, u[12], cospim16, u[13], rnding, bit); - v[14] = half_btf_sse4_1(cospim48, u[14], cospi16, u[15], rnding, bit); - v[15] = half_btf_sse4_1(cospi16, u[14], cospi48, u[15], rnding, bit); + v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit); // stage 5 u[0] = _mm_add_epi32(v[0], v[4]); @@ -1223,14 +1243,14 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { v[5] = u[5]; v[6] = u[6]; v[7] = u[7]; - v[8] = half_btf_sse4_1(cospi8, u[8], cospi56, u[9], rnding, bit); - v[9] = half_btf_sse4_1(cospi56, u[8], cospim8, u[9], rnding, bit); - v[10] = half_btf_sse4_1(cospi40, u[10], cospi24, u[11], rnding, bit); - v[11] = half_btf_sse4_1(cospi24, u[10], cospim40, u[11], rnding, bit); - v[12] = half_btf_sse4_1(cospim56, u[12], cospi8, u[13], rnding, bit); - v[13] = half_btf_sse4_1(cospi8, u[12], cospi56, u[13], rnding, bit); - v[14] = half_btf_sse4_1(cospim24, u[14], cospi40, u[15], rnding, bit); - v[15] = half_btf_sse4_1(cospi40, u[14], cospi24, u[15], rnding, bit); + v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit); + v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit); // stage 7 u[0] = _mm_add_epi32(v[0], v[8]); @@ -1251,22 +1271,22 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { u[15] = _mm_sub_epi32(v[7], v[15]); // stage 8 - v[0] = half_btf_sse4_1(cospi2, u[0], cospi62, u[1], rnding, bit); - 
v[1] = half_btf_sse4_1(cospi62, u[0], cospim2, u[1], rnding, bit); - v[2] = half_btf_sse4_1(cospi10, u[2], cospi54, u[3], rnding, bit); - v[3] = half_btf_sse4_1(cospi54, u[2], cospim10, u[3], rnding, bit); - v[4] = half_btf_sse4_1(cospi18, u[4], cospi46, u[5], rnding, bit); - v[5] = half_btf_sse4_1(cospi46, u[4], cospim18, u[5], rnding, bit); - v[6] = half_btf_sse4_1(cospi26, u[6], cospi38, u[7], rnding, bit); - v[7] = half_btf_sse4_1(cospi38, u[6], cospim26, u[7], rnding, bit); - v[8] = half_btf_sse4_1(cospi34, u[8], cospi30, u[9], rnding, bit); - v[9] = half_btf_sse4_1(cospi30, u[8], cospim34, u[9], rnding, bit); - v[10] = half_btf_sse4_1(cospi42, u[10], cospi22, u[11], rnding, bit); - v[11] = half_btf_sse4_1(cospi22, u[10], cospim42, u[11], rnding, bit); - v[12] = half_btf_sse4_1(cospi50, u[12], cospi14, u[13], rnding, bit); - v[13] = half_btf_sse4_1(cospi14, u[12], cospim50, u[13], rnding, bit); - v[14] = half_btf_sse4_1(cospi58, u[14], cospi6, u[15], rnding, bit); - v[15] = half_btf_sse4_1(cospi6, u[14], cospim58, u[15], rnding, bit); + v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit); + v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit); + v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit); + v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit); + v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit); + v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit); + v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit); + v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit); + v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit); // stage 9 out[0 * 4 + col] = v[1]; @@ -1298,99 +1318,109 @@ static void round_shift_16x16(__m128i *in, int shift) { void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output, int stride, int tx_type, int bd) { __m128i in[64], out[64]; - const TXFM_2D_CFG *cfg = NULL; + const TXFM_1D_CFG *row_cfg = NULL; + const TXFM_1D_CFG *col_cfg = NULL; switch (tx_type) { case DCT_DCT: - cfg = &inv_txfm_2d_cfg_dct_dct_16; + row_cfg = &inv_txfm_1d_row_cfg_dct_16; + col_cfg = &inv_txfm_1d_col_cfg_dct_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd); + idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case DCT_ADST: - cfg = &inv_txfm_2d_cfg_dct_adst_16; + row_cfg = &inv_txfm_1d_row_cfg_adst_16; + col_cfg = &inv_txfm_1d_col_cfg_dct_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - 
round_shift_16x16(in, -cfg->shift[0]); + iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd); + idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case ADST_DCT: - cfg = &inv_txfm_2d_cfg_adst_dct_16; + row_cfg = &inv_txfm_1d_row_cfg_dct_16; + col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case ADST_ADST: - cfg = &inv_txfm_2d_cfg_adst_adst_16; + row_cfg = &inv_txfm_1d_row_cfg_adst_16; + col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; #if CONFIG_EXT_TX case FLIPADST_DCT: - cfg = &inv_txfm_2d_cfg_adst_dct_16; + row_cfg = &inv_txfm_1d_row_cfg_dct_16; + col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 0, 1, -cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 0, 1, -row_cfg->shift[1], bd); break; case DCT_FLIPADST: - cfg = &inv_txfm_2d_cfg_dct_adst_16; + row_cfg = &inv_txfm_1d_row_cfg_adst_16; + col_cfg = &inv_txfm_1d_col_cfg_dct_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 1, 0, -cfg->shift[1], bd); + idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 1, 0, -row_cfg->shift[1], bd); break; case ADST_FLIPADST: - cfg = &inv_txfm_2d_cfg_adst_adst_16; + row_cfg = &inv_txfm_1d_row_cfg_adst_16; + col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, 
stride, 1, 0, -cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 1, 0, -row_cfg->shift[1], bd); break; case FLIPADST_FLIPADST: - cfg = &inv_txfm_2d_cfg_adst_adst_16; + row_cfg = &inv_txfm_1d_row_cfg_adst_16; + col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 1, 1, -cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 1, 1, -row_cfg->shift[1], bd); break; case FLIPADST_ADST: - cfg = &inv_txfm_2d_cfg_adst_adst_16; + row_cfg = &inv_txfm_1d_row_cfg_adst_16; + col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 0, 1, -cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 0, 1, -row_cfg->shift[1], bd); break; #endif default: assert(0); diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h index bc96defe3..fb246674a 100644 --- a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h +++ b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h @@ -77,14 +77,15 @@ static INLINE void transpose_16x16(const __m128i *in, __m128i *out) { // Note: // rounding = 1 << (bit - 1) -static INLINE __m128i half_btf_sse4_1(__m128i w0, __m128i n0, __m128i w1, - __m128i n1, __m128i rounding, int bit) { +static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0, + const __m128i *w1, const __m128i *n1, + const __m128i *rounding, int bit) { __m128i x, y; - x = _mm_mullo_epi32(w0, n0); - y = _mm_mullo_epi32(w1, n1); + x = _mm_mullo_epi32(*w0, *n0); + y = _mm_mullo_epi32(*w1, *n1); x = _mm_add_epi32(x, y); - x = _mm_add_epi32(x, rounding); + x = _mm_add_epi32(x, *rounding); x = _mm_srai_epi32(x, bit); return x; } diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c index c25db88b7..37e2f61e7 100644 --- a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c +++ b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c @@ -14,16 +14,13 @@ #include "./av1_rtcd.h" #include "av1/common/warped_motion.h" -static const __m128i *const filter = (const __m128i *const)warped_filter; - -/* SSE2 version of the rotzoom/affine warp filter */ -void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width, - int height, int stride, uint16_t *pred, - int p_col, int p_row, int p_width, - int p_height, int p_stride, int subsampling_x, - int subsampling_y, int bd, int ref_frm, - int16_t alpha, int16_t beta, int16_t gamma, - int16_t delta) { +void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref, + int width, int height, int stride, + uint16_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, int bd, + int 
comp_avg, int16_t alpha, int16_t beta, + int16_t gamma, int16_t delta) { #if HORSHEAR_REDUCE_PREC_BITS >= 5 __m128i tmp[15]; #else @@ -47,23 +44,21 @@ void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width, for (j = 0; j < p_width; j += 8) { // (x, y) coordinates of the center of this block in the destination // image - int32_t dst_x = p_col + j + 4; - int32_t dst_y = p_row + i + 4; + const int32_t dst_x = p_col + j + 4; + const int32_t dst_y = p_row + i + 4; int32_t x4, y4, ix4, sx4, iy4, sy4; if (subsampling_x) - x4 = ROUND_POWER_OF_TWO_SIGNED( - mat[2] * 2 * dst_x + mat[3] * 2 * dst_y + mat[0] + - (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2, - 1); + x4 = (mat[2] * 4 * dst_x + mat[3] * 4 * dst_y + mat[0] * 2 + + (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) / + 4; else x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0]; if (subsampling_y) - y4 = ROUND_POWER_OF_TWO_SIGNED( - mat[4] * 2 * dst_x + mat[5] * 2 * dst_y + mat[1] + - (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2, - 1); + y4 = (mat[4] * 4 * dst_x + mat[5] * 4 * dst_y + mat[1] * 2 + + (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) / + 4; else y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1]; @@ -72,71 +67,103 @@ void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width, iy4 = y4 >> WARPEDMODEL_PREC_BITS; sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + // Add in all the constant terms, including rounding and offset + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + // Horizontal filter - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - - // If the block is aligned such that, after clamping, every sample - // would be taken from the leftmost/rightmost column, then we can - // skip the expensive horizontal filter. - if (ix4 <= -7) { + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. 
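For reference (not part of the patch): the hunk above folds every constant term of the filter offsets into sx4 and sy4 once per 8x8 block, so each row later needs only sx4 + beta * (k + 4). A minimal scalar sketch of that folding, reusing the macro names from av1/common/warped_motion.h that the surrounding file already includes; the helper name is illustrative only.

#include <stdint.h>
#include "av1/common/warped_motion.h"  // WARPEDDIFF_PREC_BITS etc., as in this file

// Fold the -4 tap-origin shift, the half-unit rounding, and the filter-table
// offset into the per-block fractional position, then truncate it to
// WARP_PARAM_REDUCE_BITS granularity, exactly as the hunk above does.
static int fold_warp_offsets(int sx4, int16_t alpha, int16_t beta) {
  sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
         (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
  sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
  return sx4;
}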
+ if (ix4 <= -7) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; tmp[k + 7] = _mm_set1_epi16( + (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - + 1)) + ref[iy * stride] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); - } else if (ix4 >= width + 6) { + (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + } + } else if (ix4 >= width + 6) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; tmp[k + 7] = _mm_set1_epi16( + (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - + 1)) + ref[iy * stride + (width - 1)] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); - } else { - int sx = sx4 + alpha * (-4) + beta * k + - // Include rounding and offset here - (1 << (WARPEDDIFF_PREC_BITS - 1)) + - (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + } + } else { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); // Load source pixels - __m128i src = + const __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); - __m128i src2 = + const __m128i src2 = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); // Filter even-index pixels - __m128i tmp_0 = filter[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_2 = filter[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_4 = filter[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_6 = filter[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]; + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS))); // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2 - __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6 - __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2 - __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6 - __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6 - __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); + const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6 - __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); + const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6 - __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); + const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6 - __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); + const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); - __m128i 
round_const = - _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1); + const __m128i round_const = + _mm_set1_epi32((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) + + ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1)); // Calculate filtered results - __m128i res_0 = _mm_madd_epi16(src, coeff_0); - __m128i res_2 = + const __m128i res_0 = _mm_madd_epi16(src, coeff_0); + const __m128i res_2 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 4), coeff_2); - __m128i res_4 = + const __m128i res_4 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 8), coeff_4); - __m128i res_6 = + const __m128i res_6 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 12), coeff_6); __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), @@ -145,28 +172,36 @@ void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width, HORSHEAR_REDUCE_PREC_BITS); // Filter odd-index pixels - __m128i tmp_1 = filter[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_3 = filter[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_5 = filter[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_7 = filter[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]; - - __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); - __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); - __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); - __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); - - __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); - __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); - __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); - __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); - - __m128i res_1 = + const __m128i tmp_1 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); + const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); + const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); + const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); + + const __m128i res_1 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 2), coeff_1); - __m128i res_3 = + const __m128i res_3 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 6), coeff_3); - __m128i res_5 = + const __m128i res_5 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 10), coeff_5); - __m128i res_7 = + const __m128i res_7 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 14), coeff_7); __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), @@ -183,101 +218,118 @@ void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width, // Vertical filter for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { - int sy = sy4 + gamma * (-4) + delta * k + - (1 << (WARPEDDIFF_PREC_BITS - 1)) + - (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + int sy = sy4 + delta * (k + 4); // Load from tmp and rearrange pairs of consecutive rows into the // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 - __m128i *src = tmp + (k + 4); - __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); - __m128i src_2 = 
_mm_unpacklo_epi16(src[2], src[3]); - __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); - __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); + const __m128i *src = tmp + (k + 4); + const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); + const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); + const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); + const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); // Filter even-index pixels - __m128i tmp_0 = filter[(sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_2 = filter[(sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_4 = filter[(sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_6 = filter[(sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS]; - - __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); - __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); - __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); - __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); - - __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); - __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); - __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); - __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); - - __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); - __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); - __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); - __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); + const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); + const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); + const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); // Filter odd-index pixels - __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); - __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); - __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); - __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); - - __m128i tmp_1 = filter[(sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_3 = filter[(sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_5 = filter[(sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_7 = filter[(sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS]; - - __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); - __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); - __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); - __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); - - __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); - __m128i coeff_3 
= _mm_unpackhi_epi64(tmp_9, tmp_11); - __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); - __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); - - __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); - __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); - __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); - __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); + const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); + const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); + const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); + const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); + + const __m128i tmp_1 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); + const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); + const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); + const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); // Rearrange pixels back into the order 0 ... 7 - __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); // Round and pack into 8 bits - __m128i round_const = - _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1); + const __m128i round_const = + _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) + + ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1)); - __m128i res_lo_round = _mm_srai_epi32( + const __m128i res_lo_round = _mm_srai_epi32( _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS); - __m128i res_hi_round = _mm_srai_epi32( + const __m128i res_hi_round = _mm_srai_epi32( _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS); __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); // Clamp res_16bit to the range [0, 2^bd - 1] - __m128i max_val = _mm_set1_epi16((1 << bd) - 1); - __m128i zero = _mm_setzero_si128(); + const __m128i max_val = _mm_set1_epi16((1 << bd) - 1); + const __m128i zero = _mm_setzero_si128(); res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero); // Store, blending with 'pred' if needed - __m128i *p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; // Note: If we're outputting a 4x4 block, we need to be very careful // to only output 4 pixels at this point, to avoid encode/decode // mismatches when encoding with multiple threads. 
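For reference (not part of the patch): in the high-bitdepth path the careful 4-pixel output described above amounts to an 8-byte store (4 x uint16_t), optionally averaged with the existing prediction when comp_avg is set. A hedged sketch of just that store step, using the same intrinsics as the code below; the helper name is illustrative.

#include <emmintrin.h>
#include <stdint.h>

// Store exactly 4 high-bitdepth pixels (8 bytes), blending with the current
// prediction first when comp_avg is set; neighbouring columns stay untouched.
static void store4_highbd(uint16_t *p, __m128i res_16bit, int comp_avg) {
  if (comp_avg)
    res_16bit = _mm_avg_epu16(res_16bit, _mm_loadl_epi64((const __m128i *)p));
  _mm_storel_epi64((__m128i *)p, res_16bit);
}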
if (p_width == 4) { - if (ref_frm) res_16bit = _mm_avg_epu16(res_16bit, _mm_loadl_epi64(p)); + if (comp_avg) + res_16bit = _mm_avg_epu16(res_16bit, _mm_loadl_epi64(p)); _mm_storel_epi64(p, res_16bit); } else { - if (ref_frm) res_16bit = _mm_avg_epu16(res_16bit, _mm_loadu_si128(p)); + if (comp_avg) + res_16bit = _mm_avg_epu16(res_16bit, _mm_loadu_si128(p)); _mm_storeu_si128(p, res_16bit); } } diff --git a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c index efc8d1e24..c69614e42 100644 --- a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c +++ b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c @@ -14,67 +14,9 @@ #include "./aom_config.h" #include "./av1_rtcd.h" -#include "aom_dsp/x86/txfm_common_avx2.h" - -static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) { -#if CONFIG_HIGHBITDEPTH - *in = _mm256_setr_epi16( - (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2], - (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5], - (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8], - (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11], - (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14], - (int16_t)coeff[15]); -#else - *in = _mm256_loadu_si256((const __m256i *)coeff); -#endif -} - -static void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) { - int i = 0; - while (i < 16) { - load_coeff(coeff + (i << 4), &in[i]); - i += 1; - } -} - -static void recon_and_store(const __m256i *res, uint8_t *output) { - const __m128i zero = _mm_setzero_si128(); - __m128i x = _mm_loadu_si128((__m128i const *)output); - __m128i p0 = _mm_unpacklo_epi8(x, zero); - __m128i p1 = _mm_unpackhi_epi8(x, zero); - - p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res)); - p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1)); - x = _mm_packus_epi16(p0, p1); - _mm_storeu_si128((__m128i *)output, x); -} - -#define IDCT_ROUNDING_POS (6) - -static void write_buffer_16x16(__m256i *in, const int stride, uint8_t *output) { - const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1)); - int i = 0; - - while (i < 16) { - in[i] = _mm256_add_epi16(in[i], rounding); - in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS); - recon_and_store(&in[i], output + i * stride); - i += 1; - } -} - -static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1, - const __m256i *c0, const __m256i *c1, - __m256i *b0, __m256i *b1) { - __m256i x0, x1; - x0 = _mm256_unpacklo_epi16(*a0, *a1); - x1 = _mm256_unpackhi_epi16(*a0, *a1); - *b0 = butter_fly(x0, x1, *c0); - *b1 = butter_fly(x0, x1, *c1); -} +#include "aom_dsp/x86/inv_txfm_common_avx2.h" -static void idct16_avx2(__m256i *in) { +void av1_idct16_avx2(__m256i *in) { const __m256i cospi_p30_m02 = pair256_set_epi16(cospi_30_64, -cospi_2_64); const __m256i cospi_p02_p30 = pair256_set_epi16(cospi_2_64, cospi_30_64); const __m256i cospi_p14_m18 = pair256_set_epi16(cospi_14_64, -cospi_18_64); @@ -216,8 +158,8 @@ static void idct16_avx2(__m256i *in) { } static void idct16(__m256i *in) { - mm256_transpose_16x16(in); - idct16_avx2(in); + mm256_transpose_16x16(in, in); + av1_idct16_avx2(in); } static INLINE void butterfly_32b(const __m256i *a0, const __m256i *a1, @@ -398,7 +340,7 @@ static void iadst16_avx2(__m256i *in) { } static void iadst16(__m256i *in) { - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); iadst16_avx2(in); } @@ -416,8 +358,8 @@ static void flip_col(uint8_t **dest, int *stride, int rows) { } static void iidtx16(__m256i *in) { - 
mm256_transpose_16x16(in); - txfm_scaling16_avx2(Sqrt2, in); + mm256_transpose_16x16(in, in); + txfm_scaling16_avx2((int16_t)Sqrt2, in); } #endif @@ -503,5 +445,5 @@ void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, #endif // CONFIG_EXT_TX default: assert(0); break; } - write_buffer_16x16(in, stride, dest); + store_buffer_16xN(in, stride, dest, 16); } diff --git a/third_party/aom/av1/common/x86/idct_intrin_sse2.c b/third_party/aom/av1/common/x86/idct_intrin_sse2.c index 522e8988c..d6a598746 100644 --- a/third_party/aom/av1/common/x86/idct_intrin_sse2.c +++ b/third_party/aom/av1/common/x86/idct_intrin_sse2.c @@ -17,14 +17,14 @@ #include "av1/common/enums.h" #if CONFIG_EXT_TX -static INLINE void fliplr_4x4(__m128i in[2]) { +static INLINE void fliplr_4x4(__m128i *in /*in[2]*/) { in[0] = _mm_shufflelo_epi16(in[0], 0x1b); in[0] = _mm_shufflehi_epi16(in[0], 0x1b); in[1] = _mm_shufflelo_epi16(in[1], 0x1b); in[1] = _mm_shufflehi_epi16(in[1], 0x1b); } -static INLINE void fliplr_8x8(__m128i in[8]) { +static INLINE void fliplr_8x8(__m128i *in /*in[8]*/) { in[0] = mm_reverse_epi16(in[0]); in[1] = mm_reverse_epi16(in[1]); in[2] = mm_reverse_epi16(in[2]); @@ -36,7 +36,7 @@ static INLINE void fliplr_8x8(__m128i in[8]) { in[7] = mm_reverse_epi16(in[7]); } -static INLINE void fliplr_16x8(__m128i in[16]) { +static INLINE void fliplr_16x8(__m128i *in /*in[16]*/) { fliplr_8x8(&in[0]); fliplr_8x8(&in[8]); } @@ -356,7 +356,7 @@ static void iidtx8_sse2(__m128i *in) { } static INLINE void iidtx4_sse2(__m128i *in) { - const __m128i v_scale_w = _mm_set1_epi16(Sqrt2); + const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2); const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w); const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w); diff --git a/third_party/aom/av1/common/x86/warp_plane_sse2.c b/third_party/aom/av1/common/x86/warp_plane_sse2.c index 925e4650d..cdc4e8d0f 100644 --- a/third_party/aom/av1/common/x86/warp_plane_sse2.c +++ b/third_party/aom/av1/common/x86/warp_plane_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License @@ -14,17 +14,15 @@ #include "./av1_rtcd.h" #include "av1/common/warped_motion.h" -static const __m128i *const filter = (const __m128i *const)warped_filter; - -/* SSE2 version of the rotzoom/affine warp filter */ -void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height, - int stride, uint8_t *pred, int p_col, int p_row, - int p_width, int p_height, int p_stride, - int subsampling_x, int subsampling_y, int ref_frm, +void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, int comp_avg, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { __m128i tmp[15]; int i, j, k; + const int bd = 8; /* Note: For this code to work, the left/right frame borders need to be extended by at least 13 pixels each. 
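For reference (not part of the patch): the border requirement stated above can be verified explicitly. This is the commented-out debug loop that appears later in this diff (in warp_plane_ssse3.c), lifted into a standalone helper with an illustrative name.

#include <assert.h>
#include <stdint.h>

// Check that the 13 pixels beyond each row's left/right edge replicate the
// corresponding edge pixel, as the warp filters assume.
static void check_warp_borders(const uint8_t *ref, int width, int height,
                               int stride) {
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < 13; ++j) {
      assert(ref[i * stride - 13 + j] == ref[i * stride]);
      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
    }
  }
}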
By the time we get here, other @@ -42,23 +40,21 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height, for (j = 0; j < p_width; j += 8) { // (x, y) coordinates of the center of this block in the destination // image - int32_t dst_x = p_col + j + 4; - int32_t dst_y = p_row + i + 4; + const int32_t dst_x = p_col + j + 4; + const int32_t dst_y = p_row + i + 4; int32_t x4, y4, ix4, sx4, iy4, sy4; if (subsampling_x) - x4 = ROUND_POWER_OF_TWO_SIGNED( - mat[2] * 2 * dst_x + mat[3] * 2 * dst_y + mat[0] + - (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2, - 1); + x4 = (mat[2] * 4 * dst_x + mat[3] * 4 * dst_y + mat[0] * 2 + + (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) / + 4; else x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0]; if (subsampling_y) - y4 = ROUND_POWER_OF_TWO_SIGNED( - mat[4] * 2 * dst_x + mat[5] * 2 * dst_y + mat[1] + - (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2, - 1); + y4 = (mat[4] * 4 * dst_x + mat[5] * 4 * dst_y + mat[1] * 2 + + (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) / + 4; else y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1]; @@ -67,76 +63,104 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height, iy4 = y4 >> WARPEDMODEL_PREC_BITS; sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + // Add in all the constant terms, including rounding and offset + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + // Horizontal filter - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - - // If the block is aligned such that, after clamping, every sample - // would be taken from the leftmost/rightmost column, then we can - // skip the expensive horizontal filter. - if (ix4 <= -7) { + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. 
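For reference (not part of the patch): when the shortcut described above applies, every clamped sample in a row is the same pixel, so the 8-tap horizontal filter collapses to that pixel times the tap sum plus the new bias term, pre-shifted by HORSHEAR_REDUCE_PREC_BITS. Assuming the warp filter taps sum to 1 << WARPEDPIXEL_FILTER_BITS (the usual normalization), the broadcast constant used throughout this diff is sketched below; the helper name is illustrative.

#include <emmintrin.h>
#include "av1/common/warped_motion.h"  // WARPEDPIXEL_FILTER_BITS, HORSHEAR_REDUCE_PREC_BITS

// Constant that fills one row of 'tmp' when every sample comes from a single
// clamped column; matches the _mm_set1_epi16 argument in this diff's hunks.
static __m128i constant_col_fill(int pixel, int bd) {
  return _mm_set1_epi16(
      (int16_t)((1 << (bd + WARPEDPIXEL_FILTER_BITS -
                       HORSHEAR_REDUCE_PREC_BITS - 1)) +
                pixel * (1 << (WARPEDPIXEL_FILTER_BITS -
                               HORSHEAR_REDUCE_PREC_BITS))));
}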
+ if (ix4 <= -7) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; tmp[k + 7] = _mm_set1_epi16( + (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - + 1)) + ref[iy * stride] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); - } else if (ix4 >= width + 6) { + (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + } + } else if (ix4 >= width + 6) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; tmp[k + 7] = _mm_set1_epi16( + (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - + 1)) + ref[iy * stride + (width - 1)] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); - } else { - int sx = sx4 + alpha * (-4) + beta * k + - // Include rounding and offset here - (1 << (WARPEDDIFF_PREC_BITS - 1)) + - (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + } + } else { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); // Load source pixels - __m128i zero = _mm_setzero_si128(); - __m128i src = + const __m128i zero = _mm_setzero_si128(); + const __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); // Filter even-index pixels - __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_2 = _mm_loadu_si128( - (__m128i *)(filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_4 = _mm_loadu_si128( - (__m128i *)(filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_6 = _mm_loadu_si128( - (__m128i *)(filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS))); // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2 - __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6 - __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2 - __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6 - __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6 - __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); + const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6 - __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); + const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6 - __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); + const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6 - __m128i 
coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); + const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); - __m128i round_const = - _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1); + const __m128i round_const = + _mm_set1_epi32((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) + + ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1)); // Calculate filtered results - __m128i src_0 = _mm_unpacklo_epi8(src, zero); - __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); - __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero); - __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); - __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero); - __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); - __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(src, 6), zero); - __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); + const __m128i src_0 = _mm_unpacklo_epi8(src, zero); + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); + const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); + const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); + const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(src, 6), zero); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); @@ -144,33 +168,37 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height, HORSHEAR_REDUCE_PREC_BITS); // Filter odd-index pixels - __m128i tmp_1 = _mm_loadu_si128( - (__m128i *)(filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_3 = _mm_loadu_si128( - (__m128i *)(filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_5 = _mm_loadu_si128( - (__m128i *)(filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_7 = _mm_loadu_si128( - (__m128i *)(filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS))); - - __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); - __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); - __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); - __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); - - __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); - __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); - __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); - __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); - - __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(src, 1), zero); - __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); - __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(src, 3), zero); - __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); - __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(src, 5), zero); - __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); - __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(src, 7), zero); - __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); + const __m128i tmp_1 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = 
_mm_unpackhi_epi32(tmp_5, tmp_7); + + const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); + const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); + const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); + const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); + + const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(src, 1), zero); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); + const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(src, 3), zero); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); + const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(src, 5), zero); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); + const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(src, 7), zero); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); @@ -186,109 +214,116 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height, // Vertical filter for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { - int sy = sy4 + gamma * (-4) + delta * k + - (1 << (WARPEDDIFF_PREC_BITS - 1)) + - (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + int sy = sy4 + delta * (k + 4); // Load from tmp and rearrange pairs of consecutive rows into the // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 - __m128i *src = tmp + (k + 4); - __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); - __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); - __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); - __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); + const __m128i *src = tmp + (k + 4); + const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); + const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); + const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); + const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); // Filter even-index pixels - __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_2 = _mm_loadu_si128( - (__m128i *)(filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_4 = _mm_loadu_si128( - (__m128i *)(filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_6 = _mm_loadu_si128( - (__m128i *)(filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); - - __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); - __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); - __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); - __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); - - __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); - __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); - __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); - __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); - - __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); - __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); - __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); - __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_8 
= _mm_unpacklo_epi32(tmp_0, tmp_2); + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); + const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); + const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); + const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); // Filter odd-index pixels - __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); - __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); - __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); - __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); - - __m128i tmp_1 = _mm_loadu_si128( - (__m128i *)(filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_3 = _mm_loadu_si128( - (__m128i *)(filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_5 = _mm_loadu_si128( - (__m128i *)(filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_7 = _mm_loadu_si128( - (__m128i *)(filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); - - __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); - __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); - __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); - __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); - - __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); - __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); - __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); - __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); - - __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); - __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); - __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); - __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); + const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); + const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); + const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); + const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); + + const __m128i tmp_1 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); + const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); + const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); + const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); + const __m128i res_5 = _mm_madd_epi16(src_5, 
coeff_5); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); // Rearrange pixels back into the order 0 ... 7 - __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); // Round and pack into 8 bits - __m128i round_const = - _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1); + const __m128i round_const = + _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) + + ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1)); - __m128i res_lo_round = _mm_srai_epi32( + const __m128i res_lo_round = _mm_srai_epi32( _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS); - __m128i res_hi_round = _mm_srai_epi32( + const __m128i res_hi_round = _mm_srai_epi32( _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS); - __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); // Store, blending with 'pred' if needed - __m128i *p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; // Note: If we're outputting a 4x4 block, we need to be very careful // to only output 4 pixels at this point, to avoid encode/decode // mismatches when encoding with multiple threads. if (p_width == 4) { - if (ref_frm) { + if (comp_avg) { const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p); res_8bit = _mm_avg_epu8(res_8bit, orig); } *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit); } else { - if (ref_frm) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p)); + if (comp_avg) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p)); _mm_storel_epi64(p, res_8bit); } } diff --git a/third_party/aom/av1/common/x86/warp_plane_ssse3.c b/third_party/aom/av1/common/x86/warp_plane_ssse3.c new file mode 100644 index 000000000..494410e99 --- /dev/null +++ b/third_party/aom/av1/common/x86/warp_plane_ssse3.c @@ -0,0 +1,508 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <tmmintrin.h> + +#include "./av1_rtcd.h" +#include "av1/common/warped_motion.h" + +/* This is a modified version of 'warped_filter' from warped_motion.c: + * Each coefficient is stored in 8 bits instead of 16 bits + * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7 + + This is done in order to avoid overflow: Since the tap with the largest + coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation + order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular + convolve functions. + + Instead, we use the summation order + ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)). + The rearrangement of coefficients in this table is so that we can get the + coefficients into the correct order more quickly. 
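For reference (not part of the patch): a scalar model of the regrouped summation described above, with the taps f[0..7] written in natural order (the filter_8bit table below stores them reordered as 0, 2, 4, 6, 1, 3, 5, 7). Each parenthesised pair corresponds to one _mm_maddubs_epi16 product, and the grouping keeps the even and odd partial sums within int16 range, as the overflow note later in this file explains; the helper name is illustrative.

#include <stdint.h>

// ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)) summation order described above.
static int warp_hfilter_regrouped(const uint8_t px[8], const int8_t f[8]) {
  const int even =
      (f[0] * px[0] + f[2] * px[2]) + (f[4] * px[4] + f[6] * px[6]);
  const int odd =
      (f[1] * px[1] + f[3] * px[3]) + (f[5] * px[5] + f[7] * px[7]);
  return even + odd;
}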
+*/ +/* clang-format off */ +DECLARE_ALIGNED(8, static const int8_t, + filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = { +#if WARPEDPIXEL_PREC_BITS == 6 + // [-1, 0) + { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0}, + { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0}, + { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0}, + { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0}, + { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0}, + { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0}, + { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0}, + { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0}, + { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0}, + { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0}, + { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0}, + { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0}, + { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0}, + { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0}, + { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0}, + { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0}, + { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0}, + { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0}, + { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0}, + { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0}, + { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0}, + { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0}, + { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0}, + { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0}, + { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0}, + { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0}, + { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0}, + { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0}, + { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0}, + { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0}, + { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0}, + { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0}, + // [0, 1) + { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0}, + { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0}, + { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1}, + {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1}, + {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1}, + {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1}, + {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1}, + {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1}, + {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2}, + {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2}, + {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2}, + {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2}, + {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2}, + {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2}, + {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2}, + {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2}, + {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2}, + {-2, -21, 84, 8, 8, 74, 
-21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2}, + {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2}, + {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2}, + {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2}, + {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2}, + {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2}, + {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2}, + {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2}, + {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1}, + {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2}, + {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1}, + {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1}, + {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1}, + { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0}, + { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0}, + // [1, 2) + { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0}, + { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1}, + { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1}, + { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1}, + { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1}, + { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2}, + { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2}, + { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2}, + { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3}, + { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3}, + { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3}, + { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4}, + { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4}, + { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4}, + { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4}, + { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4}, + { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4}, + { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4}, + { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4}, + { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4}, + { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4}, + { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4}, + { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4}, + { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3}, + { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3}, + { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3}, + { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2}, + { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2}, + { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2}, + { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1}, + { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1}, + { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0}, + // dummy (replicate row index 191) + { 0, 0, 2, -1, 0, 0, 127, 0}, + +#else + // [-1, 0) + { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0}, + { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0}, + { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0}, + { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0}, + { 
3, 111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0}, + { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0}, + { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0}, + { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0}, + { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0}, + { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0}, + { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0}, + { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0}, + { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0}, + { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0}, + { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0}, + { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0}, + // [0, 1) + { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0}, + { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1}, + {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1}, + {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1}, + {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2}, + {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2}, + {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2}, + {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2}, + {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2}, + {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2}, + {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2}, + {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2}, + {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2}, + {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1}, + {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1}, + { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0}, + // [1, 2) + { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, -1, 0, -3, 4, 0}, + { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1}, + { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2}, + { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3}, + { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3}, + { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3}, + { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4}, + { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4}, + { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4}, + { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4}, + { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4}, + { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3}, + { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3}, + { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 0, -7, 120, 2}, + { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1}, + { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1}, + // dummy (replicate row index 95) + { 0, 0, 4, -3, 0, -1, 127, 1}, +#endif // WARPEDPIXEL_PREC_BITS == 6 +}; +/* clang-format on */ + +// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15 +// in an SSE register into two sequences: +// 0, 2, 2, 4, ..., 12, 12, 14, <don't care> +// 1, 3, 3, 5, ..., 13, 13, 15, <don't care> +static const uint8_t even_mask[16] = { 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 0 }; +static const uint8_t odd_mask[16] = { 1, 3, 3, 5, 5, 7, 7, 9, + 9, 11, 11, 
13, 13, 15, 15, 0 }; + +void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, int comp_avg, + int16_t alpha, int16_t beta, int16_t gamma, + int16_t delta) { + __m128i tmp[15]; + int i, j, k; + const int bd = 8; + + /* Note: For this code to work, the left/right frame borders need to be + extended by at least 13 pixels each. By the time we get here, other + code will have set up this border, but we allow an explicit check + for debugging purposes. + */ + /*for (i = 0; i < height; ++i) { + for (j = 0; j < 13; ++j) { + assert(ref[i * stride - 13 + j] == ref[i * stride]); + assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); + } + }*/ + + for (i = 0; i < p_height; i += 8) { + for (j = 0; j < p_width; j += 8) { + // (x, y) coordinates of the center of this block in the destination + // image + const int32_t dst_x = p_col + j + 4; + const int32_t dst_y = p_row + i + 4; + + int32_t x4, y4, ix4, sx4, iy4, sy4; + if (subsampling_x) + x4 = (mat[2] * 4 * dst_x + mat[3] * 4 * dst_y + mat[0] * 2 + + (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) / + 4; + else + x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0]; + + if (subsampling_y) + y4 = (mat[4] * 4 * dst_x + mat[5] * 4 * dst_y + mat[1] * 2 + + (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) / + 4; + else + y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1]; + + ix4 = x4 >> WARPEDMODEL_PREC_BITS; + sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + iy4 = y4 >> WARPEDMODEL_PREC_BITS; + sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + // Add in all the constant terms, including rounding and offset + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. 
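For reference (not part of the patch): the even_mask/odd_mask tables defined just above feed the _mm_shuffle_epi8 calls in the horizontal filter that follows. Since none of the mask entries have their high bit set, no lanes are zeroed and the shuffle reduces to a plain byte gather; a scalar model with an illustrative name is shown below.

#include <stdint.h>

// dst[i] = src[mask[i] & 0x0f], producing the byte sequences
// 0,2,2,4,...,14,14 (even) and 1,3,3,5,...,15,15 (odd) used below.
static void shuffle_bytes_model(const uint8_t src[16], const uint8_t mask[16],
                                uint8_t dst[16]) {
  for (int i = 0; i < 16; ++i) dst[i] = src[mask[i] & 0x0f];
}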
+      if (ix4 <= -7) {
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          tmp[k + 7] = _mm_set1_epi16(
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                     1)) +
+              ref[iy * stride] *
+                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+        }
+      } else if (ix4 >= width + 6) {
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          tmp[k + 7] = _mm_set1_epi16(
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                     1)) +
+              ref[iy * stride + (width - 1)] *
+                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+        }
+      } else {
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          int sx = sx4 + beta * (k + 4);
+
+          // Load source pixels
+          const __m128i src =
+              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+          const __m128i src_even =
+              _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask));
+          const __m128i src_odd =
+              _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask));
+
+          // Filter even-index pixels
+          const __m128i tmp_0 = _mm_loadl_epi64((
+              __m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
+          const __m128i tmp_1 = _mm_loadl_epi64((
+              __m128i *)&filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
+          const __m128i tmp_2 = _mm_loadl_epi64((
+              __m128i *)&filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
+          const __m128i tmp_3 = _mm_loadl_epi64((
+              __m128i *)&filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
+          const __m128i tmp_4 = _mm_loadl_epi64((
+              __m128i *)&filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
+          const __m128i tmp_5 = _mm_loadl_epi64((
+              __m128i *)&filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
+          const __m128i tmp_6 = _mm_loadl_epi64((
+              __m128i *)&filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
+          const __m128i tmp_7 = _mm_loadl_epi64((
+              __m128i *)&filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
+
+          // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
+          const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
+          // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3
+          const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
+          // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6
+          const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
+          // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7
+          const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
+
+          // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
+          const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
+          // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
+          const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
+          // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
+          const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
+          // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
+          const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
+
+          // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
+          const __m128i coeff_02 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+          // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
+          const __m128i coeff_46 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+          // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
+          const __m128i coeff_13 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+          // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
+          const __m128i coeff_57 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+          // The pixel order we need for 'src' is:
+          // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
+          const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
+          const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff_02);
+          // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
+          const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
+                                                    _mm_srli_si128(src_odd, 4));
+          const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff_46);
+          // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
+          const __m128i src_13 =
+              _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
+          const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff_13);
+          // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
+          const __m128i src_57 = _mm_unpacklo_epi64(
+              _mm_srli_si128(src_odd, 4), _mm_srli_si128(src_even, 6));
+          const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57);
+
+          const __m128i round_const =
+              _mm_set1_epi16((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) +
+                             ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1));
+
+          // Note: The values res_02 + res_46 and res_13 + res_57 both
+          // fit into int16s at this point, but their sum may be too wide to fit
+          // into an int16. However, once we also add round_const, the sum of
+          // all of these fits into a uint16.
+          //
+          // The wrapping behaviour of _mm_add_* is used here to make sure we
+          // get the correct result despite converting between different
+          // (implicit) types.
+          const __m128i res_even = _mm_add_epi16(res_02, res_46);
+          const __m128i res_odd = _mm_add_epi16(res_13, res_57);
+          const __m128i res =
+              _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
+          tmp[k + 7] = _mm_srli_epi16(res, HORSHEAR_REDUCE_PREC_BITS);
+        }
+      }
+
+      // Vertical filter
+      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+        int sy = sy4 + delta * (k + 4);
+
+        // Load from tmp and rearrange pairs of consecutive rows into the
+        // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
+        const __m128i *src = tmp + (k + 4);
+        const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
+        const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
+        const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
+        const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
+
+        // Filter even-index pixels
+        const __m128i tmp_0 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+        const __m128i tmp_2 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+        const __m128i tmp_4 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+        const __m128i tmp_6 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+        const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+        const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+        const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+        const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+        const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+        const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+        const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+        const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
+        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
+        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
+        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
+
+        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+                                               _mm_add_epi32(res_4, res_6));
+
+        // Filter odd-index pixels
+        const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
+        const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
+        const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
+        const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
+
+        const __m128i tmp_1 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+        const __m128i tmp_3 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+        const __m128i tmp_5 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+        const __m128i tmp_7 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+        const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+        const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+        const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+        const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+        const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+        const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+        const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+        const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
+        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
+        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
+        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
+
+        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+                                              _mm_add_epi32(res_5, res_7));
+
+        // Rearrange pixels back into the order 0 ... 7
+        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+        // Round and pack into 8 bits
+        const __m128i round_const =
+            _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
+                           ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
+
+        const __m128i res_lo_round = _mm_srai_epi32(
+            _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
+        const __m128i res_hi_round = _mm_srai_epi32(
+            _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
+
+        const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+        __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
+
+        // Store, blending with 'pred' if needed
+        __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+        // Note: If we're outputting a 4x4 block, we need to be very careful
+        // to only output 4 pixels at this point, to avoid encode/decode
+        // mismatches when encoding with multiple threads.
+        if (p_width == 4) {
+          if (comp_avg) {
+            const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p);
+            res_8bit = _mm_avg_epu8(res_8bit, orig);
+          }
+          *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
+        } else {
+          if (comp_avg) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p));
+          _mm_storel_epi64(p, res_8bit);
+        }
+      }
+    }
+  }
+}
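The bias arithmetic is the subtle part of this kernel: the horizontal pass adds 1 << (bd + WARPEDPIXEL_FILTER_BITS - 1) to every filtered sum so the reduced intermediates stay non-negative and fit in unsigned 16-bit lanes, and the vertical pass removes exactly that contribution through the negative term of its round constant. The standalone scalar sketch below is not taken from the patch; its constant values are assumptions for illustration only, chosen so that VERSHEAR_REDUCE_PREC_BITS equals 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS, which is the relationship that makes the bias cancel. The authoritative definitions live in av1/common/warped_motion.h.

/* Scalar sketch of the bias/rounding bookkeeping used by the SIMD kernel
 * above. Constants are illustrative assumptions, not the library's values. */
#include <stdint.h>
#include <stdio.h>

#define BD 8            /* bit depth */
#define FILTER_BITS 7   /* stands in for WARPEDPIXEL_FILTER_BITS */
#define HORSHEAR_BITS 5 /* stands in for HORSHEAR_REDUCE_PREC_BITS */
#define VERSHEAR_BITS (2 * FILTER_BITS - HORSHEAR_BITS) /* VERSHEAR_REDUCE_PREC_BITS */

/* Horizontal stage for one sample: bias + 8-tap sum, then reduce. With taps
 * summing to 1 << FILTER_BITS, the reduced value fits in a uint16 lane. */
static uint16_t horiz_sample(const uint8_t row[8], const int16_t fx[8]) {
  int32_t sum = 1 << (BD + FILTER_BITS - 1); /* bias keeps the sum >= 0 */
  for (int m = 0; m < 8; ++m) sum += fx[m] * row[m];
  return (uint16_t)((sum + ((1 << HORSHEAR_BITS) >> 1)) >> HORSHEAR_BITS);
}

/* Vertical stage: the negative round term cancels the accumulated bias,
 * since (1 << (BD + FILTER_BITS - HORSHEAR_BITS - 1)) * (1 << FILTER_BITS)
 * == 1 << (BD + VERSHEAR_BITS - 1). */
static uint8_t vert_sample(const uint16_t col[8], const int16_t fy[8]) {
  int32_t sum = -(1 << (BD + VERSHEAR_BITS - 1)) + ((1 << VERSHEAR_BITS) >> 1);
  for (int m = 0; m < 8; ++m) sum += fy[m] * col[m];
  sum >>= VERSHEAR_BITS;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}

int main(void) {
  /* All weight on one tap (taps sum to 1 << FILTER_BITS): the two-stage
   * round trip should give back the input pixel value. */
  const int16_t taps[8] = { 0, 0, 0, 1 << FILTER_BITS, 0, 0, 0, 0 };
  uint8_t row[8];
  uint16_t col[8];
  for (int m = 0; m < 8; ++m) row[m] = 100;
  for (int m = 0; m < 8; ++m) col[m] = horiz_sample(row, taps);
  printf("round trip: %d\n", vert_sample(col, taps)); /* prints 100 */
  return 0;
}

The SIMD version forms the same sums with _mm_maddubs_epi16 and _mm_madd_epi16; the horizontal-stage note about 16-bit wraparound applies because the biased sums are deliberately read back as unsigned values (via _mm_srli_epi16) after the adds, so only the final wrapped result has to fit in a uint16.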